awslabs.terraform-mcp-server 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of awslabs.terraform-mcp-server might be problematic.

Files changed (32)
  1. awslabs/__init__.py +2 -0
  2. awslabs/terraform_mcp_server/__init__.py +3 -0
  3. awslabs/terraform_mcp_server/impl/resources/__init__.py +11 -0
  4. awslabs/terraform_mcp_server/impl/resources/terraform_aws_provider_resources_listing.py +52 -0
  5. awslabs/terraform_mcp_server/impl/resources/terraform_awscc_provider_resources_listing.py +55 -0
  6. awslabs/terraform_mcp_server/impl/tools/__init__.py +15 -0
  7. awslabs/terraform_mcp_server/impl/tools/execute_terraform_command.py +206 -0
  8. awslabs/terraform_mcp_server/impl/tools/run_checkov_scan.py +359 -0
  9. awslabs/terraform_mcp_server/impl/tools/search_aws_provider_docs.py +677 -0
  10. awslabs/terraform_mcp_server/impl/tools/search_awscc_provider_docs.py +627 -0
  11. awslabs/terraform_mcp_server/impl/tools/search_specific_aws_ia_modules.py +444 -0
  12. awslabs/terraform_mcp_server/impl/tools/utils.py +558 -0
  13. awslabs/terraform_mcp_server/models/__init__.py +27 -0
  14. awslabs/terraform_mcp_server/models/models.py +260 -0
  15. awslabs/terraform_mcp_server/scripts/generate_aws_provider_resources.py +1224 -0
  16. awslabs/terraform_mcp_server/scripts/generate_awscc_provider_resources.py +1020 -0
  17. awslabs/terraform_mcp_server/scripts/scrape_aws_terraform_best_practices.py +129 -0
  18. awslabs/terraform_mcp_server/server.py +329 -0
  19. awslabs/terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md +3125 -0
  20. awslabs/terraform_mcp_server/static/AWS_PROVIDER_RESOURCES.md +3833 -0
  21. awslabs/terraform_mcp_server/static/AWS_TERRAFORM_BEST_PRACTICES.md +2523 -0
  22. awslabs/terraform_mcp_server/static/MCP_INSTRUCTIONS.md +126 -0
  23. awslabs/terraform_mcp_server/static/TERRAFORM_WORKFLOW_GUIDE.md +198 -0
  24. awslabs/terraform_mcp_server/static/__init__.py +22 -0
  25. awslabs/terraform_mcp_server/tests/__init__.py +1 -0
  26. awslabs/terraform_mcp_server/tests/run_tests.sh +35 -0
  27. awslabs/terraform_mcp_server/tests/test_parameter_annotations.py +207 -0
  28. awslabs/terraform_mcp_server/tests/test_tool_implementations.py +309 -0
  29. awslabs_terraform_mcp_server-0.0.1.dist-info/METADATA +97 -0
  30. awslabs_terraform_mcp_server-0.0.1.dist-info/RECORD +32 -0
  31. awslabs_terraform_mcp_server-0.0.1.dist-info/WHEEL +4 -0
  32. awslabs_terraform_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1020 @@
1
+ """Script to generate AWSCC provider resources markdown for the Terraform Expert MCP server.
2
+
3
+ This script scrapes the Terraform AWSCC provider documentation using Playwright
4
+ and generates a comprehensive markdown file listing all AWS service categories,
5
+ resources, and data sources.
6
+
7
+ The generated markdown is saved to the static directory for use by the MCP server.
8
+
9
+ Usage:
10
+ python generate_awscc_provider_resources.py [--max-categories N] [--output PATH]
11
+
12
+ Options:
13
+ --max-categories N Limit to N categories (default: all)
14
+ --output PATH Output file path (default: terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md)
15
+ --no-fallback Don't use fallback data if scraping fails
16
+ """
17
+
18
+ import argparse
19
+ import asyncio
20
+ import os
21
+ import re
22
+ import sys
23
+ import tempfile
24
+ import time
25
+ from bs4 import BeautifulSoup, Tag
26
+ from bs4.element import PageElement, ResultSet
27
+ from bs4.filter import SoupStrainer
28
+ from datetime import datetime
29
+ from loguru import logger
30
+ from pathlib import Path
31
+ from typing import Any, Optional, TypeVar
32
+
33
+
34
+ # Type helpers for BeautifulSoup
35
+ T = TypeVar('T')
36
+
37
+
38
+ def ensure_tag(element: Optional[PageElement]) -> Optional[Tag]:
39
+ """Ensure an element is a Tag or return None."""
40
+ if isinstance(element, Tag):
41
+ return element
42
+ return None
43
+
44
+
45
+ def safe_find(element: Any, *args: Any, **kwargs: Any) -> Optional[Tag]:
46
+ """Safely find an element in a Tag."""
47
+ if not isinstance(element, Tag):
48
+ return None
49
+ result = element.find(*args, **kwargs)
50
+ return ensure_tag(result)
51
+
52
+
53
+ def safe_find_all(element: Any, *args: Any, **kwargs: Any) -> ResultSet:
54
+ """Safely find all elements in a Tag."""
55
+ if not isinstance(element, Tag):
56
+ return ResultSet(SoupStrainer(), [])
57
+ return element.find_all(*args, **kwargs)
58
+
59
+
60
+ def safe_get_text(element: Any, strip: bool = False) -> str:
61
+ """Safely get text from an element."""
62
+ if hasattr(element, 'get_text'):
63
+ return element.get_text(strip=strip)
64
+ return str(element) if element is not None else ''
65
+
66
+
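For reference, a minimal sketch (not part of the package, assuming the imports above) of how the safe_* helpers behave on a tiny document; the HTML snippet is illustrative:

    soup = BeautifulSoup('<ul><li><a href="/x">awscc_s3_bucket</a></li></ul>', 'html.parser')
    link = safe_find(soup, 'a')          # returns a Tag (or None), never a bare PageElement
    safe_get_text(link, strip=True)      # -> 'awscc_s3_bucket'
    safe_find_all('not a tag', 'a')      # -> empty ResultSet instead of raising AttributeError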
67
+ ## Playwright optional import
68
+ try:
69
+ from playwright.async_api import async_playwright
70
+ except ImportError:
71
+ # Playwright is optional, we'll use fallback data if it's not available
72
+ async_playwright = None
73
+
74
+ # Add the parent directory to sys.path so we can import from terraform_mcp_server
75
+ script_dir = Path(__file__).resolve().parent
76
+ repo_root = script_dir.parent.parent.parent
77
+ sys.path.insert(0, str(repo_root))
78
+
79
+
80
+ # Configure logger for enhanced diagnostics with stacktraces
81
+ logger.configure(
82
+ handlers=[
83
+ {
84
+ 'sink': sys.stderr,
85
+ 'backtrace': True,
86
+ 'diagnose': True,
87
+ 'format': '<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>',
88
+ }
89
+ ]
90
+ )
91
+
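With the handler configured above, a record renders roughly as below (timestamp, function, and line number are illustrative; the <green>/<cyan>/<level> tags become terminal colours):

    2025-01-01 12:00:00.000 | INFO     | __main__:fetch_awscc_provider_page:127 - Starting browser to extract AWSCC provider resources structure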
92
+ # Environment variable to control whether to use Playwright or go straight to fallback data
93
+ USE_PLAYWRIGHT = os.environ.get('USE_PLAYWRIGHT', '1').lower() in ('1', 'true', 'yes')
94
+ # Shorter timeout so we fail fast if the registry page cannot be loaded
95
+ NAVIGATION_TIMEOUT = 20000 # 20 seconds
96
+ # Default output path
97
+ DEFAULT_OUTPUT_PATH = (
98
+ repo_root / 'awslabs' / 'terraform_mcp_server' / 'static' / 'AWSCC_PROVIDER_RESOURCES.md'
99
+ )
100
+ # AWSCC provider URL
101
+ AWSCC_PROVIDER_URL = 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs'
102
+
103
+
104
+ async def fetch_awscc_provider_page():
105
+ """Fetch the AWSCC provider documentation page using Playwright.
106
+
107
+ This function uses a headless browser to render the JavaScript-driven
108
+ Terraform Registry website and extract the AWSCC provider resources.
109
+
110
+ It will fall back to pre-defined data if:
111
+ - The USE_PLAYWRIGHT environment variable is set to 0/false/no
112
+ - There's any error during the scraping process
113
+
114
+ Returns:
115
+ A dictionary containing:
116
+ - 'categories': Dictionary of AWSCC service categories with resources and data sources
117
+ - 'version': AWSCC provider version string (e.g., "1.36.0")
118
+ """
119
+ # Check if we should skip Playwright or if it's not available
120
+ if not USE_PLAYWRIGHT or async_playwright is None:
121
+ logger.info(
122
+ 'Skipping Playwright and using pre-defined resource structure (USE_PLAYWRIGHT=0)'
123
+ )
124
+ return {'categories': get_fallback_resource_data(), 'version': 'unknown'}
125
+
126
+ logger.info('Starting browser to extract AWSCC provider resources structure')
127
+ start_time = time.time()
128
+ categories = {}
129
+
130
+ try:
131
+ async with async_playwright() as p:
132
+ # Launch the browser with specific options for better performance
133
+ browser = await p.chromium.launch(
134
+ headless=True,
135
+ args=['--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox'],
136
+ )
137
+ context = await browser.new_context(
138
+ viewport={'width': 1280, 'height': 800},
139
+ user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
140
+ )
141
+ page = await context.new_page()
142
+
143
+ # Set a shorter timeout for navigation
144
+ page.set_default_timeout(NAVIGATION_TIMEOUT)
145
+
146
+ # Navigate to the AWSCC provider docs with reduced timeout
147
+ logger.info(
148
+ f'Navigating to Terraform AWSCC provider documentation (timeout: {NAVIGATION_TIMEOUT}ms)'
149
+ )
150
+ try:
151
+ await page.goto(
152
+ AWSCC_PROVIDER_URL,
153
+ wait_until='domcontentloaded',
154
+ ) # Using 'domcontentloaded' instead of 'networkidle'
155
+ logger.info('Basic page loaded successfully')
156
+ except Exception as nav_error:
157
+ logger.error(f'Error during navigation: {nav_error}')
158
+ await browser.close()
159
+ return {'categories': get_fallback_resource_data(), 'version': 'unknown'}
160
+
161
+ # Wait for the content to be fully loaded
162
+ logger.info('Waiting for page to render completely')
163
+
164
+ # Add a small fixed delay to let JavaScript finish rendering
165
+ await asyncio.sleep(2)
166
+
167
+ # Extract AWSCC provider version
168
+ provider_version = 'unknown'
169
+ try:
170
+ # Try to extract version using the selector provided
171
+ logger.info('Attempting to extract AWSCC provider version')
172
+
173
+ # Try using the selector approach
174
+ version_element = await page.query_selector(
175
+ 'body > div.provider-view > div.provider-nav > nav.bread-crumbs.is-light > div > div > ul > li:nth-child(4) > span'
176
+ )
177
+ if version_element:
178
+ # Try to extract text from the element
179
+ version_text = await version_element.inner_text()
180
+ logger.debug(f'Found version element with text: {version_text}')
181
+
182
+ # Extract just the version number using regex
183
+ version_match = re.search(r'Version\s+([0-9.]+)', version_text)
184
+ if version_match:
185
+ provider_version = version_match.group(1)  # e.g., "1.36.0"
186
+ logger.info(f'Extracted AWSCC provider version: {provider_version}')
187
+ else:
188
+ # If regex doesn't match, try JavaScript approach
189
+ logger.debug("Regex pattern didn't match, trying JavaScript approach")
190
+ provider_version = await page.evaluate("""
191
+ () => {
192
+ const versionEl = document.querySelector('.version-dropdown button span');
193
+ return versionEl ? versionEl.innerText.trim() : null;
194
+ }
195
+ """)
196
+ # Clean up the version string if needed
197
+ if provider_version:
198
+ provider_version = provider_version.strip()
199
+ version_match = re.search(r'([0-9.]+)', provider_version)
200
+ if version_match:
201
+ provider_version = version_match.group(1)
202
+ logger.info(
203
+ f'Extracted AWSCC provider version via JavaScript: {provider_version}'
204
+ )
205
+ else:
206
+ # If the specific selector doesn't work, try a more general approach
207
+ logger.debug(
208
+ 'Specific version selector not found, trying alternative selectors'
209
+ )
210
+ provider_version = await page.evaluate("""
211
+ () => {
212
+ // Try different selectors that might contain the version
213
+ const selectors = [
214
+ '.version-dropdown button span',
215
+ '.dropdown-trigger button span',
216
+ 'span:contains("Version")'
217
+ ];
218
+ for (const selector of selectors) {
219
+ try {
220
+ const el = document.querySelector(selector);
221
+ if (el && el.innerText.includes('Version')) {
222
+ return el.innerText.trim();
223
+ }
224
+ } catch (e) {}
225
+ }
226
+ return null;
227
+ }
228
+ """)
229
+
230
+ # Extract version number from text if found
231
+ if provider_version:
232
+ version_match = re.search(r'([0-9.]+)', provider_version)
233
+ if version_match:
234
+ provider_version = version_match.group(1)
235
+ logger.info(
236
+ f'Extracted AWSCC provider version via alternative selector: {provider_version}'
237
+ )
238
+ except Exception as version_error:
239
+ logger.warning(f'Error extracting AWSCC provider version: {version_error}')
240
+
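A worked example (the breadcrumb and dropdown strings are illustrative) of the two regexes used above to pull out the version number:

    re.search(r'Version\s+([0-9.]+)', 'Version 1.36.0').group(1)   # -> '1.36.0'
    re.search(r'([0-9.]+)', 'v1.36.0').group(1)                    # -> '1.36.0'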
241
+ # Check for and handle cookie consent banner
242
+ logger.info('Checking for cookie consent banner')
243
+ try:
244
+ # Check if the consent banner is present
245
+ consent_banner = await page.query_selector('#consent-banner')
246
+ if consent_banner:
247
+ logger.info('Cookie consent banner detected, attempting to dismiss')
248
+
249
+ # Target the specific dismiss button based on the HTML structure provided
250
+ dismiss_button_selectors = [
251
+ 'button.hds-button:has-text("Dismiss")',
252
+ 'button.hds-button .hds-button__text:has-text("Dismiss")',
253
+ 'button.hds-button--color-primary',
254
+ ]
255
+
256
+ for selector in dismiss_button_selectors:
257
+ try:
258
+ # Check if the button exists with this selector
259
+ button = await page.query_selector(selector)
260
+ if button:
261
+ logger.info(f'Found dismiss button with selector: {selector}')
262
+ await button.click()
263
+ logger.info('Clicked the dismiss button')
264
+
265
+ # Wait a moment for the banner to disappear
266
+ await asyncio.sleep(1)
267
+
268
+ # Check if the banner is gone
269
+ banner_still_visible = await page.query_selector('#consent-banner')
270
+ if not banner_still_visible:
271
+ logger.info('Banner successfully dismissed')
272
+ break
273
+ except Exception as button_error:
274
+ logger.warning(f'Failed to click button {selector}: {button_error}')
275
+
276
+ # If button clicking didn't work, try JavaScript approach as a fallback
277
+ banner_still_visible = await page.query_selector('#consent-banner')
278
+ if banner_still_visible:
279
+ logger.info('Attempting to remove banner via JavaScript')
280
+ try:
281
+ # Try to remove the banner using JavaScript
282
+ await page.evaluate("""() => {
283
+ const banner = document.getElementById('consent-banner');
284
+ if (banner) banner.remove();
285
+ return true;
286
+ }""")
287
+ logger.info('Removed banner using JavaScript')
288
+ except Exception as js_error:
289
+ logger.warning(f'Failed to remove banner via JavaScript: {js_error}')
290
+ except Exception as banner_error:
291
+ logger.warning(f'Error handling consent banner: {banner_error}')
292
+
293
+ # Progressive wait strategy - try multiple conditions in sequence
294
+ # Define selectors to try in order of preference
295
+ selectors = [
296
+ '.provider-docs-menu-content',
297
+ 'nav',
298
+ '.docs-nav',
299
+ 'aside',
300
+ 'ul.nav',
301
+ 'div[role="navigation"]',
302
+ ]
303
+
304
+ # Try each selector with a short timeout
305
+ for selector in selectors:
306
+ try:
307
+ logger.info(f'Trying to locate element with selector: {selector}')
308
+ await page.wait_for_selector(selector, timeout=5000)
309
+ logger.info(f'Found element with selector: {selector}')
310
+ break
311
+ except Exception as se:
312
+ logger.warning(f"Selector '{selector}' not found: {se}")
313
+
314
+ # Extract the HTML content after JS rendering
315
+ logger.info('Extracting page content')
316
+ content = await page.content()
317
+
318
+ # Save HTML for debugging using tempfile for security
319
+ with tempfile.NamedTemporaryFile(
320
+ prefix='terraform_awscc_debug_playwright_', suffix='.html', mode='w', delete=False
321
+ ) as temp_file:
322
+ temp_file.write(content)
323
+ debug_file_path = temp_file.name
324
+ logger.debug(f'Saved rendered HTML content to {debug_file_path}')
325
+
326
+ # Parse the HTML
327
+ soup = BeautifulSoup(content, 'html.parser')
328
+
329
+ # First try the specific provider-docs-menu-content selector
330
+ menu_content = soup.select_one('.provider-docs-menu-content')
331
+
332
+ if not menu_content:
333
+ logger.warning(
334
+ "Couldn't find the .provider-docs-menu-content element, trying alternatives"
335
+ )
336
+
337
+ # Try each selector that might contain the menu
338
+ for selector in selectors:
339
+ menu_content = soup.select_one(selector)
340
+ if menu_content:
341
+ logger.info(f'Found menu content with selector: {selector}')
342
+ break
343
+
344
+ # If still not found, look for any substantial navigation
345
+ if not menu_content:
346
+ logger.warning("Still couldn't find navigation using standard selectors")
347
+
348
+ # Try to find any element with many links as a potential menu
349
+ potential_menus = []
350
+ for elem in safe_find_all(soup, ['div', 'nav', 'ul']):
351
+ links = safe_find_all(elem, 'a')
352
+ if len(links) > 10: # Any element with many links might be navigation
353
+ potential_menus.append((elem, len(links)))
354
+
355
+ # Sort by number of links, highest first
356
+ potential_menus.sort(key=lambda x: x[1], reverse=True)
357
+
358
+ if potential_menus:
359
+ menu_content = potential_menus[0][0]
360
+ logger.info(f'Using element with {potential_menus[0][1]} links as menu')
361
+
362
+ # If we still have nothing, use fallback
363
+ if not menu_content:
364
+ logger.error("Couldn't find any navigation element, using fallback data")
365
+ await browser.close()
366
+ return {'categories': get_fallback_resource_data(), 'version': 'unknown'}
367
+
368
+ # Find all category titles (excluding 'guides' and 'functions')
369
+ category_titles = menu_content.select('.menu-list-category-link-title')
370
+
371
+ if not category_titles:
372
+ logger.error("Couldn't find any .menu-list-category-link-title elements")
373
+ await browser.close()
374
+ return {'categories': get_fallback_resource_data(), 'version': 'unknown'}
375
+
376
+ logger.info(f'Found {len(category_titles)} category titles')
377
+
378
+ # First collect all categories that we need to process
379
+ categories_to_process = []
380
+ for category_el in category_titles:
381
+ category_name = category_el.get_text(strip=True)
382
+
383
+ # Skip non-service entries like 'Guides' and 'Functions'
384
+ if category_name.lower() in ['guides', 'functions', 'awscc provider']:
385
+ logger.debug(f'Skipping category: {category_name}')
386
+ continue
387
+
388
+ logger.debug(f'Will process category: {category_name}')
389
+ categories_to_process.append((category_name, category_el))
390
+
391
+ # Initialize category entry
392
+ categories[category_name] = {'resources': [], 'data_sources': []}
393
+
394
+ # Process a smaller set of categories if there are too many (for testing/development)
395
+ MAX_CATEGORIES = int(os.environ.get('MAX_CATEGORIES', '999'))
396
+ if len(categories_to_process) > MAX_CATEGORIES:
397
+ logger.info(
398
+ f'Limiting to {MAX_CATEGORIES} categories (from {len(categories_to_process)})'
399
+ )
400
+ categories_to_process = categories_to_process[:MAX_CATEGORIES]
401
+
402
+ logger.info(
403
+ f'Processing {len(categories_to_process)} categories with click interaction'
404
+ )
405
+
406
+ # Now process each category by clicking on it first
407
+ for category_idx, (category_name, category_el) in enumerate(categories_to_process):
408
+ try:
409
+ # Get the DOM path or some identifier for this category
410
+ # Try to find a unique identifier for the category to click on
411
+ # First, try to get the href attribute from the parent <a> tag
412
+ href = None
413
+ parent_a = category_el.parent
414
+ if parent_a and parent_a.name == 'a':
415
+ href = parent_a.get('href')
416
+
417
+ logger.info(
418
+ f'[{category_idx + 1}/{len(categories_to_process)}] Clicking on category: {category_name}'
419
+ )
420
+
421
+ # Handle potential cookie consent banner interference
422
+ try:
423
+ # Check if banner reappeared
424
+ consent_banner = await page.query_selector('#consent-banner')
425
+ if consent_banner:
426
+ logger.info(
427
+ 'Cookie consent banner detected again, removing via JavaScript'
428
+ )
429
+ await page.evaluate("""() => {
430
+ const banner = document.getElementById('consent-banner');
431
+ if (banner) banner.remove();
432
+ return true;
433
+ }""")
434
+ except Exception:
435
+ pass # Ignore errors in this extra banner check
436
+
437
+ # Click with increased timeout and multiple attempts
438
+ click_success = False
439
+ click_attempts = 0
440
+ max_attempts = 3
441
+
442
+ while not click_success and click_attempts < max_attempts:
443
+ click_attempts += 1
444
+ try:
445
+ if href:
446
+ # If we have an href, use that to locate the element
447
+ try:
448
+ selector = f"a[href='{href}']"
449
+ await page.click(selector, timeout=8000) # Increased timeout
450
+ logger.debug(
451
+ f'Clicked category using href selector: {selector}'
452
+ )
453
+ click_success = True
454
+ except Exception as click_error:
455
+ logger.warning(
456
+ f'Failed to click using href, trying text: {click_error}'
457
+ )
458
+ # If that fails, try to click by text content
459
+ escaped_name = category_name.replace("'", "\\'")
460
+ await page.click(
461
+ f"text='{escaped_name}'", timeout=8000
462
+ ) # Increased timeout
463
+ click_success = True
464
+ else:
465
+ # Otherwise try to click by text content
466
+ escaped_name = category_name.replace("'", "\\'")
467
+ await page.click(
468
+ f"text='{escaped_name}'", timeout=8000
469
+ ) # Increased timeout
470
+ click_success = True
471
+
472
+ except Exception as click_error:
473
+ logger.warning(
474
+ f'Click attempt {click_attempts} failed for {category_name}: {click_error}'
475
+ )
476
+ if click_attempts >= max_attempts:
477
+ logger.error(
478
+ f'Failed to click category {category_name} after {max_attempts} attempts'
479
+ )
480
+ # Re-raise; the outer category handler logs the failure and moves on to the next category
481
+ raise click_error
482
+
483
+ # Try removing any overlays before next attempt
484
+ try:
485
+ await page.evaluate("""() => {
486
+ // Remove common overlay patterns
487
+ document.querySelectorAll('[id*="banner"],[id*="overlay"],[id*="popup"],[class*="banner"],[class*="overlay"],[class*="popup"]')
488
+ .forEach(el => el.remove());
489
+ return true;
490
+ }""")
491
+ await asyncio.sleep(0.5) # Brief pause between attempts
492
+ except Exception:
493
+ pass # Ignore errors in overlay removal
494
+
495
+ # Wait briefly for content to load
496
+ await asyncio.sleep(0.3)
497
+
498
+ # Extract resources and data sources from the now-expanded category
499
+ # We need to use the HTML structure to locate the specific sections for this category
500
+ try:
501
+ # Get the updated HTML after clicking
502
+ current_html = await page.content()
503
+ current_soup = BeautifulSoup(current_html, 'html.parser')
504
+
505
+ resource_count = 0
506
+ data_source_count = 0
507
+
508
+ # Find the clicked category element in the updated DOM
509
+ # This is important because the structure changes after clicking
510
+ # First, find the category span by its text
511
+ category_spans = safe_find_all(
512
+ current_soup, 'span', class_='menu-list-category-link-title'
513
+ )
514
+ clicked_category_span = None
515
+ for span in category_spans:
516
+ if safe_get_text(span, strip=True) == category_name:
517
+ clicked_category_span = span
518
+ break
519
+
520
+ if not clicked_category_span:
521
+ logger.warning(
522
+ f'Could not find clicked category {category_name} in updated DOM'
523
+ )
524
+ continue
525
+
526
+ # Navigate up to find the parent LI, which contains all content for this category
527
+ parent_li = ensure_tag(clicked_category_span.find_parent('li'))
528
+ if not parent_li:
529
+ logger.warning(
530
+ f'Could not find parent LI for category {category_name}'
531
+ )
532
+ continue
533
+
534
+ # Find the ul.menu-list that contains both Resources and Data Sources sections
535
+ category_menu_list = safe_find(parent_li, 'ul', class_='menu-list')
536
+ if not category_menu_list:
537
+ logger.warning(
538
+ f'Could not find menu-list for category {category_name}'
539
+ )
540
+ continue
541
+
542
+ # Process Resources section
543
+ # Find the span with text "Resources"
544
+ resource_spans = safe_find_all(
545
+ category_menu_list, 'span', class_='menu-list-category-link-title'
546
+ )
547
+ resource_section = None
548
+ for span in resource_spans:
549
+ if safe_get_text(span, strip=True) == 'Resources':
550
+ resource_section_li = ensure_tag(span.find_parent('li'))
551
+ if resource_section_li:
552
+ resource_section = safe_find(
553
+ resource_section_li, 'ul', class_='menu-list'
554
+ )
555
+ break
556
+
557
+ # If we can't find the Resources section using the span approach,
558
+ # try alternative methods
559
+ if not resource_section:
560
+ # Look for any UL that might contain resource links
561
+ potential_resource_sections = safe_find_all(category_menu_list, 'ul')
562
+ for ul in potential_resource_sections:
563
+ # Check if this UL contains links that look like resources
564
+ links = safe_find_all(ul, 'a')
565
+ for link in links:
566
+ link_text = safe_get_text(link, strip=True)
567
+ # AWSCC resources typically start with "awscc_"
568
+ if (
569
+ isinstance(link_text, str)
570
+ and link_text.startswith('awscc_')
571
+ and '_data_' not in link_text.lower()
572
+ ):
573
+ resource_section = ul
574
+ break
575
+ if resource_section:
576
+ break
577
+
578
+ # Extract resources
579
+ if resource_section:
580
+ # Try both menu-list-link class and direct a tags
581
+ resource_links = safe_find_all(
582
+ resource_section, 'li', class_='menu-list-link'
583
+ )
584
+
585
+ # If no menu-list-link items found, try direct a tags
586
+ if not resource_links:
587
+ resource_links = safe_find_all(resource_section, 'a')
588
+
589
+ for item in resource_links:
590
+ # If item is a link itself (a tag)
591
+ if isinstance(item, Tag) and item.name == 'a':
592
+ link = item
593
+ else:
594
+ # If item is a container (li), find the link inside
595
+ link = safe_find(item, 'a')
596
+
597
+ if not link:
598
+ continue
599
+
600
+ href = link.get('href') if isinstance(link, Tag) else None
601
+ if not href:
602
+ continue
603
+
604
+ link_text = safe_get_text(link, strip=True)
605
+ if not link_text:
606
+ continue
607
+
608
+ # Skip if this doesn't look like an AWSCC resource
609
+ if not isinstance(link_text, str) or not link_text.startswith(
610
+ 'awscc_'
611
+ ):
612
+ continue
613
+
614
+ # Skip data sources (they'll be handled separately)
615
+ if isinstance(link_text, str) and '_data_' in link_text.lower():
616
+ continue
617
+
618
+ # Complete the URL if it's a relative path
619
+ full_url = (
620
+ f'https://registry.terraform.io{href}'
621
+ if isinstance(href, str) and href.startswith('/')
622
+ else href
623
+ )
624
+
625
+ # Add to resources
626
+ resource = {'name': link_text, 'url': full_url, 'type': 'resource'}
627
+
628
+ categories[category_name]['resources'].append(resource)
629
+ resource_count += 1
630
+
631
+ # Process Data Sources section
632
+ # Find the span with text "Data Sources"
633
+ data_spans = safe_find_all(
634
+ category_menu_list, 'span', class_='menu-list-category-link-title'
635
+ )
636
+ data_section = None
637
+ for span in data_spans:
638
+ if safe_get_text(span, strip=True) == 'Data Sources':
639
+ data_section_li = ensure_tag(span.find_parent('li'))
640
+ if data_section_li:
641
+ data_section = safe_find(
642
+ data_section_li, 'ul', class_='menu-list'
643
+ )
644
+ break
645
+
646
+ # If we can't find the Data Sources section using the span approach,
647
+ # try alternative methods
648
+ if not data_section:
649
+ # Look for any UL that might contain data source links
650
+ potential_data_sections = safe_find_all(category_menu_list, 'ul')
651
+ for ul in potential_data_sections:
652
+ # Check if this UL contains links that look like data sources
653
+ links = safe_find_all(ul, 'a')
654
+ for link in links:
655
+ link_text = safe_get_text(link, strip=True)
656
+ href_attr = (
657
+ link.get('href', '') if isinstance(link, Tag) else ''
658
+ )
659
+
660
+ # Data sources typically have "data" in the URL or name
661
+ if (
662
+ isinstance(link_text, str)
663
+ and link_text.startswith('awscc_')
664
+ and (
665
+ (
666
+ isinstance(href_attr, str)
667
+ and 'data' in href_attr.lower()
668
+ )
669
+ or (
670
+ isinstance(link_text, str)
671
+ and 'data' in link_text.lower()
672
+ )
673
+ )
674
+ ):
675
+ data_section = ul
676
+ break
677
+ if data_section:
678
+ break
679
+
680
+ # Extract data sources
681
+ if data_section:
682
+ # Try both menu-list-link class and direct a tags
683
+ data_links = safe_find_all(data_section, 'li', class_='menu-list-link')
684
+
685
+ # If no menu-list-link items found, try direct a tags
686
+ if not data_links:
687
+ data_links = safe_find_all(data_section, 'a')
688
+
689
+ for item in data_links:
690
+ # If item is a link itself (a tag)
691
+ if isinstance(item, Tag) and item.name == 'a':
692
+ link = item
693
+ else:
694
+ # If item is a container (li), find the link inside
695
+ link = safe_find(item, 'a')
696
+
697
+ if not link:
698
+ continue
699
+
700
+ href = link.get('href') if isinstance(link, Tag) else None
701
+ if not href:
702
+ continue
703
+
704
+ link_text = safe_get_text(link, strip=True)
705
+ if not link_text:
706
+ continue
707
+
708
+ # Skip if this doesn't look like an AWSCC data source
709
+ if not isinstance(link_text, str) or not link_text.startswith(
710
+ 'awscc_'
711
+ ):
712
+ continue
713
+
714
+ # Make sure it's a data source (contains "data" in URL or name)
715
+ if not (
716
+ (isinstance(href, str) and 'data' in href.lower())
717
+ or (isinstance(link_text, str) and 'data' in link_text.lower())
718
+ ):
719
+ continue
720
+
721
+ # Complete the URL if it's a relative path
722
+ full_url = (
723
+ f'https://registry.terraform.io{href}'
724
+ if isinstance(href, str) and href.startswith('/')
725
+ else href
726
+ )
727
+
728
+ # Add to data sources
729
+ data_source = {
730
+ 'name': link_text,
731
+ 'url': full_url,
732
+ 'type': 'data_source',
733
+ }
734
+
735
+ categories[category_name]['data_sources'].append(data_source)
736
+ data_source_count += 1
737
+
738
+ # If we still haven't found any resources or data sources,
739
+ # try a more aggressive approach by looking at all links in the category
740
+ if resource_count == 0 and data_source_count == 0:
741
+ all_links = safe_find_all(category_menu_list, 'a')
742
+ for link in all_links:
743
+ href = link.get('href', '') if isinstance(link, Tag) else ''
744
+ link_text = safe_get_text(link, strip=True)
745
+
746
+ if not isinstance(link_text, str) or not link_text.startswith(
747
+ 'awscc_'
748
+ ):
749
+ continue
750
+
751
+ # Complete the URL if it's a relative path
752
+ full_url = (
753
+ f'https://registry.terraform.io{href}'
754
+ if isinstance(href, str) and href.startswith('/')
755
+ else href
756
+ )
757
+
758
+ # Determine if it's a resource or data source based on URL/name
759
+ if isinstance(href, str) and (
760
+ 'data' in href.lower() or 'data-source' in href.lower()
761
+ ):
762
+ data_source = {
763
+ 'name': link_text,
764
+ 'url': full_url,
765
+ 'type': 'data_source',
766
+ }
767
+ categories[category_name]['data_sources'].append(data_source)
768
+ data_source_count += 1
769
+ else:
770
+ resource = {
771
+ 'name': link_text,
772
+ 'url': full_url,
773
+ 'type': 'resource',
774
+ }
775
+ categories[category_name]['resources'].append(resource)
776
+ resource_count += 1
777
+
778
+ logger.info(
779
+ f'Category {category_name}: found {resource_count} resources, {data_source_count} data sources'
780
+ )
781
+
782
+ except Exception as extract_error:
783
+ logger.error(
784
+ f'Error extracting resources for {category_name}: {extract_error}'
785
+ )
786
+
787
+ except Exception as click_error:
788
+ logger.warning(
789
+ f'Error interacting with category {category_name}: {click_error}'
790
+ )
791
+
792
+ # Close the browser
793
+ await browser.close()
794
+
795
+ # Count statistics for logging
796
+ service_count = len(categories)
797
+ resource_count = sum(len(cat['resources']) for cat in categories.values())
798
+ data_source_count = sum(len(cat['data_sources']) for cat in categories.values())
799
+
800
+ duration = time.time() - start_time
801
+ logger.info(
802
+ f'Extracted {service_count} service categories with {resource_count} resources and {data_source_count} data sources in {duration:.2f} seconds'
803
+ )
804
+
805
+ # Return the structure if we have data
806
+ if service_count > 0:
807
+ return {'categories': categories, 'version': provider_version}
808
+ else:
809
+ logger.warning('No categories found, using fallback data')
810
+ return {'categories': get_fallback_resource_data(), 'version': 'unknown'}
811
+
812
+ except Exception as e:
813
+ logger.error(f'Error extracting AWSCC provider resources: {str(e)}')
814
+ # Return fallback data in case of error
815
+ return {'categories': get_fallback_resource_data(), 'version': 'unknown'}
816
+
817
+
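The dictionary returned by fetch_awscc_provider_page() (and assembled by the fallback below) has roughly this shape; the category name, version, and URLs are illustrative:

    {
        'version': '1.36.0',   # or 'unknown' when version extraction fails
        'categories': {
            'Amazon S3': {
                'resources': [
                    {'name': 'awscc_s3_bucket', 'url': 'https://registry.terraform.io/...', 'type': 'resource'},
                ],
                'data_sources': [
                    {'name': 'awscc_s3_bucket', 'url': 'https://registry.terraform.io/...', 'type': 'data_source'},
                ],
            },
        },
    }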
818
+ def get_fallback_resource_data():
819
+ """Provide fallback resource data in case the scraping fails.
820
+
821
+ Returns:
822
+ A dictionary with pre-defined AWSCC resources and data sources
823
+ """
824
+ logger.warning('Using pre-defined resource structure as fallback')
825
+
826
+ # The AWSCC provider has a different structure than the AWS provider
827
+ # It has two main categories: Resources and Data Sources
828
+ categories = {
829
+ 'Resources': {
830
+ 'resources': [
831
+ {
832
+ 'name': 'awscc_accessanalyzer_analyzer',
833
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/accessanalyzer_analyzer',
834
+ 'type': 'resource',
835
+ },
836
+ {
837
+ 'name': 'awscc_acmpca_certificate',
838
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_certificate',
839
+ 'type': 'resource',
840
+ },
841
+ {
842
+ 'name': 'awscc_acmpca_certificate_authority',
843
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_certificate_authority',
844
+ 'type': 'resource',
845
+ },
846
+ {
847
+ 'name': 'awscc_acmpca_certificate_authority_activation',
848
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_certificate_authority_activation',
849
+ 'type': 'resource',
850
+ },
851
+ {
852
+ 'name': 'awscc_acmpca_permission',
853
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_permission',
854
+ 'type': 'resource',
855
+ },
856
+ # Add more resources as needed
857
+ ],
858
+ 'data_sources': [],
859
+ },
860
+ 'Data Sources': {
861
+ 'resources': [],
862
+ 'data_sources': [
863
+ {
864
+ 'name': 'awscc_accessanalyzer_analyzer',
865
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/data-sources/accessanalyzer_analyzer',
866
+ 'type': 'data_source',
867
+ },
868
+ {
869
+ 'name': 'awscc_accessanalyzer_analyzers',
870
+ 'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/data-sources/accessanalyzer_analyzers',
871
+ 'type': 'data_source',
872
+ },
873
+ # Add more data sources as needed
874
+ ],
875
+ },
876
+ }
877
+ return categories
878
+
879
+
880
+ def parse_arguments():
881
+ """Parse command line arguments."""
882
+ parser = argparse.ArgumentParser(
883
+ description='Generate AWSCC provider resources markdown for the Terraform Expert MCP server.'
884
+ )
885
+ parser.add_argument(
886
+ '--max-categories',
887
+ type=int,
888
+ default=999,
889
+ help='Limit to N categories (default: all)',
890
+ )
891
+ parser.add_argument(
892
+ '--output',
893
+ type=Path,
894
+ default=DEFAULT_OUTPUT_PATH,
895
+ help=f'Output file path (default: {DEFAULT_OUTPUT_PATH})',
896
+ )
897
+ parser.add_argument(
898
+ '--no-fallback',
899
+ action='store_true',
900
+ help="Don't use fallback data if scraping fails",
901
+ )
902
+ return parser.parse_args()
903
+
904
+
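A small sketch of what parse_arguments() yields (overriding sys.argv here is an assumption of the sketch, not something the script does):

    sys.argv = ['generate_awscc_provider_resources.py', '--max-categories', '5']
    args = parse_arguments()
    print(args.max_categories)   # 5
    print(args.output)           # e.g. <repo_root>/awslabs/terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md
    print(args.no_fallback)      # False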
905
+ async def main():
906
+ """Main entry point for the script."""
907
+ start_time = datetime.now()
908
+
909
+ # Parse command line arguments
910
+ args = parse_arguments()
911
+
912
+ print('Generating AWSCC provider resources markdown...')
913
+ print(f'Output path: {args.output}')
914
+ print(f'Max categories: {args.max_categories if args.max_categories < 999 else "all"}')
915
+
916
+ # Set environment variable for max categories
917
+ os.environ['MAX_CATEGORIES'] = str(args.max_categories)
918
+
919
+ # Set environment variable for fallback behavior (note: the module-level USE_PLAYWRIGHT flag was already read at import time, so this assignment does not change it)
920
+ if args.no_fallback:
921
+ os.environ['USE_PLAYWRIGHT'] = '1'
922
+ print('Using live scraping without fallback')
923
+
924
+ try:
925
+ # Fetch AWSCC provider data using the existing implementation
926
+ result = await fetch_awscc_provider_page()
927
+
928
+ # Extract categories and version
929
+ if isinstance(result, dict) and 'categories' in result and 'version' in result:
930
+ categories = result['categories']
931
+ provider_version = result.get('version', 'unknown')
932
+ else:
933
+ # Handle backward compatibility with older API
934
+ categories = result
935
+ provider_version = 'unknown'
936
+
937
+ # Sort categories alphabetically
938
+ sorted_categories = sorted(categories.keys())
939
+
940
+ # Count totals
941
+ total_resources = sum(len(cat['resources']) for cat in categories.values())
942
+ total_data_sources = sum(len(cat['data_sources']) for cat in categories.values())
943
+
944
+ print(
945
+ f'Found {len(categories)} categories, {total_resources} resources, and {total_data_sources} data sources'
946
+ )
947
+
948
+ # Generate markdown
949
+ markdown = []
950
+ markdown.append('# AWSCC Provider Resources Listing')
951
+ markdown.append(f'\nAWSCC Provider Version: {provider_version}')
952
+ markdown.append(f'\nLast updated: {datetime.now().strftime("%B %d, %Y %H:%M:%S")}')
953
+ markdown.append(
954
+ f'\nFound {total_resources} resources and {total_data_sources} data sources across {len(categories)} AWSCC service categories.\n'
955
+ )
956
+
957
+ # Generate table of contents
958
+ # markdown.append('## Table of Contents')
959
+ # for category in sorted_categories:
960
+ # sanitized_category = (
961
+ # category.replace(' ', '-').replace('(', '').replace(')', '').lower()
962
+ # )
963
+ # markdown.append(f'- [{category}](#{sanitized_category})')
964
+ # markdown.append('')
965
+
966
+ # Generate content for each category
967
+ for category in sorted_categories:
968
+ cat_data = categories[category]
969
+ sanitized_heading = category.replace('(', '').replace(')', '')
970
+
971
+ markdown.append(f'## {sanitized_heading}')
972
+
973
+ resource_count = len(cat_data['resources'])
974
+ data_source_count = len(cat_data['data_sources'])
975
+
976
+ # Add category summary
977
+ markdown.append(
978
+ f'\n*{resource_count} resources and {data_source_count} data sources*\n'
979
+ )
980
+
981
+ # Add resources section if available
982
+ if cat_data['resources']:
983
+ markdown.append('### Resources')
984
+ for resource in sorted(cat_data['resources'], key=lambda x: x['name']):
985
+ markdown.append(f'- [{resource["name"]}]({resource["url"]})')
986
+
987
+ # Add data sources section if available
988
+ if cat_data['data_sources']:
989
+ markdown.append('\n### Data Sources')
990
+ for data_source in sorted(cat_data['data_sources'], key=lambda x: x['name']):
991
+ markdown.append(f'- [{data_source["name"]}]({data_source["url"]})')
992
+
993
+ markdown.append('') # Add blank line between categories
994
+
995
+ # Add generation metadata at the end
996
+ duration = datetime.now() - start_time
997
+ markdown.append('---')
998
+ markdown.append(
999
+ '*This document was generated automatically by the AWSCC Provider Resources Generator script.*'
1000
+ )
1001
+ markdown.append(f'*Generation time: {duration.total_seconds():.2f} seconds*')
1002
+
1003
+ # Ensure directory exists
1004
+ args.output.parent.mkdir(parents=True, exist_ok=True)
1005
+
1006
+ # Write markdown to output file
1007
+ with open(args.output, 'w') as f:
1008
+ f.write('\n'.join(markdown))
1009
+
1010
+ print(f'Successfully generated markdown file at: {args.output}')
1011
+ print(f'Generation completed in {duration.total_seconds():.2f} seconds')
1012
+ return 0
1013
+
1014
+ except Exception as e:
1015
+ print(f'Error generating AWSCC provider resources: {str(e)}', file=sys.stderr)
1016
+ return 1
1017
+
1018
+
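Given the appends above, the generated AWSCC_PROVIDER_RESOURCES.md opens roughly like this excerpt (version, date, counts, and the category shown are illustrative):

    # AWSCC Provider Resources Listing

    AWSCC Provider Version: 1.36.0

    Last updated: January 01, 2025 12:00:00

    Found 980 resources and 950 data sources across 200 AWSCC service categories.

    ## Amazon S3

    *2 resources and 1 data sources*

    ### Resources
    - [awscc_s3_access_point](https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/s3_access_point)
    - [awscc_s3_bucket](https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/s3_bucket)

    ### Data Sources
    - [awscc_s3_bucket](https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/data-sources/s3_bucket)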
1019
+ if __name__ == '__main__':
1020
+ sys.exit(asyncio.run(main()))