awslabs.terraform-mcp-server 0.0.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of awslabs.terraform-mcp-server might be problematic.
- awslabs/__init__.py +2 -0
- awslabs/terraform_mcp_server/__init__.py +3 -0
- awslabs/terraform_mcp_server/impl/resources/__init__.py +11 -0
- awslabs/terraform_mcp_server/impl/resources/terraform_aws_provider_resources_listing.py +52 -0
- awslabs/terraform_mcp_server/impl/resources/terraform_awscc_provider_resources_listing.py +55 -0
- awslabs/terraform_mcp_server/impl/tools/__init__.py +15 -0
- awslabs/terraform_mcp_server/impl/tools/execute_terraform_command.py +206 -0
- awslabs/terraform_mcp_server/impl/tools/run_checkov_scan.py +359 -0
- awslabs/terraform_mcp_server/impl/tools/search_aws_provider_docs.py +677 -0
- awslabs/terraform_mcp_server/impl/tools/search_awscc_provider_docs.py +627 -0
- awslabs/terraform_mcp_server/impl/tools/search_specific_aws_ia_modules.py +444 -0
- awslabs/terraform_mcp_server/impl/tools/utils.py +558 -0
- awslabs/terraform_mcp_server/models/__init__.py +27 -0
- awslabs/terraform_mcp_server/models/models.py +260 -0
- awslabs/terraform_mcp_server/scripts/generate_aws_provider_resources.py +1224 -0
- awslabs/terraform_mcp_server/scripts/generate_awscc_provider_resources.py +1020 -0
- awslabs/terraform_mcp_server/scripts/scrape_aws_terraform_best_practices.py +129 -0
- awslabs/terraform_mcp_server/server.py +329 -0
- awslabs/terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md +3125 -0
- awslabs/terraform_mcp_server/static/AWS_PROVIDER_RESOURCES.md +3833 -0
- awslabs/terraform_mcp_server/static/AWS_TERRAFORM_BEST_PRACTICES.md +2523 -0
- awslabs/terraform_mcp_server/static/MCP_INSTRUCTIONS.md +126 -0
- awslabs/terraform_mcp_server/static/TERRAFORM_WORKFLOW_GUIDE.md +198 -0
- awslabs/terraform_mcp_server/static/__init__.py +22 -0
- awslabs/terraform_mcp_server/tests/__init__.py +1 -0
- awslabs/terraform_mcp_server/tests/run_tests.sh +35 -0
- awslabs/terraform_mcp_server/tests/test_parameter_annotations.py +207 -0
- awslabs/terraform_mcp_server/tests/test_tool_implementations.py +309 -0
- awslabs_terraform_mcp_server-0.0.1.dist-info/METADATA +97 -0
- awslabs_terraform_mcp_server-0.0.1.dist-info/RECORD +32 -0
- awslabs_terraform_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_terraform_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
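The diff below covers awslabs/terraform_mcp_server/scripts/generate_awscc_provider_resources.py, the 1020-line generator script listed above. If you want to check the file list against the artifact itself, a minimal standard-library sketch is shown here; it is not part of the package, and the wheel filename is an assumption derived from the dist-info entry above (the name your mirror serves may differ).

```python
# Hypothetical helper, not part of the package: list the files inside the downloaded
# wheel so they can be compared with the RECORD and the listing above.
from zipfile import ZipFile

WHEEL = 'awslabs_terraform_mcp_server-0.0.1-py3-none-any.whl'  # assumed filename

with ZipFile(WHEEL) as whl:
    for name in whl.namelist():
        print(name)
```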
awslabs/terraform_mcp_server/scripts/generate_awscc_provider_resources.py
@@ -0,0 +1,1020 @@
"""Script to generate AWSCC provider resources markdown for the Terraform Expert MCP server.

This script scrapes the Terraform AWSCC provider documentation using Playwright
and generates a comprehensive markdown file listing all AWS service categories,
resources, and data sources.

The generated markdown is saved to the static directory for use by the MCP server.

Usage:
    python generate_awscc_provider_resources.py [--max-categories N] [--output PATH]

Options:
    --max-categories N    Limit to N categories (default: all)
    --output PATH         Output file path (default: terraform_mcp_server/static/AWSCC_PROVIDER_RESOURCES.md)
    --no-fallback         Don't use fallback data if scraping fails
"""

import argparse
import asyncio
import os
import re
import sys
import tempfile
import time
from bs4 import BeautifulSoup, Tag
from bs4.element import PageElement, ResultSet
from bs4.filter import SoupStrainer
from datetime import datetime
from loguru import logger
from pathlib import Path
from typing import Any, Optional, TypeVar


# Type helpers for BeautifulSoup
T = TypeVar('T')


def ensure_tag(element: Optional[PageElement]) -> Optional[Tag]:
    """Ensure an element is a Tag or return None."""
    if isinstance(element, Tag):
        return element
    return None


def safe_find(element: Any, *args: Any, **kwargs: Any) -> Optional[Tag]:
    """Safely find an element in a Tag."""
    if not isinstance(element, Tag):
        return None
    result = element.find(*args, **kwargs)
    return ensure_tag(result)


def safe_find_all(element: Any, *args: Any, **kwargs: Any) -> ResultSet:
    """Safely find all elements in a Tag."""
    if not isinstance(element, Tag):
        return ResultSet(SoupStrainer(), [])
    return element.find_all(*args, **kwargs)


def safe_get_text(element: Any, strip: bool = False) -> str:
    """Safely get text from an element."""
    if hasattr(element, 'get_text'):
        return element.get_text(strip=strip)
    return str(element) if element is not None else ''


## Playwright optional import
try:
    from playwright.async_api import async_playwright
except ImportError:
    # Playwright is optional, we'll use fallback data if it's not available
    async_playwright = None

# Add the parent directory to sys.path so we can import from terraform_mcp_server
script_dir = Path(__file__).resolve().parent
repo_root = script_dir.parent.parent.parent
sys.path.insert(0, str(repo_root))


# Configure logger for enhanced diagnostics with stacktraces
logger.configure(
    handlers=[
        {
            'sink': sys.stderr,
            'backtrace': True,
            'diagnose': True,
            'format': '<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>',
        }
    ]
)

# Environment variable to control whether to use Playwright or go straight to fallback data
USE_PLAYWRIGHT = os.environ.get('USE_PLAYWRIGHT', '1').lower() in ('1', 'true', 'yes')
# Shorter timeout to fail faster if it's not going to work
NAVIGATION_TIMEOUT = 20000  # 20 seconds
# Default output path
DEFAULT_OUTPUT_PATH = (
    repo_root / 'awslabs' / 'terraform_mcp_server' / 'static' / 'AWSCC_PROVIDER_RESOURCES.md'
)
# AWSCC provider URL
AWSCC_PROVIDER_URL = 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs'


async def fetch_awscc_provider_page():
    """Fetch the AWSCC provider documentation page using Playwright.

    This function uses a headless browser to render the JavaScript-driven
    Terraform Registry website and extract the AWSCC provider resources.

    It will fall back to pre-defined data if:
    - The USE_PLAYWRIGHT environment variable is set to 0/false/no
    - There's any error during the scraping process

    Returns:
        A dictionary containing:
        - 'categories': Dictionary of AWSCC service categories with resources and data sources
        - 'version': AWSCC provider version string (e.g., "1.36.0")
    """
    # Check if we should skip Playwright or if it's not available
    if not USE_PLAYWRIGHT or async_playwright is None:
        logger.info(
            'Skipping Playwright and using pre-defined resource structure (USE_PLAYWRIGHT=0)'
        )
        return {'categories': get_fallback_resource_data(), 'version': 'unknown'}

    logger.info('Starting browser to extract AWSCC provider resources structure')
    start_time = time.time()
    categories = {}

    try:
        async with async_playwright() as p:
            # Launch the browser with specific options for better performance
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-dev-shm-usage', '--no-sandbox', '--disable-setuid-sandbox'],
            )
            context = await browser.new_context(
                viewport={'width': 1280, 'height': 800},
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            )
            page = await context.new_page()

            # Set a shorter timeout for navigation
            page.set_default_timeout(NAVIGATION_TIMEOUT)

            # Navigate to the AWS provider docs with reduced timeout
            logger.info(
                f'Navigating to Terraform AWSCC provider documentation (timeout: {NAVIGATION_TIMEOUT}ms)'
            )
            try:
                await page.goto(
                    AWSCC_PROVIDER_URL,
                    wait_until='domcontentloaded',
                )  # Using 'domcontentloaded' instead of 'networkidle'
                logger.info('Basic page loaded successfully')
            except Exception as nav_error:
                logger.error(f'Error during navigation: {nav_error}')
                await browser.close()
                return {'categories': get_fallback_resource_data(), 'version': 'unknown'}

            # Wait for the content to be fully loaded
            logger.info('Waiting for page to render completely')

            # Add a small fixed delay to let JavaScript finish rendering
            await asyncio.sleep(2)

            # Extract AWS provider version
            provider_version = 'unknown'
            try:
                # Try to extract version using the selector provided
                logger.info('Attempting to extract AWSCC provider version')

                # Try using the selector approach
                version_element = await page.query_selector(
                    'body > div.provider-view > div.provider-nav > nav.bread-crumbs.is-light > div > div > ul > li:nth-child(4) > span'
                )
                if version_element:
                    # Try to extract text from the element
                    version_text = await version_element.inner_text()
                    logger.debug(f'Found version element with text: {version_text}')

                    # Extract just the version number using regex
                    version_match = re.search(r'Version\s+([0-9.]+)', version_text)
                    if version_match:
                        provider_version = version_match.group(1)  # e.g., "5.91.0"
                        logger.info(f'Extracted AWSCC provider version: {provider_version}')
                    else:
                        # If regex doesn't match, try JavaScript approach
                        logger.debug("Regex pattern didn't match, trying JavaScript approach")
                        provider_version = await page.evaluate("""
                            () => {
                                const versionEl = document.querySelector('.version-dropdown button span');
                                return versionEl ? versionEl.innerText.trim() : null;
                            }
                        """)
                        # Clean up the version string if needed
                        if provider_version:
                            provider_version = provider_version.strip()
                            version_match = re.search(r'([0-9.]+)', provider_version)
                            if version_match:
                                provider_version = version_match.group(1)
                            logger.info(
                                f'Extracted AWS provider version via JavaScript: {provider_version}'
                            )
                else:
                    # If the specific selector doesn't work, try a more general approach
                    logger.debug(
                        'Specific version selector not found, trying alternative selectors'
                    )
                    provider_version = await page.evaluate("""
                        () => {
                            // Try different selectors that might contain the version
                            const selectors = [
                                '.version-dropdown button span',
                                '.dropdown-trigger button span',
                                'span:contains("Version")'
                            ];
                            for (const selector of selectors) {
                                try {
                                    const el = document.querySelector(selector);
                                    if (el && el.innerText.includes('Version')) {
                                        return el.innerText.trim();
                                    }
                                } catch (e) {}
                            }
                            return null;
                        }
                    """)

                    # Extract version number from text if found
                    if provider_version:
                        version_match = re.search(r'([0-9.]+)', provider_version)
                        if version_match:
                            provider_version = version_match.group(1)
                            logger.info(
                                f'Extracted AWSCC provider version via alternative selector: {provider_version}'
                            )
            except Exception as version_error:
                logger.warning(f'Error extracting AWSCC provider version: {version_error}')

            # Check for and handle cookie consent banner
            logger.info('Checking for cookie consent banner')
            try:
                # Check if the consent banner is present
                consent_banner = await page.query_selector('#consent-banner')
                if consent_banner:
                    logger.info('Cookie consent banner detected, attempting to dismiss')

                    # Target the specific dismiss button based on the HTML structure provided
                    dismiss_button_selectors = [
                        'button.hds-button:has-text("Dismiss")',
                        'button.hds-button .hds-button__text:has-text("Dismiss")',
                        'button.hds-button--color-primary',
                    ]

                    for selector in dismiss_button_selectors:
                        try:
                            # Check if the button exists with this selector
                            button = await page.query_selector(selector)
                            if button:
                                logger.info(f'Found dismiss button with selector: {selector}')
                                await button.click()
                                logger.info('Clicked the dismiss button')

                                # Wait a moment for the banner to disappear
                                await asyncio.sleep(1)

                                # Check if the banner is gone
                                banner_still_visible = await page.query_selector('#consent-banner')
                                if not banner_still_visible:
                                    logger.info('Banner successfully dismissed')
                                    break
                        except Exception as button_error:
                            logger.warning(f'Failed to click button {selector}: {button_error}')

                    # If button clicking didn't work, try JavaScript approach as a fallback
                    banner_still_visible = await page.query_selector('#consent-banner')
                    if banner_still_visible:
                        logger.info('Attempting to remove banner via JavaScript')
                        try:
                            # Try to remove the banner using JavaScript
                            await page.evaluate("""() => {
                                const banner = document.getElementById('consent-banner');
                                if (banner) banner.remove();
                                return true;
                            }""")
                            logger.info('Removed banner using JavaScript')
                        except Exception as js_error:
                            logger.warning(f'Failed to remove banner via JavaScript: {js_error}')
            except Exception as banner_error:
                logger.warning(f'Error handling consent banner: {banner_error}')

            # Progressive wait strategy - try multiple conditions in sequence
            # Define selectors to try in order of preference
            selectors = [
                '.provider-docs-menu-content',
                'nav',
                '.docs-nav',
                'aside',
                'ul.nav',
                'div[role="navigation"]',
            ]

            # Try each selector with a short timeout
            for selector in selectors:
                try:
                    logger.info(f'Trying to locate element with selector: {selector}')
                    await page.wait_for_selector(selector, timeout=5000)
                    logger.info(f'Found element with selector: {selector}')
                    break
                except Exception as se:
                    logger.warning(f"Selector '{selector}' not found: {se}")

            # Extract the HTML content after JS rendering
            logger.info('Extracting page content')
            content = await page.content()

            # Save HTML for debugging using tempfile for security
            with tempfile.NamedTemporaryFile(
                prefix='terraform_awscc_debug_playwright_', suffix='.html', mode='w', delete=False
            ) as temp_file:
                temp_file.write(content)
                debug_file_path = temp_file.name
            logger.debug(f'Saved rendered HTML content to {debug_file_path}')

            # Parse the HTML
            soup = BeautifulSoup(content, 'html.parser')

            # First try the specific provider-docs-menu-content selector
            menu_content = soup.select_one('.provider-docs-menu-content')

            if not menu_content:
                logger.warning(
                    "Couldn't find the .provider-docs-menu-content element, trying alternatives"
                )

                # Try each selector that might contain the menu
                for selector in selectors:
                    menu_content = soup.select_one(selector)
                    if menu_content:
                        logger.info(f'Found menu content with selector: {selector}')
                        break

                # If still not found, look for any substantial navigation
                if not menu_content:
                    logger.warning("Still couldn't find navigation using standard selectors")

                    # Try to find any element with many links as a potential menu
                    potential_menus = []
                    for elem in safe_find_all(soup, ['div', 'nav', 'ul']):
                        links = safe_find_all(elem, 'a')
                        if len(links) > 10:  # Any element with many links might be navigation
                            potential_menus.append((elem, len(links)))

                    # Sort by number of links, highest first
                    potential_menus.sort(key=lambda x: x[1], reverse=True)

                    if potential_menus:
                        menu_content = potential_menus[0][0]
                        logger.info(f'Using element with {potential_menus[0][1]} links as menu')

            # If we still have nothing, use fallback
            if not menu_content:
                logger.error("Couldn't find any navigation element, using fallback data")
                await browser.close()
                return {'categories': get_fallback_resource_data(), 'version': 'unknown'}

            # Find all category titles (excluding 'guides' and 'functions')
            category_titles = menu_content.select('.menu-list-category-link-title')

            if not category_titles:
                logger.error("Couldn't find any .menu-list-category-link-title elements")
                await browser.close()
                return {'categories': get_fallback_resource_data(), 'version': 'unknown'}

            logger.info(f'Found {len(category_titles)} category titles')

            # First collect all categories that we need to process
            categories_to_process = []
            for category_el in category_titles:
                category_name = category_el.get_text(strip=True)

                # Skip non-service entries like 'Guides' and 'Functions'
                if category_name.lower() in ['guides', 'functions', 'awscc provider']:
                    logger.debug(f'Skipping category: {category_name}')
                    continue

                logger.debug(f'Will process category: {category_name}')
                categories_to_process.append((category_name, category_el))

                # Initialize category entry
                categories[category_name] = {'resources': [], 'data_sources': []}

            # Process a smaller set of categories if there are too many (for testing/development)
            MAX_CATEGORIES = int(os.environ.get('MAX_CATEGORIES', '999'))
            if len(categories_to_process) > MAX_CATEGORIES:
                logger.info(
                    f'Limiting to {MAX_CATEGORIES} categories (from {len(categories_to_process)})'
                )
                categories_to_process = categories_to_process[:MAX_CATEGORIES]

            logger.info(
                f'Processing {len(categories_to_process)} categories with click interaction'
            )

            # Now process each category by clicking on it first
            for category_idx, (category_name, category_el) in enumerate(categories_to_process):
                try:
                    # Get the DOM path or some identifier for this category
                    # Try to find a unique identifier for the category to click on
                    # First, try to get the href attribute from the parent <a> tag
                    href = None
                    parent_a = category_el.parent
                    if parent_a and parent_a.name == 'a':
                        href = parent_a.get('href')

                    logger.info(
                        f'[{category_idx + 1}/{len(categories_to_process)}] Clicking on category: {category_name}'
                    )

                    # Handle potential cookie consent banner interference
                    try:
                        # Check if banner reappeared
                        consent_banner = await page.query_selector('#consent-banner')
                        if consent_banner:
                            logger.info(
                                'Cookie consent banner detected again, removing via JavaScript'
                            )
                            await page.evaluate("""() => {
                                const banner = document.getElementById('consent-banner');
                                if (banner) banner.remove();
                                return true;
                            }""")
                    except Exception:
                        pass  # Ignore errors in this extra banner check

                    # Click with increased timeout and multiple attempts
                    click_success = False
                    click_attempts = 0
                    max_attempts = 3

                    while not click_success and click_attempts < max_attempts:
                        click_attempts += 1
                        try:
                            if href:
                                # If we have an href, use that to locate the element
                                try:
                                    selector = f"a[href='{href}']"
                                    await page.click(selector, timeout=8000)  # Increased timeout
                                    logger.debug(
                                        f'Clicked category using href selector: {selector}'
                                    )
                                    click_success = True
                                except Exception as click_error:
                                    logger.warning(
                                        f'Failed to click using href, trying text: {click_error}'
                                    )
                                    # If that fails, try to click by text content
                                    escaped_name = category_name.replace("'", "\\'")
                                    await page.click(
                                        f"text='{escaped_name}'", timeout=8000
                                    )  # Increased timeout
                                    click_success = True
                            else:
                                # Otherwise try to click by text content
                                escaped_name = category_name.replace("'", "\\'")
                                await page.click(
                                    f"text='{escaped_name}'", timeout=8000
                                )  # Increased timeout
                                click_success = True

                        except Exception as click_error:
                            logger.warning(
                                f'Click attempt {click_attempts} failed for {category_name}: {click_error}'
                            )
                            if click_attempts >= max_attempts:
                                logger.error(
                                    f'Failed to click category {category_name} after {max_attempts} attempts'
                                )
                                # Don't break the loop, continue with next category
                                raise click_error

                            # Try removing any overlays before next attempt
                            try:
                                await page.evaluate("""() => {
                                    // Remove common overlay patterns
                                    document.querySelectorAll('[id*="banner"],[id*="overlay"],[id*="popup"],[class*="banner"],[class*="overlay"],[class*="popup"]')
                                        .forEach(el => el.remove());
                                    return true;
                                }""")
                                await asyncio.sleep(0.5)  # Brief pause between attempts
                            except Exception:
                                pass  # Ignore errors in overlay removal

                    # Wait briefly for content to load
                    await asyncio.sleep(0.3)

                    # Extract resources and data sources from the now-expanded category
                    # We need to use the HTML structure to locate the specific sections for this category
                    try:
                        # Get the updated HTML after clicking
                        current_html = await page.content()
                        current_soup = BeautifulSoup(current_html, 'html.parser')

                        resource_count = 0
                        data_source_count = 0

                        # Find the clicked category element in the updated DOM
                        # This is important because the structure changes after clicking
                        # First, find the category span by its text
                        category_spans = safe_find_all(
                            current_soup, 'span', class_='menu-list-category-link-title'
                        )
                        clicked_category_span = None
                        for span in category_spans:
                            if safe_get_text(span, strip=True) == category_name:
                                clicked_category_span = span
                                break

                        if not clicked_category_span:
                            logger.warning(
                                f'Could not find clicked category {category_name} in updated DOM'
                            )
                            continue

                        # Navigate up to find the parent LI, which contains all content for this category
                        parent_li = ensure_tag(clicked_category_span.find_parent('li'))
                        if not parent_li:
                            logger.warning(
                                f'Could not find parent LI for category {category_name}'
                            )
                            continue

                        # Find the ul.menu-list that contains both Resources and Data Sources sections
                        category_menu_list = safe_find(parent_li, 'ul', class_='menu-list')
                        if not category_menu_list:
                            logger.warning(
                                f'Could not find menu-list for category {category_name}'
                            )
                            continue

                        # Process Resources section
                        # Find the span with text "Resources"
                        resource_spans = safe_find_all(
                            category_menu_list, 'span', class_='menu-list-category-link-title'
                        )
                        resource_section = None
                        for span in resource_spans:
                            if safe_get_text(span, strip=True) == 'Resources':
                                resource_section_li = ensure_tag(span.find_parent('li'))
                                if resource_section_li:
                                    resource_section = safe_find(
                                        resource_section_li, 'ul', class_='menu-list'
                                    )
                                break

                        # If we can't find the Resources section using the span approach,
                        # try alternative methods
                        if not resource_section:
                            # Look for any UL that might contain resource links
                            potential_resource_sections = safe_find_all(category_menu_list, 'ul')
                            for ul in potential_resource_sections:
                                # Check if this UL contains links that look like resources
                                links = safe_find_all(ul, 'a')
                                for link in links:
                                    link_text = safe_get_text(link, strip=True)
                                    # AWSCC resources typically start with "awscc_"
                                    if (
                                        isinstance(link_text, str)
                                        and link_text.startswith('awscc_')
                                        and '_data_' not in link_text.lower()
                                    ):
                                        resource_section = ul
                                        break
                                if resource_section:
                                    break

                        # Extract resources
                        if resource_section:
                            # Try both menu-list-link class and direct a tags
                            resource_links = safe_find_all(
                                resource_section, 'li', class_='menu-list-link'
                            )

                            # If not resource_links, try direct a tags
                            if not resource_links:
                                resource_links = safe_find_all(resource_section, 'a')

                            for item in resource_links:
                                # If item is a link itself (a tag)
                                if isinstance(item, Tag) and item.name == 'a':
                                    link = item
                                else:
                                    # If item is a container (li), find the link inside
                                    link = safe_find(item, 'a')

                                if not link:
                                    continue

                                href = link.get('href') if isinstance(link, Tag) else None
                                if not href:
                                    continue

                                link_text = safe_get_text(link, strip=True)
                                if not link_text:
                                    continue

                                # Skip if this doesn't look like an AWSCC resource
                                if not isinstance(link_text, str) or not link_text.startswith(
                                    'awscc_'
                                ):
                                    continue

                                # Skip data sources (they'll be handled separately)
                                if isinstance(link_text, str) and '_data_' in link_text.lower():
                                    continue

                                # Complete the URL if it's a relative path
                                full_url = (
                                    f'https://registry.terraform.io{href}'
                                    if isinstance(href, str) and href.startswith('/')
                                    else href
                                )

                                # Add to resources
                                resource = {'name': link_text, 'url': full_url, 'type': 'resource'}

                                categories[category_name]['resources'].append(resource)
                                resource_count += 1

                        # Process Data Sources section
                        # Find the span with text "Data Sources"
                        data_spans = safe_find_all(
                            category_menu_list, 'span', class_='menu-list-category-link-title'
                        )
                        data_section = None
                        for span in data_spans:
                            if safe_get_text(span, strip=True) == 'Data Sources':
                                data_section_li = ensure_tag(span.find_parent('li'))
                                if data_section_li:
                                    data_section = safe_find(
                                        data_section_li, 'ul', class_='menu-list'
                                    )
                                break

                        # If we can't find the Data Sources section using the span approach,
                        # try alternative methods
                        if not data_section:
                            # Look for any UL that might contain data source links
                            potential_data_sections = safe_find_all(category_menu_list, 'ul')
                            for ul in potential_data_sections:
                                # Check if this UL contains links that look like data sources
                                links = safe_find_all(ul, 'a')
                                for link in links:
                                    link_text = safe_get_text(link, strip=True)
                                    href_attr = (
                                        link.get('href', '') if isinstance(link, Tag) else ''
                                    )

                                    # Data sources typically have "data" in the URL or name
                                    if (
                                        isinstance(link_text, str)
                                        and link_text.startswith('awscc_')
                                        and (
                                            (
                                                isinstance(href_attr, str)
                                                and 'data' in href_attr.lower()
                                            )
                                            or (
                                                isinstance(link_text, str)
                                                and 'data' in link_text.lower()
                                            )
                                        )
                                    ):
                                        data_section = ul
                                        break
                                if data_section:
                                    break

                        # Extract data sources
                        if data_section:
                            # Try both menu-list-link class and direct a tags
                            data_links = safe_find_all(data_section, 'li', class_='menu-list-link')

                            # If no menu-list-link items found, try direct a tags
                            if not data_links:
                                data_links = safe_find_all(data_section, 'a')

                            for item in data_links:
                                # If item is a link itself (a tag)
                                if isinstance(item, Tag) and item.name == 'a':
                                    link = item
                                else:
                                    # If item is a container (li), find the link inside
                                    link = safe_find(item, 'a')

                                if not link:
                                    continue

                                href = link.get('href') if isinstance(link, Tag) else None
                                if not href:
                                    continue

                                link_text = safe_get_text(link, strip=True)
                                if not link_text:
                                    continue

                                # Skip if this doesn't look like an AWSCC data source
                                if not isinstance(link_text, str) or not link_text.startswith(
                                    'awscc_'
                                ):
                                    continue

                                # Make sure it's a data source (contains "data" in URL or name)
                                if not (
                                    (isinstance(href, str) and 'data' in href.lower())
                                    or (isinstance(link_text, str) and 'data' in link_text.lower())
                                ):
                                    continue

                                # Complete the URL if it's a relative path
                                full_url = (
                                    f'https://registry.terraform.io{href}'
                                    if isinstance(href, str) and href.startswith('/')
                                    else href
                                )

                                # Add to data sources
                                data_source = {
                                    'name': link_text,
                                    'url': full_url,
                                    'type': 'data_source',
                                }

                                categories[category_name]['data_sources'].append(data_source)
                                data_source_count += 1

                        # If we still haven't found any resources or data sources,
                        # try a more aggressive approach by looking at all links in the category
                        if resource_count == 0 and data_source_count == 0:
                            all_links = safe_find_all(category_menu_list, 'a')
                            for link in all_links:
                                href = link.get('href', '') if isinstance(link, Tag) else ''
                                link_text = safe_get_text(link, strip=True)

                                if not isinstance(link_text, str) or not link_text.startswith(
                                    'awscc_'
                                ):
                                    continue

                                # Complete the URL if it's a relative path
                                full_url = (
                                    f'https://registry.terraform.io{href}'
                                    if isinstance(href, str) and href.startswith('/')
                                    else href
                                )

                                # Determine if it's a resource or data source based on URL/name
                                if isinstance(href, str) and (
                                    'data' in href.lower() or 'data-source' in href.lower()
                                ):
                                    data_source = {
                                        'name': link_text,
                                        'url': full_url,
                                        'type': 'data_source',
                                    }
                                    categories[category_name]['data_sources'].append(data_source)
                                    data_source_count += 1
                                else:
                                    resource = {
                                        'name': link_text,
                                        'url': full_url,
                                        'type': 'resource',
                                    }
                                    categories[category_name]['resources'].append(resource)
                                    resource_count += 1

                        logger.info(
                            f'Category {category_name}: found {resource_count} resources, {data_source_count} data sources'
                        )

                    except Exception as extract_error:
                        logger.error(
                            f'Error extracting resources for {category_name}: {extract_error}'
                        )

                except Exception as click_error:
                    logger.warning(
                        f'Error interacting with category {category_name}: {click_error}'
                    )

            # Close the browser
            await browser.close()

        # Count statistics for logging
        service_count = len(categories)
        resource_count = sum(len(cat['resources']) for cat in categories.values())
        data_source_count = sum(len(cat['data_sources']) for cat in categories.values())

        duration = time.time() - start_time
        logger.info(
            f'Extracted {service_count} service categories with {resource_count} resources and {data_source_count} data sources in {duration:.2f} seconds'
        )

        # Return the structure if we have data
        if service_count > 0:
            return {'categories': categories, 'version': provider_version}
        else:
            logger.warning('No categories found, using fallback data')
            return {'categories': get_fallback_resource_data(), 'version': 'unknown'}

    except Exception as e:
        logger.error(f'Error extracting AWSCC provider resources: {str(e)}')
        # Return fallback data in case of error
        return {'categories': get_fallback_resource_data(), 'version': 'unknown'}


def get_fallback_resource_data():
    """Provide fallback resource data in case the scraping fails.

    Returns:
        A dictionary with pre-defined AWSCC resources and data sources
    """
    logger.warning('Using pre-defined resource structure as fallback')

    # The AWSCC provider has a different structure than the AWS provider
    # It has two main categories: Resources and Data Sources
    categories = {
        'Resources': {
            'resources': [
                {
                    'name': 'awscc_accessanalyzer_analyzer',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/accessanalyzer_analyzer',
                    'type': 'resource',
                },
                {
                    'name': 'awscc_acmpca_certificate',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_certificate',
                    'type': 'resource',
                },
                {
                    'name': 'awscc_acmpca_certificate_authority',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_certificate_authority',
                    'type': 'resource',
                },
                {
                    'name': 'awscc_acmpca_certificate_authority_activation',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_certificate_authority_activation',
                    'type': 'resource',
                },
                {
                    'name': 'awscc_acmpca_permission',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/resources/acmpca_permission',
                    'type': 'resource',
                },
                # Add more resources as needed
            ],
            'data_sources': [],
        },
        'Data Sources': {
            'resources': [],
            'data_sources': [
                {
                    'name': 'awscc_accessanalyzer_analyzer',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/data-sources/accessanalyzer_analyzer',
                    'type': 'data_source',
                },
                {
                    'name': 'awscc_accessanalyzer_analyzers',
                    'url': 'https://registry.terraform.io/providers/hashicorp/awscc/latest/docs/data-sources/accessanalyzer_analyzers',
                    'type': 'data_source',
                },
                # Add more data sources as needed
            ],
        },
    }
    return categories


def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Generate AWSCC provider resources markdown for the Terraform Expert MCP server.'
    )
    parser.add_argument(
        '--max-categories',
        type=int,
        default=999,
        help='Limit to N categories (default: all)',
    )
    parser.add_argument(
        '--output',
        type=Path,
        default=DEFAULT_OUTPUT_PATH,
        help=f'Output file path (default: {DEFAULT_OUTPUT_PATH})',
    )
    parser.add_argument(
        '--no-fallback',
        action='store_true',
        help="Don't use fallback data if scraping fails",
    )
    return parser.parse_args()


async def main():
    """Main entry point for the script."""
    start_time = datetime.now()

    # Parse command line arguments
    args = parse_arguments()

    print('Generating AWSCC provider resources markdown...')
    print(f'Output path: {args.output}')
    print(f'Max categories: {args.max_categories if args.max_categories < 999 else "all"}')

    # Set environment variable for max categories
    os.environ['MAX_CATEGORIES'] = str(args.max_categories)

    # Set environment variable for fallback behavior
    if args.no_fallback:
        os.environ['USE_PLAYWRIGHT'] = '1'
        print('Using live scraping without fallback')

    try:
        # Fetch AWSCC provider data using the existing implementation
        result = await fetch_awscc_provider_page()

        # Extract categories and version
        if isinstance(result, dict) and 'categories' in result and 'version' in result:
            categories = result['categories']
            provider_version = result.get('version', 'unknown')
        else:
            # Handle backward compatibility with older API
            categories = result
            provider_version = 'unknown'

        # Sort categories alphabetically
        sorted_categories = sorted(categories.keys())

        # Count totals
        total_resources = sum(len(cat['resources']) for cat in categories.values())
        total_data_sources = sum(len(cat['data_sources']) for cat in categories.values())

        print(
            f'Found {len(categories)} categories, {total_resources} resources, and {total_data_sources} data sources'
        )

        # Generate markdown
        markdown = []
        markdown.append('# AWSCC Provider Resources Listing')
        markdown.append(f'\nAWSCC Provider Version: {provider_version}')
        markdown.append(f'\nLast updated: {datetime.now().strftime("%B %d, %Y %H:%M:%S")}')
        markdown.append(
            f'\nFound {total_resources} resources and {total_data_sources} data sources across {len(categories)} AWSCC service categories.\n'
        )

        # Generate table of contents
        # markdown.append('## Table of Contents')
        # for category in sorted_categories:
        #     sanitized_category = (
        #         category.replace(' ', '-').replace('(', '').replace(')', '').lower()
        #     )
        #     markdown.append(f'- [{category}](#{sanitized_category})')
        # markdown.append('')

        # Generate content for each category
        for category in sorted_categories:
            cat_data = categories[category]
            sanitized_heading = category.replace('(', '').replace(')', '')

            markdown.append(f'## {sanitized_heading}')

            resource_count = len(cat_data['resources'])
            data_source_count = len(cat_data['data_sources'])

            # Add category summary
            markdown.append(
                f'\n*{resource_count} resources and {data_source_count} data sources*\n'
            )

            # Add resources section if available
            if cat_data['resources']:
                markdown.append('### Resources')
                for resource in sorted(cat_data['resources'], key=lambda x: x['name']):
                    markdown.append(f'- [{resource["name"]}]({resource["url"]})')

            # Add data sources section if available
            if cat_data['data_sources']:
                markdown.append('\n### Data Sources')
                for data_source in sorted(cat_data['data_sources'], key=lambda x: x['name']):
                    markdown.append(f'- [{data_source["name"]}]({data_source["url"]})')

            markdown.append('')  # Add blank line between categories

        # Add generation metadata at the end
        duration = datetime.now() - start_time
        markdown.append('---')
        markdown.append(
            '*This document was generated automatically by the AWSCC Provider Resources Generator script.*'
        )
        markdown.append(f'*Generation time: {duration.total_seconds():.2f} seconds*')

        # Ensure directory exists
        args.output.parent.mkdir(parents=True, exist_ok=True)

        # Write markdown to output file
        with open(args.output, 'w') as f:
            f.write('\n'.join(markdown))

        print(f'Successfully generated markdown file at: {args.output}')
        print(f'Generation completed in {duration.total_seconds():.2f} seconds')
        return 0

    except Exception as e:
        print(f'Error generating AWSCC provider resources: {str(e)}', file=sys.stderr)
        return 1


if __name__ == '__main__':
    sys.exit(asyncio.run(main()))
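Per the module docstring, the generator is run directly, for example `python generate_awscc_provider_resources.py --max-categories 5 --output /tmp/awscc.md`, and the code reads two environment variables: `USE_PLAYWRIGHT` (set to 0 to skip the live browser and use the bundled fallback data) and `MAX_CATEGORIES`. The sketch below is not part of the package; it assumes the `scripts/` directory is on `sys.path` and relies only on the documented return shape of `fetch_awscc_provider_page`, a dict with 'categories' and 'version' keys.

```python
# Minimal consumer sketch (assumed usage, not shipped in the wheel): summarize what the
# scraper, or its fallback, returns.
import asyncio
import os

# Must be set before importing the module, since USE_PLAYWRIGHT is read at import time.
os.environ['USE_PLAYWRIGHT'] = '0'

from generate_awscc_provider_resources import fetch_awscc_provider_page  # assumes scripts/ on sys.path

result = asyncio.run(fetch_awscc_provider_page())
print(f"AWSCC provider version: {result['version']}")
for category, entries in sorted(result['categories'].items()):
    print(f"{category}: {len(entries['resources'])} resources, "
          f"{len(entries['data_sources'])} data sources")
```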