skill_seekers-2.7.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
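The largest file in this release is `skill_seekers/cli/doc_scraper.py` (+2332 lines), shown in the diff below; it contains the documentation scraper and skill builder. For orientation, here is a minimal sketch of the configuration dict that its `DocToSkillConverter` class reads. Every key corresponds to a `config.get(...)` call visible in the diff; the concrete values are illustrative only, not a config shipped with the package.

```python
# Illustrative configuration for DocToSkillConverter (values are made up;
# keys mirror the config.get() calls in skill_seekers/cli/doc_scraper.py).
from skill_seekers.cli.doc_scraper import DocToSkillConverter

config = {
    "name": "react",                            # output lands in output/react*/
    "base_url": "https://react.dev/",           # crawl root; candidate URLs must start with this
    "start_urls": ["https://react.dev/learn"],  # optional BFS seeds (defaults to [base_url])
    "description": "Use when building user interfaces with React",  # inferred from docs if omitted
    "url_patterns": {
        "include": ["/learn", "/reference"],    # substring filters checked by is_valid_url()
        "exclude": ["/blog", "/community"],
    },
    "selectors": {
        "title": "title",
        "main_content": 'div[role="main"]',
        "code_blocks": "pre code",
    },
    "max_pages": 500,        # None or -1 means unlimited
    "rate_limit": 0.5,       # seconds slept between requests
    "workers": 4,            # >1 enables thread-based parallel scraping
    "async_mode": False,     # True routes scrape_all() to the httpx/asyncio path
    "skip_llms_txt": False,  # True disables the llms.txt fast path
    "checkpoint": {"enabled": True, "interval": 50},
}

# Dry-run preview: no output directories are created and roughly the first 20 URLs are listed.
converter = DocToSkillConverter(config, dry_run=True)
converter.scrape_all()
```

In normal use the same dict lives in a JSON file and is passed on the command line, e.g. `skill-seekers scrape --config configs/react.json`, as the module docstring below shows.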
@@ -0,0 +1,2332 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Documentation to Claude Skill Converter
4
+ Single tool to scrape any documentation and create high-quality Claude skills.
5
+
6
+ Usage:
7
+ skill-seekers scrape --interactive
8
+ skill-seekers scrape --config configs/godot.json
9
+ skill-seekers scrape --url https://react.dev/ --name react
10
+ """
11
+
12
+ import argparse
13
+ import asyncio
14
+ import hashlib
15
+ import json
16
+ import logging
17
+ import os
18
+ import re
19
+ import sys
20
+ import time
21
+ from collections import defaultdict, deque
22
+ from pathlib import Path
23
+ from typing import Any, Optional
24
+ from urllib.parse import urljoin, urlparse
25
+
26
+ import httpx
27
+ import requests
28
+ from bs4 import BeautifulSoup
29
+
30
+ # Add parent directory to path for imports when run as script
31
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
32
+
33
+ from skill_seekers.cli.constants import (
34
+ CONTENT_PREVIEW_LENGTH,
35
+ DEFAULT_ASYNC_MODE,
36
+ DEFAULT_CHECKPOINT_INTERVAL,
37
+ DEFAULT_MAX_PAGES,
38
+ DEFAULT_RATE_LIMIT,
39
+ MAX_PAGES_WARNING_THRESHOLD,
40
+ MIN_CATEGORIZATION_SCORE,
41
+ )
42
+ from skill_seekers.cli.language_detector import LanguageDetector
43
+ from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
44
+ from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
45
+ from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
46
+
47
+ # Configure logging
48
+ logger = logging.getLogger(__name__)
49
+
50
+
51
+ def setup_logging(verbose: bool = False, quiet: bool = False) -> None:
52
+ """Configure logging based on verbosity level.
53
+
54
+ Args:
55
+ verbose: Enable DEBUG level logging
56
+ quiet: Enable WARNING level logging only
57
+ """
58
+ if quiet:
59
+ level = logging.WARNING
60
+ elif verbose:
61
+ level = logging.DEBUG
62
+ else:
63
+ level = logging.INFO
64
+
65
+ logging.basicConfig(level=level, format="%(message)s", force=True)
66
+
67
+
68
+ def infer_description_from_docs(
69
+ base_url: str, first_page_content: str | None = None, name: str = ""
70
+ ) -> str:
71
+ """
72
+ Infer skill description from documentation metadata or first page content.
73
+
74
+ Tries multiple strategies:
75
+ 1. Extract meta description tag from first page
76
+ 2. Extract first meaningful paragraph from content
77
+ 3. Fall back to improved template
78
+
79
+ Args:
80
+ base_url: Documentation base URL
81
+ first_page_content: HTML content of first page (optional)
82
+ name: Skill name
83
+
84
+ Returns:
85
+ Description string suitable for "Use when..." format
86
+ """
87
+ # If we have first page content, try to extract description
88
+ if first_page_content:
89
+ try:
90
+ soup = BeautifulSoup(first_page_content, "html.parser")
91
+
92
+ # Strategy 1: Try meta description tag
93
+ meta_desc = soup.find("meta", {"name": "description"})
94
+ if meta_desc and meta_desc.get("content"):
95
+ desc = meta_desc["content"].strip()
96
+ if len(desc) > 20: # Meaningful length
97
+ # Clean and format
98
+ if len(desc) > 150:
99
+ desc = desc[:147] + "..."
100
+ return f"Use when {desc.lower()}"
101
+
102
+ # Strategy 2: Try OpenGraph description
103
+ og_desc = soup.find("meta", {"property": "og:description"})
104
+ if og_desc and og_desc.get("content"):
105
+ desc = og_desc["content"].strip()
106
+ if len(desc) > 20:
107
+ if len(desc) > 150:
108
+ desc = desc[:147] + "..."
109
+ return f"Use when {desc.lower()}"
110
+
111
+ # Strategy 3: Extract first meaningful paragraph from main content
112
+ # Look for common documentation main content areas
113
+ main_content = None
114
+ for selector in [
115
+ "article",
116
+ "main",
117
+ 'div[role="main"]',
118
+ "div.content",
119
+ "div.doc-content",
120
+ ]:
121
+ main_content = soup.select_one(selector)
122
+ if main_content:
123
+ break
124
+
125
+ if main_content:
126
+ # Find first paragraph
127
+ for p in main_content.find_all("p", limit=5):
128
+ text = p.get_text().strip()
129
+ # Skip empty, very short, or navigation-like paragraphs
130
+ if len(text) > 30 and not any(
131
+ skip in text.lower()
132
+ for skip in ["table of contents", "on this page", "navigation"]
133
+ ):
134
+ # Clean and format
135
+ if len(text) > 150:
136
+ text = text[:147] + "..."
137
+ return f"Use when working with {text.lower()}"
138
+
139
+ except Exception as e:
140
+ logger.debug(f"Could not infer description from page content: {e}")
141
+
142
+ # Improved fallback template
143
+ return (
144
+ f"Use when working with {name}"
145
+ if name
146
+ else f"Use when working with documentation at {urlparse(base_url).netloc}"
147
+ )
148
+
149
+
150
+ class DocToSkillConverter:
151
+ def __init__(self, config: dict[str, Any], dry_run: bool = False, resume: bool = False) -> None:
152
+ self.config = config
153
+ self.name = config["name"]
154
+ self.base_url = config["base_url"]
155
+ self.dry_run = dry_run
156
+ self.resume = resume
157
+
158
+ # Paths
159
+ self.data_dir = f"output/{self.name}_data"
160
+ self.skill_dir = f"output/{self.name}"
161
+ self.checkpoint_file = f"{self.data_dir}/checkpoint.json"
162
+
163
+ # Checkpoint config
164
+ checkpoint_config = config.get("checkpoint", {})
165
+ self.checkpoint_enabled = checkpoint_config.get("enabled", False)
166
+ self.checkpoint_interval = checkpoint_config.get("interval", DEFAULT_CHECKPOINT_INTERVAL)
167
+
168
+ # llms.txt detection state
169
+ skip_llms_txt_value = config.get("skip_llms_txt", False)
170
+ if not isinstance(skip_llms_txt_value, bool):
171
+ logger.warning(
172
+ "Invalid value for 'skip_llms_txt': %r (expected bool). Defaulting to False.",
173
+ skip_llms_txt_value,
174
+ )
175
+ self.skip_llms_txt = False
176
+ else:
177
+ self.skip_llms_txt = skip_llms_txt_value
178
+ self.llms_txt_detected = False
179
+ self.llms_txt_variant = None
180
+ self.llms_txt_variants: list[str] = [] # Track all downloaded variants
181
+
182
+ # Parallel scraping config
183
+ self.workers = config.get("workers", 1)
184
+ self.async_mode = config.get("async_mode", DEFAULT_ASYNC_MODE)
185
+
186
+ # State
187
+ self.visited_urls: set[str] = set()
188
+ # Support multiple starting URLs
189
+ start_urls = config.get("start_urls", [self.base_url])
190
+ self.pending_urls = deque(start_urls)
191
+ self.pages: list[dict[str, Any]] = []
192
+ self.pages_scraped = 0
193
+
194
+ # Language detection
195
+ self.language_detector = LanguageDetector(min_confidence=0.15)
196
+
197
+ # Thread-safe lock for parallel scraping
198
+ if self.workers > 1:
199
+ import threading
200
+
201
+ self.lock = threading.Lock()
202
+
203
+ # Create directories (unless dry-run)
204
+ if not dry_run:
205
+ os.makedirs(f"{self.data_dir}/pages", exist_ok=True)
206
+ os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
207
+ os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
208
+ os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
209
+
210
+ # Load checkpoint if resuming
211
+ if resume and not dry_run:
212
+ self.load_checkpoint()
213
+
214
+ def is_valid_url(self, url: str) -> bool:
215
+ """Check if URL should be scraped based on patterns.
216
+
217
+ Args:
218
+ url (str): URL to validate
219
+
220
+ Returns:
221
+ bool: True if URL matches include patterns and doesn't match exclude patterns
222
+ """
223
+ if not url.startswith(self.base_url):
224
+ return False
225
+
226
+ # Include patterns
227
+ includes = self.config.get("url_patterns", {}).get("include", [])
228
+ if includes and not any(pattern in url for pattern in includes):
229
+ return False
230
+
231
+ # Exclude patterns
232
+ excludes = self.config.get("url_patterns", {}).get("exclude", [])
233
+ return not any(pattern in url for pattern in excludes)
234
+
235
+ def save_checkpoint(self) -> None:
236
+ """Save progress checkpoint"""
237
+ if not self.checkpoint_enabled or self.dry_run:
238
+ return
239
+
240
+ checkpoint_data = {
241
+ "config": self.config,
242
+ "visited_urls": list(self.visited_urls),
243
+ "pending_urls": list(self.pending_urls),
244
+ "pages_scraped": self.pages_scraped,
245
+ "last_updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
246
+ "checkpoint_interval": self.checkpoint_interval,
247
+ }
248
+
249
+ try:
250
+ with open(self.checkpoint_file, "w", encoding="utf-8") as f:
251
+ json.dump(checkpoint_data, f, indent=2)
252
+ logger.info(" 💾 Checkpoint saved (%d pages)", self.pages_scraped)
253
+ except Exception as e:
254
+ logger.warning(" ⚠️ Failed to save checkpoint: %s", e)
255
+
256
+ def load_checkpoint(self) -> None:
257
+ """Load progress from checkpoint"""
258
+ if not os.path.exists(self.checkpoint_file):
259
+ logger.info("ℹ️ No checkpoint found, starting fresh")
260
+ return
261
+
262
+ try:
263
+ with open(self.checkpoint_file, encoding="utf-8") as f:
264
+ checkpoint_data = json.load(f)
265
+
266
+ self.visited_urls = set(checkpoint_data["visited_urls"])
267
+ self.pending_urls = deque(checkpoint_data["pending_urls"])
268
+ self.pages_scraped = checkpoint_data["pages_scraped"]
269
+
270
+ logger.info("✅ Resumed from checkpoint")
271
+ logger.info(" Pages already scraped: %d", self.pages_scraped)
272
+ logger.info(" URLs visited: %d", len(self.visited_urls))
273
+ logger.info(" URLs pending: %d", len(self.pending_urls))
274
+ logger.info(" Last updated: %s", checkpoint_data["last_updated"])
275
+ logger.info("")
276
+
277
+ except Exception as e:
278
+ logger.warning("⚠️ Failed to load checkpoint: %s", e)
279
+ logger.info(" Starting fresh")
280
+
281
+ def clear_checkpoint(self) -> None:
282
+ """Remove checkpoint file"""
283
+ if os.path.exists(self.checkpoint_file):
284
+ try:
285
+ os.remove(self.checkpoint_file)
286
+ logger.info("✅ Checkpoint cleared")
287
+ except Exception as e:
288
+ logger.warning("⚠️ Failed to clear checkpoint: %s", e)
289
+
290
+ def extract_content(self, soup: Any, url: str) -> dict[str, Any]:
291
+ """Extract content with improved code and pattern detection"""
292
+ page = {
293
+ "url": url,
294
+ "title": "",
295
+ "content": "",
296
+ "headings": [],
297
+ "code_samples": [],
298
+ "patterns": [], # NEW: Extract common patterns
299
+ "links": [],
300
+ }
301
+
302
+ selectors = self.config.get("selectors", {})
303
+
304
+ # Extract title
305
+ title_elem = soup.select_one(selectors.get("title", "title"))
306
+ if title_elem:
307
+ page["title"] = self.clean_text(title_elem.get_text())
308
+
309
+ # Find main content
310
+ main_selector = selectors.get("main_content", 'div[role="main"]')
311
+ main = soup.select_one(main_selector)
312
+
313
+ if not main:
314
+ logger.warning("⚠ No content: %s", url)
315
+ return page
316
+
317
+ # Extract headings with better structure
318
+ for h in main.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
319
+ text = self.clean_text(h.get_text())
320
+ if text:
321
+ page["headings"].append({"level": h.name, "text": text, "id": h.get("id", "")})
322
+
323
+ # Extract code with language detection
324
+ code_selector = selectors.get("code_blocks", "pre code")
325
+ for code_elem in main.select(code_selector):
326
+ code = code_elem.get_text()
327
+ if len(code.strip()) > 10:
328
+ # Try to detect language
329
+ lang = self.detect_language(code_elem, code)
330
+ page["code_samples"].append({"code": code.strip(), "language": lang})
331
+
332
+ # Extract patterns (NEW: common code patterns)
333
+ page["patterns"] = self.extract_patterns(main, page["code_samples"])
334
+
335
+ # Extract paragraphs
336
+ paragraphs = []
337
+ for p in main.find_all("p"):
338
+ text = self.clean_text(p.get_text())
339
+ if text and len(text) > 20: # Skip very short paragraphs
340
+ paragraphs.append(text)
341
+
342
+ page["content"] = "\n\n".join(paragraphs)
343
+
344
+ # Extract links from entire page (not just main content)
345
+ # This allows discovery of navigation links outside the main content area
346
+ for link in soup.find_all("a", href=True):
347
+ href = urljoin(url, link["href"])
348
+ # Strip anchor fragments to avoid treating #anchors as separate pages
349
+ href = href.split("#")[0]
350
+ if self.is_valid_url(href) and href not in page["links"]:
351
+ page["links"].append(href)
352
+
353
+ return page
354
+
355
+ def _extract_markdown_content(self, content: str, url: str) -> dict[str, Any]:
356
+ """Extract structured content from a Markdown file.
357
+
358
+ Parses markdown files from llms.txt URLs to extract:
359
+ - Title from first h1 heading
360
+ - Headings (h2-h6, excluding h1)
361
+ - Code blocks with language detection
362
+ - Internal .md links for BFS crawling
363
+ - Content paragraphs (>20 chars)
364
+
365
+ Auto-detects HTML content and falls back to _extract_html_as_markdown.
366
+
367
+ Args:
368
+ content: Raw markdown content string (or HTML if server returned HTML)
369
+ url: Source URL for resolving relative links
370
+
371
+ Returns:
372
+ Dict with keys:
373
+ - url: str - Source URL
374
+ - title: str - Extracted from first # heading
375
+ - content: str - Paragraphs joined with double newlines
376
+ - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str}
377
+ - code_samples: List[Dict] - {'code': str, 'language': str}
378
+ - links: List[str] - Absolute URLs to other .md files
379
+ - patterns: List - Empty (reserved for future use)
380
+
381
+ Note:
382
+ Only .md links are extracted to avoid client-side rendered HTML pages.
383
+ Anchor fragments (#section) are stripped from links.
384
+ """
385
+ import re
386
+
387
+ # Detect if content is actually HTML (some .md URLs return HTML)
388
+ if content.strip().startswith("<!DOCTYPE") or content.strip().startswith("<html"):
389
+ return self._extract_html_as_markdown(content, url)
390
+
391
+ page = {
392
+ "url": url,
393
+ "title": "",
394
+ "content": "",
395
+ "headings": [],
396
+ "code_samples": [],
397
+ "patterns": [],
398
+ "links": [],
399
+ }
400
+
401
+ lines = content.split("\n")
402
+
403
+ # Extract title from first h1
404
+ for line in lines:
405
+ if line.startswith("# "):
406
+ page["title"] = line[2:].strip()
407
+ break
408
+
409
+ # Extract headings (h2-h6)
410
+ for line in lines:
411
+ match = re.match(r"^(#{2,6})\s+(.+)$", line)
412
+ if match:
413
+ level = len(match.group(1))
414
+ text = match.group(2).strip()
415
+ page["headings"].append(
416
+ {
417
+ "level": f"h{level}",
418
+ "text": text,
419
+ "id": text.lower().replace(" ", "-"),
420
+ }
421
+ )
422
+
423
+ # Extract code blocks with language
424
+ code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
425
+ for lang, code in code_blocks:
426
+ if len(code.strip()) > 10:
427
+ page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})
428
+
429
+ # Extract content (paragraphs)
430
+ content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
431
+ paragraphs = []
432
+ for para in content_no_code.split("\n\n"):
433
+ text = para.strip()
434
+ # Skip headings and short text
435
+ if text and len(text) > 20 and not text.startswith("#"):
436
+ paragraphs.append(text)
437
+ page["content"] = "\n\n".join(paragraphs)
438
+
439
+ # Extract links from markdown (only .md files to avoid client-side rendered HTML pages)
440
+ md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", content)
441
+ for _, href in md_links:
442
+ if href.startswith("http"):
443
+ full_url = href
444
+ elif not href.startswith("#"):
445
+ full_url = urljoin(url, href)
446
+ else:
447
+ continue
448
+ # Strip anchor fragments
449
+ full_url = full_url.split("#")[0]
450
+ # Only include .md URLs to avoid client-side rendered HTML pages
451
+ if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]:
452
+ page["links"].append(full_url)
453
+
454
+ return page
455
+
456
+ def _extract_html_as_markdown(self, html_content: str, url: str) -> dict[str, Any]:
457
+ """Extract content from HTML and convert to markdown-like structure.
458
+
459
+ Fallback method when .md URL returns HTML content instead of markdown.
460
+ Uses BeautifulSoup to extract structured data from HTML elements.
461
+
462
+ Extraction strategy:
463
+ 1. Title from <title> tag
464
+ 2. Main content from <main>, <article>, [role="main"], or <body>
465
+ 3. Headings (h1-h6) with text and id attributes
466
+ 4. Code blocks from <pre><code> or <pre> tags
467
+ 5. Text content from paragraphs
468
+
469
+ Args:
470
+ html_content: Raw HTML content string
471
+ url: Source URL (for reference in result dict)
472
+
473
+ Returns:
474
+ Dict with keys:
475
+ - url: str - Source URL
476
+ - title: str - From <title> tag, cleaned
477
+ - content: str - Text content from main area
478
+ - headings: List[Dict] - {'level': 'h2', 'text': str, 'id': str}
479
+ - code_samples: List[Dict] - {'code': str, 'language': str}
480
+ - links: List - Empty (HTML links not extracted to avoid client-side routes)
481
+ - patterns: List - Empty (reserved for future use)
482
+
483
+ Note:
484
+ Prefers <main> or <article> tags for content area.
485
+ Falls back to <body> if no semantic content container found.
486
+ Language detection uses detect_language() method.
487
+ """
488
+ page = {
489
+ "url": url,
490
+ "title": "",
491
+ "content": "",
492
+ "headings": [],
493
+ "code_samples": [],
494
+ "patterns": [],
495
+ "links": [],
496
+ }
497
+
498
+ soup = BeautifulSoup(html_content, "html.parser")
499
+
500
+ # Try to extract title
501
+ title_elem = soup.select_one("title")
502
+ if title_elem:
503
+ page["title"] = self.clean_text(title_elem.get_text())
504
+
505
+ # Try to find main content area
506
+ main = soup.select_one('main, article, [role="main"], .content')
507
+ if not main:
508
+ main = soup.body if soup.body else soup
509
+
510
+ if main:
511
+ # Extract headings
512
+ for h in main.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
513
+ text = self.clean_text(h.get_text())
514
+ if text:
515
+ page["headings"].append({"level": h.name, "text": text, "id": h.get("id", "")})
516
+
517
+ # Extract code blocks
518
+ for code_elem in main.select("pre code, pre"):
519
+ code = code_elem.get_text()
520
+ if len(code.strip()) > 10:
521
+ lang = self.detect_language(code_elem, code)
522
+ page["code_samples"].append({"code": code.strip(), "language": lang})
523
+
524
+ # Extract paragraphs
525
+ paragraphs = []
526
+ for p in main.find_all("p"):
527
+ text = self.clean_text(p.get_text())
528
+ if text and len(text) > 20:
529
+ paragraphs.append(text)
530
+ page["content"] = "\n\n".join(paragraphs)
531
+
532
+ return page
533
+
534
+ def detect_language(self, elem, code):
535
+ """Detect programming language from code block
536
+
537
+ UPDATED: Now uses confidence-based detection with 20+ languages
538
+ """
539
+ lang, confidence = self.language_detector.detect_from_html(elem, code)
540
+
541
+ # Log low-confidence detections for debugging
542
+ if confidence < 0.5:
543
+ logger.debug(f"Low confidence language detection: {lang} ({confidence:.2f})")
544
+
545
+ return lang # Return string for backward compatibility
546
+
547
+ def extract_patterns(
548
+ self, main: Any, _code_samples: list[dict[str, Any]]
549
+ ) -> list[dict[str, str]]:
550
+ """Extract common coding patterns (NEW FEATURE)"""
551
+ patterns = []
552
+
553
+ # Look for "Example:" or "Pattern:" sections
554
+ for elem in main.find_all(["p", "div"]):
555
+ text = elem.get_text().lower()
556
+ if any(word in text for word in ["example:", "pattern:", "usage:", "typical use"]):
557
+ # Get the code that follows
558
+ next_code = elem.find_next(["pre", "code"])
559
+ if next_code:
560
+ patterns.append(
561
+ {
562
+ "description": self.clean_text(elem.get_text()),
563
+ "code": next_code.get_text().strip(),
564
+ }
565
+ )
566
+
567
+ return patterns[:5] # Limit to 5 most relevant patterns
568
+
569
+ def clean_text(self, text: str) -> str:
570
+ """Clean text content"""
571
+ text = re.sub(r"\s+", " ", text)
572
+ return text.strip()
573
+
574
+ def save_page(self, page: dict[str, Any]) -> None:
575
+ """Save page data (skip pages with empty content)"""
576
+ # Skip pages with empty or very short content
577
+ if not page.get("content") or len(page.get("content", "")) < 50:
578
+ logger.debug("Skipping page with empty/short content: %s", page.get("url", "unknown"))
579
+ return
580
+
581
+ url_hash = hashlib.md5(page["url"].encode()).hexdigest()[:10]
582
+ safe_title = re.sub(r"[^\w\s-]", "", page["title"])[:50]
583
+ safe_title = re.sub(r"[-\s]+", "_", safe_title)
584
+
585
+ filename = f"{safe_title}_{url_hash}.json"
586
+ filepath = os.path.join(self.data_dir, "pages", filename)
587
+
588
+ with open(filepath, "w", encoding="utf-8") as f:
589
+ json.dump(page, f, indent=2, ensure_ascii=False)
590
+
591
+ def scrape_page(self, url: str) -> None:
592
+ """Scrape a single page with thread-safe operations.
593
+
594
+ Args:
595
+ url (str): URL to scrape
596
+
597
+ Returns:
598
+ dict or None: Page data dict on success, None on failure
599
+
600
+ Note:
601
+ Uses threading locks when workers > 1 for thread safety
602
+ Supports both HTML pages and Markdown (.md) files
603
+ """
604
+ try:
605
+ # Scraping part (no lock needed - independent)
606
+ headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
607
+ response = requests.get(url, headers=headers, timeout=30)
608
+ response.raise_for_status()
609
+
610
+ # Check if this is a Markdown file
611
+ if url.endswith(".md") or ".md" in url:
612
+ page = self._extract_markdown_content(response.text, url)
613
+ else:
614
+ soup = BeautifulSoup(response.content, "html.parser")
615
+ page = self.extract_content(soup, url)
616
+
617
+ # Thread-safe operations (lock required)
618
+ if self.workers > 1:
619
+ with self.lock:
620
+ logger.info(" %s", url)
621
+ self.save_page(page)
622
+ self.pages.append(page)
623
+
624
+ # Add new URLs
625
+ for link in page["links"]:
626
+ if link not in self.visited_urls and link not in self.pending_urls:
627
+ self.pending_urls.append(link)
628
+ else:
629
+ # Single-threaded mode (no lock needed)
630
+ logger.info(" %s", url)
631
+ self.save_page(page)
632
+ self.pages.append(page)
633
+
634
+ # Add new URLs
635
+ for link in page["links"]:
636
+ if link not in self.visited_urls and link not in self.pending_urls:
637
+ self.pending_urls.append(link)
638
+
639
+ # Rate limiting
640
+ rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
641
+ if rate_limit > 0:
642
+ time.sleep(rate_limit)
643
+
644
+ except Exception as e:
645
+ if self.workers > 1:
646
+ with self.lock:
647
+ logger.error(" ✗ Error scraping %s: %s: %s", url, type(e).__name__, e)
648
+ else:
649
+ logger.error(" ✗ Error scraping page: %s: %s", type(e).__name__, e)
650
+ logger.error(" URL: %s", url)
651
+
652
+ async def scrape_page_async(
653
+ self, url: str, semaphore: asyncio.Semaphore, client: httpx.AsyncClient
654
+ ) -> None:
655
+ """Scrape a single page asynchronously.
656
+
657
+ Args:
658
+ url: URL to scrape
659
+ semaphore: Asyncio semaphore for concurrency control
660
+ client: Shared httpx AsyncClient for connection pooling
661
+
662
+ Note:
663
+ Uses asyncio.Lock for async-safe operations instead of threading.Lock
664
+ Supports both HTML pages and Markdown (.md) files
665
+ """
666
+ async with semaphore: # Limit concurrent requests
667
+ try:
668
+ # Async HTTP request
669
+ headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
670
+ response = await client.get(url, headers=headers, timeout=30.0)
671
+ response.raise_for_status()
672
+
673
+ # Check if this is a Markdown file
674
+ if url.endswith(".md") or ".md" in url:
675
+ page = self._extract_markdown_content(response.text, url)
676
+ else:
677
+ # BeautifulSoup parsing (still synchronous, but fast)
678
+ soup = BeautifulSoup(response.content, "html.parser")
679
+ page = self.extract_content(soup, url)
680
+
681
+ # Async-safe operations (no lock needed - single event loop)
682
+ logger.info(" %s", url)
683
+ self.save_page(page)
684
+ self.pages.append(page)
685
+
686
+ # Add new URLs
687
+ for link in page["links"]:
688
+ if link not in self.visited_urls and link not in self.pending_urls:
689
+ self.pending_urls.append(link)
690
+
691
+ # Rate limiting
692
+ rate_limit = self.config.get("rate_limit", DEFAULT_RATE_LIMIT)
693
+ if rate_limit > 0:
694
+ await asyncio.sleep(rate_limit)
695
+
696
+ except Exception as e:
697
+ logger.error(" ✗ Error scraping %s: %s: %s", url, type(e).__name__, e)
698
+
699
+ def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
700
+ """
701
+ Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
702
+ URLs are not pre-checked for existence; they are added to the queue directly and validated during crawling.
703
+
704
+ Args:
705
+ urls: List of URLs to process
706
+
707
+ Returns:
708
+ List of .md URLs (not yet validated)
709
+ """
710
+ md_urls = []
711
+
712
+ for url in urls:
713
+ if ".md" in url:
714
+ md_urls.append(url)
715
+ else:
716
+ # Convert to .md format directly, without sending a HEAD request to check that the URL exists
717
+ url = url.rstrip("/")
718
+ md_url = f"{url}/index.html.md"
719
+ md_urls.append(md_url)
720
+
721
+ logger.info(
722
+ " ✓ Converted %d URLs to .md format (will validate during crawl)",
723
+ len(md_urls),
724
+ )
725
+ return md_urls
726
+
727
+ # ORIGINAL _convert_to_md_urls (with HEAD request validation):
728
+ # def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
729
+ # md_urls = []
730
+ # non_md_urls = []
731
+ # for url in urls:
732
+ # if '.md' in url:
733
+ # md_urls.append(url)
734
+ # else:
735
+ # non_md_urls.append(url)
736
+ # if non_md_urls:
737
+ # logger.info(" 🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
738
+ # converted = 0
739
+ # for url in non_md_urls:
740
+ # url = url.rstrip('/')
741
+ # md_url = f"{url}/index.html.md"
742
+ # try:
743
+ # resp = requests.head(md_url, timeout=5, allow_redirects=True)
744
+ # if resp.status_code == 200:
745
+ # md_urls.append(md_url)
746
+ # converted += 1
747
+ # except Exception:
748
+ # pass
749
+ # logger.info(" ✓ Converted %d URLs to .md format", converted)
750
+ # return md_urls
751
+
752
+ def _try_llms_txt(self) -> bool:
753
+ """
754
+ Try to use llms.txt instead of HTML scraping.
755
+ Downloads ALL available variants and stores with .md extension.
756
+
757
+ Returns:
758
+ True if llms.txt was found and processed successfully
759
+ """
760
+ logger.info("\n🔍 Checking for llms.txt at %s...", self.base_url)
761
+
762
+ # Check for explicit config URL first
763
+ explicit_url = self.config.get("llms_txt_url")
764
+ if explicit_url:
765
+ logger.info("\n📌 Using explicit llms_txt_url from config: %s", explicit_url)
766
+
767
+ # Download explicit file first
768
+ downloader = LlmsTxtDownloader(explicit_url)
769
+ content = downloader.download()
770
+
771
+ if content:
772
+ # Save explicit file with proper .md extension
773
+ filename = downloader.get_proper_filename()
774
+ filepath = os.path.join(self.skill_dir, "references", filename)
775
+ os.makedirs(os.path.dirname(filepath), exist_ok=True)
776
+
777
+ with open(filepath, "w", encoding="utf-8") as f:
778
+ f.write(content)
779
+ logger.info(" 💾 Saved %s (%d chars)", filename, len(content))
780
+
781
+ # Also try to detect and download ALL other variants
782
+ detector = LlmsTxtDetector(self.base_url)
783
+ variants = detector.detect_all()
784
+
785
+ if variants:
786
+ logger.info(
787
+ "\n🔍 Found %d total variant(s), downloading remaining...",
788
+ len(variants),
789
+ )
790
+ for variant_info in variants:
791
+ url = variant_info["url"]
792
+ variant = variant_info["variant"]
793
+
794
+ # Skip the explicit one we already downloaded
795
+ if url == explicit_url:
796
+ continue
797
+
798
+ logger.info(" 📥 Downloading %s...", variant)
799
+ extra_downloader = LlmsTxtDownloader(url)
800
+ extra_content = extra_downloader.download()
801
+
802
+ if extra_content:
803
+ extra_filename = extra_downloader.get_proper_filename()
804
+ extra_filepath = os.path.join(
805
+ self.skill_dir, "references", extra_filename
806
+ )
807
+ with open(extra_filepath, "w", encoding="utf-8") as f:
808
+ f.write(extra_content)
809
+ logger.info(
810
+ " ✓ %s (%d chars)",
811
+ extra_filename,
812
+ len(extra_content),
813
+ )
814
+
815
+ # Parse explicit file for skill building
816
+ parser = LlmsTxtParser(content, self.base_url)
817
+
818
+ # Extract URLs from llms.txt and add to pending_urls for BFS crawling
819
+ extracted_urls = parser.extract_urls()
820
+ if extracted_urls:
821
+ # Convert non-.md URLs to .md format by trying /index.html.md suffix
822
+ md_urls = self._convert_to_md_urls(extracted_urls)
823
+ logger.info(
824
+ "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
825
+ len(extracted_urls),
826
+ len(md_urls),
827
+ )
828
+
829
+ # Filter URLs based on url_patterns config
830
+ for url in md_urls:
831
+ if self.is_valid_url(url) and url not in self.visited_urls:
832
+ self.pending_urls.append(url)
833
+
834
+ logger.info(
835
+ " 📋 %d URLs added to crawl queue after filtering",
836
+ len(self.pending_urls),
837
+ )
838
+
839
+ # Return False to trigger HTML scraping with the populated pending_urls
840
+ self.llms_txt_detected = True
841
+ self.llms_txt_variant = "explicit"
842
+ return False # Continue with BFS crawling
843
+
844
+ # Fallback: if no URLs found, use section-based parsing
845
+ pages = parser.parse()
846
+
847
+ if pages:
848
+ for page in pages:
849
+ self.save_page(page)
850
+ self.pages.append(page)
851
+
852
+ self.llms_txt_detected = True
853
+ self.llms_txt_variant = "explicit"
854
+ return True
855
+
856
+ # Auto-detection: Find ALL variants
857
+ detector = LlmsTxtDetector(self.base_url)
858
+ variants = detector.detect_all()
859
+
860
+ if not variants:
861
+ logger.info("ℹ️ No llms.txt found, using HTML scraping")
862
+ return False
863
+
864
+ logger.info("✅ Found %d llms.txt variant(s)", len(variants))
865
+
866
+ # Download ALL variants
867
+ downloaded = {}
868
+ for variant_info in variants:
869
+ url = variant_info["url"]
870
+ variant = variant_info["variant"]
871
+
872
+ logger.info(" 📥 Downloading %s...", variant)
873
+ downloader = LlmsTxtDownloader(url)
874
+ content = downloader.download()
875
+
876
+ if content:
877
+ filename = downloader.get_proper_filename()
878
+ downloaded[variant] = {
879
+ "content": content,
880
+ "filename": filename,
881
+ "size": len(content),
882
+ }
883
+ logger.info(" ✓ %s (%d chars)", filename, len(content))
884
+
885
+ if not downloaded:
886
+ logger.warning("⚠️ Failed to download any variants, falling back to HTML scraping")
887
+ return False
888
+
889
+ # Save ALL variants to references/
890
+ os.makedirs(os.path.join(self.skill_dir, "references"), exist_ok=True)
891
+
892
+ for _variant, data in downloaded.items():
893
+ filepath = os.path.join(self.skill_dir, "references", data["filename"])
894
+ with open(filepath, "w", encoding="utf-8") as f:
895
+ f.write(data["content"])
896
+ logger.info(" 💾 Saved %s", data["filename"])
897
+
898
+ # Parse LARGEST variant for skill building
899
+ largest = max(downloaded.items(), key=lambda x: x[1]["size"])
900
+ logger.info("\n📄 Parsing %s for skill building...", largest[1]["filename"])
901
+
902
+ parser = LlmsTxtParser(largest[1]["content"], self.base_url)
903
+
904
+ # Extract URLs from llms.txt and add to pending_urls for BFS crawling
905
+ extracted_urls = parser.extract_urls()
906
+ if extracted_urls:
907
+ # Convert non-.md URLs to .md format by trying /index.html.md suffix
908
+ md_urls = self._convert_to_md_urls(extracted_urls)
909
+ logger.info(
910
+ "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
911
+ len(extracted_urls),
912
+ len(md_urls),
913
+ )
914
+
915
+ # Filter URLs based on url_patterns config
916
+ for url in md_urls:
917
+ if self.is_valid_url(url) and url not in self.visited_urls:
918
+ self.pending_urls.append(url)
919
+
920
+ logger.info(
921
+ " 📋 %d URLs added to crawl queue after filtering",
922
+ len(self.pending_urls),
923
+ )
924
+
925
+ # Return False to trigger HTML scraping with the populated pending_urls
926
+ self.llms_txt_detected = True
927
+ self.llms_txt_variants = list(downloaded.keys())
928
+ return False # Continue with BFS crawling
929
+
930
+ # Fallback: if no URLs found, use section-based parsing
931
+ pages = parser.parse()
932
+
933
+ if not pages:
934
+ logger.warning("⚠️ Failed to parse llms.txt, falling back to HTML scraping")
935
+ return False
936
+
937
+ logger.info(" ✓ Parsed %d sections", len(pages))
938
+
939
+ # Save pages for skill building
940
+ for page in pages:
941
+ self.save_page(page)
942
+ self.pages.append(page)
943
+
944
+ self.llms_txt_detected = True
945
+ self.llms_txt_variants = list(downloaded.keys())
946
+
947
+ return True
948
+
949
+ def scrape_all(self) -> None:
950
+ """Scrape all pages (supports llms.txt and HTML scraping)
951
+
952
+ Routes to async version if async_mode is enabled in config.
953
+ """
954
+ # Route to async version if enabled
955
+ if self.async_mode:
956
+ asyncio.run(self.scrape_all_async())
957
+ return
958
+
959
+ # Try llms.txt first (unless dry-run or explicitly disabled)
960
+ if not self.dry_run and not self.skip_llms_txt:
961
+ llms_result = self._try_llms_txt()
962
+ if llms_result:
963
+ logger.info(
964
+ "\n✅ Used llms.txt (%s) - skipping HTML scraping",
965
+ self.llms_txt_variant,
966
+ )
967
+ self.save_summary()
968
+ return
969
+
970
+ # HTML scraping (sync/thread-based logic)
971
+ logger.info("\n" + "=" * 60)
972
+ if self.dry_run:
973
+ logger.info("DRY RUN: %s", self.name)
974
+ else:
975
+ logger.info("SCRAPING: %s", self.name)
976
+ logger.info("=" * 60)
977
+ logger.info("Base URL: %s", self.base_url)
978
+
979
+ if self.dry_run:
980
+ logger.info("Mode: Preview only (no actual scraping)\n")
981
+ else:
982
+ logger.info("Output: %s", self.data_dir)
983
+ if self.workers > 1:
984
+ logger.info("Workers: %d parallel threads", self.workers)
985
+ logger.info("")
986
+
987
+ max_pages = self.config.get("max_pages", DEFAULT_MAX_PAGES)
988
+
989
+ # Handle unlimited mode
990
+ if max_pages is None or max_pages == -1:
991
+ logger.warning("⚠️ UNLIMITED MODE: No page limit (will scrape all pages)\n")
992
+ unlimited = True
993
+ else:
994
+ unlimited = False
995
+
996
+ # Dry run: preview first 20 URLs
997
+ preview_limit = 20 if self.dry_run else max_pages
998
+
999
+ # Single-threaded mode (original sequential logic)
1000
+ if self.workers <= 1:
1001
+ while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit):
1002
+ url = self.pending_urls.popleft()
1003
+
1004
+ if url in self.visited_urls:
1005
+ continue
1006
+
1007
+ self.visited_urls.add(url)
1008
+
1009
+ if self.dry_run:
1010
+ # Just show what would be scraped
1011
+ logger.info(" [Preview] %s", url)
1012
+ try:
1013
+ headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"}
1014
+ response = requests.get(url, headers=headers, timeout=10)
1015
+ soup = BeautifulSoup(response.content, "html.parser")
1016
+
1017
+ main_selector = self.config.get("selectors", {}).get(
1018
+ "main_content", 'div[role="main"]'
1019
+ )
1020
+ main = soup.select_one(main_selector)
1021
+
1022
+ if main:
1023
+ for link in main.find_all("a", href=True):
1024
+ href = urljoin(url, link["href"])
1025
+ if self.is_valid_url(href) and href not in self.visited_urls:
1026
+ self.pending_urls.append(href)
1027
+ except Exception as e:
1028
+ # Failed to extract links in fast mode, continue anyway
1029
+ logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e)
1030
+ else:
1031
+ self.scrape_page(url)
1032
+ self.pages_scraped += 1
1033
+
1034
+ if (
1035
+ self.checkpoint_enabled
1036
+ and self.pages_scraped % self.checkpoint_interval == 0
1037
+ ):
1038
+ self.save_checkpoint()
1039
+
1040
+ if len(self.visited_urls) % 10 == 0:
1041
+ logger.info(" [%d pages]", len(self.visited_urls))
1042
+
1043
+ # Multi-threaded mode (parallel scraping)
1044
+ else:
1045
+ from concurrent.futures import ThreadPoolExecutor, as_completed
1046
+
1047
+ logger.info("🚀 Starting parallel scraping with %d workers\n", self.workers)
1048
+
1049
+ with ThreadPoolExecutor(max_workers=self.workers) as executor:
1050
+ futures = []
1051
+
1052
+ while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit):
1053
+ # Get next batch of URLs (thread-safe)
1054
+ batch = []
1055
+ batch_size = min(self.workers * 2, len(self.pending_urls))
1056
+
1057
+ with self.lock:
1058
+ for _ in range(batch_size):
1059
+ if not self.pending_urls:
1060
+ break
1061
+ url = self.pending_urls.popleft()
1062
+
1063
+ if url not in self.visited_urls:
1064
+ self.visited_urls.add(url)
1065
+ batch.append(url)
1066
+
1067
+ # Submit batch to executor
1068
+ for url in batch:
1069
+ if unlimited or len(self.visited_urls) <= preview_limit:
1070
+ future = executor.submit(self.scrape_page, url)
1071
+ futures.append(future)
1072
+
1073
+ # Wait for some to complete before submitting more
1074
+ for future in as_completed(futures[:batch_size]):
1075
+ # Check for exceptions
1076
+ try:
1077
+ future.result() # Raises exception if scrape_page failed
1078
+ except Exception as e:
1079
+ with self.lock:
1080
+ logger.warning(" ⚠️ Worker exception: %s", e)
1081
+
1082
+ with self.lock:
1083
+ self.pages_scraped += 1
1084
+
1085
+ if (
1086
+ self.checkpoint_enabled
1087
+ and self.pages_scraped % self.checkpoint_interval == 0
1088
+ ):
1089
+ self.save_checkpoint()
1090
+
1091
+ if self.pages_scraped % 10 == 0:
1092
+ logger.info(" [%d pages scraped]", self.pages_scraped)
1093
+
1094
+ # Remove completed futures
1095
+ futures = [f for f in futures if not f.done()]
1096
+
1097
+ # Wait for remaining futures
1098
+ for future in as_completed(futures):
1099
+ # Check for exceptions
1100
+ try:
1101
+ future.result()
1102
+ except Exception as e:
1103
+ with self.lock:
1104
+ logger.warning(" ⚠️ Worker exception: %s", e)
1105
+
1106
+ with self.lock:
1107
+ self.pages_scraped += 1
1108
+
1109
+ if self.dry_run:
1110
+ logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls))
1111
+ if len(self.visited_urls) >= preview_limit:
1112
+ logger.info(
1113
+ " (showing first %d, actual scraping may find more)",
1114
+ preview_limit,
1115
+ )
1116
+ logger.info("\n💡 To actually scrape, run without --dry-run")
1117
+ else:
1118
+ logger.info("\n✅ Scraped %d pages", len(self.visited_urls))
1119
+ self.save_summary()
1120
+
1121
+ async def scrape_all_async(self) -> None:
1122
+ """Scrape all pages asynchronously (async/await version).
1123
+
1124
+ This method provides significantly better performance for parallel scraping
1125
+ compared to thread-based scraping, with lower memory overhead and better
1126
+ CPU utilization.
1127
+
1128
+ Performance: ~2-3x faster than sync mode with same worker count.
1129
+ """
1130
+ # Try llms.txt first (unless dry-run or explicitly disabled)
1131
+ if not self.dry_run and not self.skip_llms_txt:
1132
+ llms_result = self._try_llms_txt()
1133
+ if llms_result:
1134
+ logger.info(
1135
+ "\n✅ Used llms.txt (%s) - skipping HTML scraping",
1136
+ self.llms_txt_variant,
1137
+ )
1138
+ self.save_summary()
1139
+ return
1140
+
1141
+ # HTML scraping (async version)
1142
+ logger.info("\n" + "=" * 60)
1143
+ if self.dry_run:
1144
+ logger.info("DRY RUN (ASYNC): %s", self.name)
1145
+ else:
1146
+ logger.info("SCRAPING (ASYNC): %s", self.name)
1147
+ logger.info("=" * 60)
1148
+ logger.info("Base URL: %s", self.base_url)
1149
+
1150
+ if self.dry_run:
1151
+ logger.info("Mode: Preview only (no actual scraping)\n")
1152
+ else:
1153
+ logger.info("Output: %s", self.data_dir)
1154
+ logger.info("Workers: %d concurrent tasks (async)", self.workers)
1155
+ logger.info("")
1156
+
1157
+ max_pages = self.config.get("max_pages", DEFAULT_MAX_PAGES)
1158
+
1159
+ # Handle unlimited mode
1160
+ if max_pages is None or max_pages == -1:
1161
+ logger.warning("⚠️ UNLIMITED MODE: No page limit (will scrape all pages)\n")
1162
+ unlimited = True
1163
+ preview_limit = float("inf")
1164
+ else:
1165
+ unlimited = False
1166
+ preview_limit = 20 if self.dry_run else max_pages
1167
+
1168
+ # Create semaphore for concurrency control
1169
+ semaphore = asyncio.Semaphore(self.workers)
1170
+
1171
+ # Create shared HTTP client with connection pooling
1172
+ async with httpx.AsyncClient(
1173
+ timeout=30.0, limits=httpx.Limits(max_connections=self.workers * 2)
1174
+ ) as client:
1175
+ tasks = []
1176
+
1177
+ while self.pending_urls and (unlimited or len(self.visited_urls) < preview_limit):
1178
+ # Get next batch of URLs
1179
+ batch = []
1180
+ batch_size = min(self.workers * 2, len(self.pending_urls))
1181
+
1182
+ for _ in range(batch_size):
1183
+ if not self.pending_urls:
1184
+ break
1185
+ url = self.pending_urls.popleft()
1186
+
1187
+ if url not in self.visited_urls:
1188
+ self.visited_urls.add(url)
1189
+ batch.append(url)
1190
+
1191
+ # Create async tasks for batch
1192
+ for url in batch:
1193
+ if unlimited or len(self.visited_urls) <= preview_limit:
1194
+ if self.dry_run:
1195
+ logger.info(" [Preview] %s", url)
1196
+ else:
1197
+ task = asyncio.create_task(
1198
+ self.scrape_page_async(url, semaphore, client)
1199
+ )
1200
+ tasks.append(task)
1201
+
1202
+ # Wait for batch to complete before continuing
1203
+ if tasks:
1204
+ await asyncio.gather(*tasks, return_exceptions=True)
1205
+ tasks = []
1206
+ self.pages_scraped = len(self.visited_urls)
1207
+
1208
+ # Progress indicator
1209
+ if self.pages_scraped % 10 == 0 and not self.dry_run:
1210
+ logger.info(" [%d pages scraped]", self.pages_scraped)
1211
+
1212
+ # Checkpoint saving
1213
+ if (
1214
+ not self.dry_run
1215
+ and self.checkpoint_enabled
1216
+ and self.pages_scraped % self.checkpoint_interval == 0
1217
+ ):
1218
+ self.save_checkpoint()
1219
+
1220
+ # Wait for any remaining tasks
1221
+ if tasks:
1222
+ await asyncio.gather(*tasks, return_exceptions=True)
1223
+
1224
+ if self.dry_run:
1225
+ logger.info("\n✅ Dry run complete: would scrape ~%d pages", len(self.visited_urls))
1226
+ if len(self.visited_urls) >= preview_limit:
1227
+ logger.info(
1228
+ " (showing first %d, actual scraping may find more)",
1229
+ int(preview_limit),
1230
+ )
1231
+ logger.info("\n💡 To actually scrape, run without --dry-run")
1232
+ else:
1233
+ logger.info("\n✅ Scraped %d pages (async mode)", len(self.visited_urls))
1234
+ self.save_summary()
1235
+
1236
+ def save_summary(self) -> None:
1237
+ """Save scraping summary"""
1238
+ summary = {
1239
+ "name": self.name,
1240
+ "total_pages": len(self.pages),
1241
+ "base_url": self.base_url,
1242
+ "llms_txt_detected": self.llms_txt_detected,
1243
+ "llms_txt_variant": self.llms_txt_variant,
1244
+ "pages": [{"title": p["title"], "url": p["url"]} for p in self.pages],
1245
+ }
1246
+
1247
+ with open(f"{self.data_dir}/summary.json", "w", encoding="utf-8") as f:
1248
+ json.dump(summary, f, indent=2, ensure_ascii=False)
1249
+
1250
+ def load_scraped_data(self) -> list[dict[str, Any]]:
1251
+ """Load previously scraped data"""
1252
+ pages = []
1253
+ pages_dir = Path(self.data_dir) / "pages"
1254
+
1255
+ if not pages_dir.exists():
1256
+ return []
1257
+
1258
+ for json_file in pages_dir.glob("*.json"):
1259
+ try:
1260
+ with open(json_file, encoding="utf-8") as f:
1261
+ pages.append(json.load(f))
1262
+ except Exception as e:
1263
+ logger.error(
1264
+ "⚠️ Error loading scraped data file %s: %s: %s",
1265
+ json_file,
1266
+ type(e).__name__,
1267
+ e,
1268
+ )
1269
+ logger.error(
1270
+ " Suggestion: File may be corrupted, consider re-scraping with --fresh"
1271
+ )
1272
+
1273
+ return pages
1274
+
1275
+ def smart_categorize(self, pages: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
1276
+ """Improved categorization with better pattern matching"""
1277
+ category_defs = self.config.get("categories", {})
1278
+
1279
+ # Default smart categories if none provided
1280
+ if not category_defs:
1281
+ category_defs = self.infer_categories(pages)
1282
+
1283
+ categories: dict[str, list[dict[str, Any]]] = {cat: [] for cat in category_defs}
1284
+ categories["other"] = []
1285
+
1286
+ for page in pages:
1287
+ url = page["url"].lower()
1288
+ title = page["title"].lower()
1289
+ content = page.get("content", "").lower()[
1290
+ :CONTENT_PREVIEW_LENGTH
1291
+ ] # Check first N chars for categorization
1292
+
1293
+ categorized = False
1294
+
1295
+ # Match against keywords
1296
+ for cat, keywords in category_defs.items():
1297
+ score = 0
1298
+ for keyword in keywords:
1299
+ keyword = keyword.lower()
1300
+ if keyword in url:
1301
+ score += 3
1302
+ if keyword in title:
1303
+ score += 2
1304
+ if keyword in content:
1305
+ score += 1
1306
+
1307
+ if score >= MIN_CATEGORIZATION_SCORE: # Threshold for categorization
1308
+ categories[cat].append(page)
1309
+ categorized = True
1310
+ break
1311
+
1312
+ if not categorized:
1313
+ categories["other"].append(page)
1314
+
1315
+ # Remove empty categories
1316
+ categories = {k: v for k, v in categories.items() if v}
1317
+
1318
+ return categories
1319
+
1320
+ def infer_categories(self, pages: list[dict[str, Any]]) -> dict[str, list[str]]:
1321
+ """Infer categories from URL patterns (IMPROVED)"""
1322
+ url_segments: defaultdict[str, int] = defaultdict(int)
1323
+
1324
+ for page in pages:
1325
+ path = urlparse(page["url"]).path
1326
+ segments = [
1327
+ s for s in path.split("/") if s and s not in ["en", "stable", "latest", "docs"]
1328
+ ]
1329
+
1330
+ for seg in segments:
1331
+ url_segments[seg] += 1
1332
+
1333
+ # Top segments become categories
1334
+ top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8]
1335
+
1336
+ categories = {}
1337
+ for seg, count in top_segments:
1338
+ if count >= 3: # At least 3 pages
1339
+ categories[seg] = [seg]
1340
+
1341
+ # Add common defaults
1342
+ if "tutorial" not in categories and any(
1343
+ "tutorial" in url for url in [p["url"] for p in pages]
1344
+ ):
1345
+ categories["tutorials"] = ["tutorial", "guide", "getting-started"]
1346
+
1347
+ if "api" not in categories and any(
1348
+ "api" in url or "reference" in url for url in [p["url"] for p in pages]
1349
+ ):
1350
+ categories["api"] = ["api", "reference", "class"]
1351
+
1352
+ return categories
1353
+
1354
+ def generate_quick_reference(self, pages: list[dict[str, Any]]) -> list[dict[str, str]]:
1355
+ """Generate quick reference from common patterns (NEW FEATURE)"""
1356
+ quick_ref = []
1357
+
1358
+ # Collect all patterns
1359
+ all_patterns = []
1360
+ for page in pages:
1361
+ all_patterns.extend(page.get("patterns", []))
1362
+
1363
+ # Get most common code patterns
1364
+ seen_codes = set()
1365
+ for pattern in all_patterns:
1366
+ code = pattern["code"]
1367
+ if code not in seen_codes and len(code) < 300:
1368
+ quick_ref.append(pattern)
1369
+ seen_codes.add(code)
1370
+ if len(quick_ref) >= 15:
1371
+ break
1372
+
1373
+ return quick_ref
1374
+
1375
+ def create_reference_file(self, category: str, pages: list[dict[str, Any]]) -> None:
1376
+ """Create enhanced reference file"""
1377
+ if not pages:
1378
+ return
1379
+
1380
+ lines = []
1381
+ lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n")
1382
+ lines.append(f"**Pages:** {len(pages)}\n")
1383
+ lines.append("---\n")
1384
+
1385
+ for page in pages:
1386
+ lines.append(f"## {page['title']}\n")
1387
+ lines.append(f"**URL:** {page['url']}\n")
1388
+
1389
+ # Table of contents from headings
1390
+ if page.get("headings"):
1391
+ lines.append("**Contents:**")
1392
+ for h in page["headings"][:10]:
1393
+ level = int(h["level"][1]) if len(h["level"]) > 1 else 1
1394
+ indent = " " * max(0, level - 2)
1395
+ lines.append(f"{indent}- {h['text']}")
1396
+ lines.append("")
1397
+
1398
+ # Content (NO TRUNCATION)
1399
+ if page.get("content"):
1400
+ lines.append(page["content"])
1401
+ lines.append("")
1402
+
1403
+ # Code examples with language (NO TRUNCATION)
1404
+ if page.get("code_samples"):
1405
+ lines.append("**Examples:**\n")
1406
+ for i, sample in enumerate(page["code_samples"][:4], 1):
1407
+ lang = sample.get("language", "unknown")
1408
+ code = sample.get("code", sample if isinstance(sample, str) else "")
1409
+ lines.append(f"Example {i} ({lang}):")
1410
+ lines.append(f"```{lang}")
1411
+ lines.append(code) # Full code, no truncation
1412
+ lines.append("```\n")
1413
+
1414
+ lines.append("---\n")
1415
+
1416
+ filepath = os.path.join(self.skill_dir, "references", f"{category}.md")
1417
+ with open(filepath, "w", encoding="utf-8") as f:
1418
+ f.write("\n".join(lines))
1419
+
1420
+ logger.info(" ✓ %s.md (%d pages)", category, len(pages))
1421
+
1422
+ def create_enhanced_skill_md(
1423
+ self,
1424
+ categories: dict[str, list[dict[str, Any]]],
1425
+ quick_ref: list[dict[str, str]],
1426
+ ) -> None:
1427
+ """Create SKILL.md with actual examples (IMPROVED)"""
1428
+ # Try to infer description if not in config
1429
+ if "description" not in self.config:
1430
+ # Get first page HTML content to infer description
1431
+ first_page_html = None
1432
+ for pages in categories.values():
1433
+ if pages:
1434
+ first_page_html = pages[0].get("raw_html", "")
1435
+ break
1436
+ description = infer_description_from_docs(self.base_url, first_page_html, self.name)
1437
+ else:
1438
+ description = self.config["description"]
1439
+
1440
+ # Extract actual code examples from docs
1441
+ example_codes = []
1442
+ for pages in categories.values():
1443
+ for page in pages[:3]: # First 3 pages per category
1444
+ for sample in page.get("code_samples", [])[:2]: # First 2 samples per page
1445
+ code = sample.get("code", sample if isinstance(sample, str) else "")
1446
+ lang = sample.get("language", "unknown")
1447
+ if len(code) < 200 and lang != "unknown":
1448
+ example_codes.append((lang, code))
1449
+ if len(example_codes) >= 10:
1450
+ break
1451
+ if len(example_codes) >= 10:
1452
+ break
1453
+ if len(example_codes) >= 10:
1454
+ break
1455
+
1456
+ content = f"""---
1457
+ name: {self.name}
1458
+ description: {description}
1459
+ ---
1460
+
1461
+ # {self.name.title()} Skill
1462
+
1463
+ {description.capitalize()}, generated from official documentation.
1464
+
1465
+ ## When to Use This Skill
1466
+
1467
+ This skill should be triggered when:
1468
+ - Working with {self.name}
1469
+ - Asking about {self.name} features or APIs
1470
+ - Implementing {self.name} solutions
1471
+ - Debugging {self.name} code
1472
+ - Learning {self.name} best practices
1473
+
1474
+ ## Quick Reference
1475
+
1476
+ ### Common Patterns
1477
+
1478
+ """
1479
+
1480
+ # Add actual quick reference patterns
1481
+ if quick_ref:
1482
+ for i, pattern in enumerate(quick_ref[:8], 1):
1483
+ desc = pattern.get("description", "Example pattern")
1484
+ # Format description: extract first sentence, truncate if too long
1485
+ first_sentence = desc.split(".")[0] if "." in desc else desc
1486
+ if len(first_sentence) > 150:
1487
+ first_sentence = first_sentence[:147] + "..."
1488
+
1489
+ content += f"**Pattern {i}:** {first_sentence}\n\n"
1490
+ content += "```\n"
1491
+ content += pattern.get("code", "")[:300]
1492
+ content += "\n```\n\n"
1493
+ else:
1494
+ content += "*Quick reference patterns will be added as you use the skill.*\n\n"
1495
+
1496
+ # Add example codes from docs
1497
+ if example_codes:
1498
+ content += "### Example Code Patterns\n\n"
1499
+ for i, (lang, code) in enumerate(example_codes[:5], 1):
1500
+ content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n"
1501
+
1502
+ content += """## Reference Files
1503
+
1504
+ This skill includes comprehensive documentation in `references/`:
1505
+
1506
+ """
1507
+
1508
+ for cat in sorted(categories.keys()):
1509
+ content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n"
1510
+
1511
+ content += """
1512
+ Use `view` to read specific reference files when detailed information is needed.
1513
+
1514
+ ## Working with This Skill
1515
+
1516
+ ### For Beginners
1517
+ Start with the getting_started or tutorials reference files for foundational concepts.
1518
+
1519
+ ### For Specific Features
1520
+ Use the appropriate category reference file (api, guides, etc.) for detailed information.
1521
+
1522
+ ### For Code Examples
1523
+ The quick reference section above contains common patterns extracted from the official docs.
1524
+
1525
+ ## Resources
1526
+
1527
+ ### references/
1528
+ Organized documentation extracted from official sources. These files contain:
1529
+ - Detailed explanations
1530
+ - Code examples with language annotations
1531
+ - Links to original documentation
1532
+ - Table of contents for quick navigation
1533
+
1534
+ ### scripts/
1535
+ Add helper scripts here for common automation tasks.
1536
+
1537
+ ### assets/
1538
+ Add templates, boilerplate, or example projects here.
1539
+
1540
+ ## Notes
1541
+
1542
+ - This skill was automatically generated from official documentation
1543
+ - Reference files preserve the structure and examples from source docs
1544
+ - Code examples include language detection for better syntax highlighting
1545
+ - Quick reference patterns are extracted from common usage examples in the docs
1546
+
1547
+ ## Updating
1548
+
1549
+ To refresh this skill with updated documentation:
1550
+ 1. Re-run the scraper with the same configuration
1551
+ 2. The skill will be rebuilt with the latest information
1552
+ """
1553
+
1554
+ filepath = os.path.join(self.skill_dir, "SKILL.md")
1555
+ with open(filepath, "w", encoding="utf-8") as f:
1556
+ f.write(content)
1557
+
1558
+ logger.info(" ✓ SKILL.md (enhanced with %d examples)", len(example_codes))
1559
+
1560
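The sample-collection loop near the top of `create_enhanced_skill_md` enforces its ten-snippet cap with three nested `break` statements. A generator plus `itertools.islice` expresses the same cap more directly; the sketch below is illustrative only (`iter_short_samples` is not part of the package) and assumes `categories` has the shape used above.

```python
from itertools import islice
from typing import Any, Iterator


def iter_short_samples(categories: dict[str, list[dict[str, Any]]],
                       max_len: int = 200) -> Iterator[tuple[str, str]]:
    """Yield (language, code) pairs for short, language-tagged samples."""
    for pages in categories.values():
        for page in pages[:3]:                               # first 3 pages per category
            for sample in page.get("code_samples", [])[:2]:  # first 2 samples per page
                if not isinstance(sample, dict):
                    continue                                 # plain-string samples carry no language tag
                code = sample.get("code", "")
                lang = sample.get("language", "unknown")
                if code and len(code) < max_len and lang != "unknown":
                    yield lang, code


# Usage (categories as assembled by smart_categorize()):
#     example_codes = list(islice(iter_short_samples(categories), 10))
```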
+ def create_index(self, categories: dict[str, list[dict[str, Any]]]) -> None:
1561
+ """Create navigation index"""
1562
+ lines = []
1563
+ lines.append(f"# {self.name.title()} Documentation Index\n")
1564
+ lines.append("## Categories\n")
1565
+
1566
+ for cat, pages in sorted(categories.items()):
1567
+ lines.append(f"### {cat.replace('_', ' ').title()}")
1568
+ lines.append(f"**File:** `{cat}.md`")
1569
+ lines.append(f"**Pages:** {len(pages)}\n")
1570
+
1571
+ filepath = os.path.join(self.skill_dir, "references", "index.md")
1572
+ with open(filepath, "w", encoding="utf-8") as f:
1573
+ f.write("\n".join(lines))
1574
+
1575
+ logger.info(" ✓ index.md")
1576
+
1577
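For orientation, the index layout that `create_index` writes can be reproduced standalone; the category names and page counts below are made up.

```python
categories = {
    "api": [{"title": "Hooks"}, {"title": "Components"}],
    "getting_started": [{"title": "Quick Start"}],
}

lines = ["# Example Documentation Index\n", "## Categories\n"]
for cat, pages in sorted(categories.items()):
    lines.append(f"### {cat.replace('_', ' ').title()}")
    lines.append(f"**File:** `{cat}.md`")
    lines.append(f"**Pages:** {len(pages)}\n")

print("\n".join(lines))
```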
+ def build_skill(self) -> bool:
1578
+ """Build the skill from scraped data.
1579
+
1580
+ Loads scraped JSON files, categorizes pages, extracts patterns,
1581
+ and generates SKILL.md and reference files.
1582
+
1583
+ Returns:
1584
+ bool: True if build succeeded, False otherwise
1585
+ """
1586
+ logger.info("\n" + "=" * 60)
1587
+ logger.info("BUILDING SKILL: %s", self.name)
1588
+ logger.info("=" * 60 + "\n")
1589
+
1590
+ # Load data
1591
+ logger.info("Loading scraped data...")
1592
+ pages = self.load_scraped_data()
1593
+
1594
+ if not pages:
1595
+ logger.error("✗ No scraped data found!")
1596
+ return False
1597
+
1598
+ logger.info(" ✓ Loaded %d pages\n", len(pages))
1599
+
1600
+ # Categorize
1601
+ logger.info("Categorizing pages...")
1602
+ categories = self.smart_categorize(pages)
1603
+ logger.info(" ✓ Created %d categories\n", len(categories))
1604
+
1605
+ # Generate quick reference
1606
+ logger.info("Generating quick reference...")
1607
+ quick_ref = self.generate_quick_reference(pages)
1608
+ logger.info(" ✓ Extracted %d patterns\n", len(quick_ref))
1609
+
1610
+ # Create reference files
1611
+ logger.info("Creating reference files...")
1612
+ for cat, cat_pages in categories.items():
1613
+ self.create_reference_file(cat, cat_pages)
1614
+
1615
+ # Create index
1616
+ self.create_index(categories)
1617
+ logger.info("")
1618
+
1619
+ # Create enhanced SKILL.md
1620
+ logger.info("Creating SKILL.md...")
1621
+ self.create_enhanced_skill_md(categories, quick_ref)
1622
+
1623
+ logger.info("\n✅ Skill built: %s/", self.skill_dir)
1624
+ return True
1625
+
1626
+
1627
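`build_skill` is normally driven through the CLI, but the same pipeline can be exercised programmatically. A sketch under the assumption that `DocToSkillConverter` is importable from this module; the config values are placeholders.

```python
from skill_seekers.cli.doc_scraper import DocToSkillConverter  # import path assumed

config = {
    "name": "example-docs",
    "description": "Use when working with Example",
    "base_url": "https://docs.example.com/",
    "selectors": {"main_content": "div[role='main']", "title": "title", "code_blocks": "pre code"},
    "url_patterns": {"include": [], "exclude": []},
    "rate_limit": 0.5,   # placeholder values
    "max_pages": 100,
}

converter = DocToSkillConverter(config)
converter.scrape_all()                  # skip if output/example-docs_data/ already exists
if not converter.build_skill():
    raise SystemExit("skill build failed")
```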
+ def validate_config(config: dict[str, Any]) -> tuple[list[str], list[str]]:
1628
+ """Validate configuration structure and values.
1629
+
1630
+ Args:
1631
+ config (dict): Configuration dictionary to validate
1632
+
1633
+ Returns:
1634
+ tuple: (errors, warnings) where each is a list of strings
1635
+
1636
+ Example:
1637
+ >>> errors, warnings = validate_config({'name': 'test', 'base_url': 'https://example.com'})
1638
+ >>> if errors:
1639
+ ... print("Invalid config:", errors)
1640
+ """
1641
+ errors = []
1642
+ warnings = []
1643
+
1644
+ # Required fields
1645
+ required_fields = ["name", "base_url"]
1646
+ for field in required_fields:
1647
+ if field not in config:
1648
+ errors.append(f"Missing required field: '{field}'")
1649
+
1650
+ # Validate name (alphanumeric, hyphens, underscores only)
1651
+ if "name" in config and not re.match(r"^[a-zA-Z0-9_-]+$", config["name"]):
1652
+ errors.append(
1653
+ f"Invalid name: '{config['name']}' (use only letters, numbers, hyphens, underscores)"
1654
+ )
1655
+
1656
+ # Validate base_url
1657
+ if "base_url" in config and not config["base_url"].startswith(("http://", "https://")):
1658
+ errors.append(
1659
+ f"Invalid base_url: '{config['base_url']}' (must start with http:// or https://)"
1660
+ )
1661
+
1662
+ # Validate selectors structure
1663
+ if "selectors" in config:
1664
+ if not isinstance(config["selectors"], dict):
1665
+ errors.append("'selectors' must be a dictionary")
1666
+ else:
1667
+ recommended_selectors = ["main_content", "title", "code_blocks"]
1668
+ for selector in recommended_selectors:
1669
+ if selector not in config["selectors"]:
1670
+ warnings.append(f"Missing recommended selector: '{selector}'")
1671
+ else:
1672
+ warnings.append("Missing 'selectors' section (recommended)")
1673
+
1674
+ # Validate url_patterns
1675
+ if "url_patterns" in config:
1676
+ if not isinstance(config["url_patterns"], dict):
1677
+ errors.append("'url_patterns' must be a dictionary")
1678
+ else:
1679
+ for key in ["include", "exclude"]:
1680
+ if key in config["url_patterns"] and not isinstance(
1681
+ config["url_patterns"][key], list
1682
+ ):
1683
+ errors.append(f"'url_patterns.{key}' must be a list")
1684
+
1685
+ # Validate categories
1686
+ if "categories" in config:
1687
+ if not isinstance(config["categories"], dict):
1688
+ errors.append("'categories' must be a dictionary")
1689
+ else:
1690
+ for cat_name, keywords in config["categories"].items():
1691
+ if not isinstance(keywords, list):
1692
+ errors.append(f"'categories.{cat_name}' must be a list of keywords")
1693
+
1694
+ # Validate rate_limit
1695
+ if "rate_limit" in config:
1696
+ try:
1697
+ rate = float(config["rate_limit"])
1698
+ if rate < 0:
1699
+ errors.append(f"'rate_limit' must be non-negative (got {rate})")
1700
+ elif rate > 10:
1701
+ warnings.append(
1702
+ f"'rate_limit' is very high ({rate}s) - this may slow down scraping significantly"
1703
+ )
1704
+ except (ValueError, TypeError):
1705
+ errors.append(f"'rate_limit' must be a number (got {config['rate_limit']})")
1706
+
1707
+ # Validate max_pages
1708
+ if "max_pages" in config:
1709
+ max_p_value = config["max_pages"]
1710
+
1711
+ # Allow None for unlimited
1712
+ if max_p_value is None:
1713
+ warnings.append(
1714
+ "'max_pages' is None (unlimited) - this will scrape ALL pages. Use with caution!"
1715
+ )
1716
+ else:
1717
+ try:
1718
+ max_p = int(max_p_value)
1719
+ # Allow -1 for unlimited
1720
+ if max_p == -1:
1721
+ warnings.append(
1722
+ "'max_pages' is -1 (unlimited) - this will scrape ALL pages. Use with caution!"
1723
+ )
1724
+ elif max_p < 1:
1725
+ errors.append(
1726
+ f"'max_pages' must be at least 1 or -1 for unlimited (got {max_p})"
1727
+ )
1728
+ elif max_p > MAX_PAGES_WARNING_THRESHOLD:
1729
+ warnings.append(
1730
+ f"'max_pages' is very high ({max_p}) - scraping may take a very long time"
1731
+ )
1732
+ except (ValueError, TypeError):
1733
+ errors.append(
1734
+ f"'max_pages' must be an integer, -1, or null (got {config['max_pages']})"
1735
+ )
1736
+
1737
+ # Validate start_urls if present
1738
+ if "start_urls" in config:
1739
+ if not isinstance(config["start_urls"], list):
1740
+ errors.append("'start_urls' must be a list")
1741
+ else:
1742
+ for url in config["start_urls"]:
1743
+ if not url.startswith(("http://", "https://")):
1744
+ errors.append(
1745
+ f"Invalid start_url: '{url}' (must start with http:// or https://)"
1746
+ )
1747
+
1748
+ return errors, warnings
1749
+
1750
+
1751
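A quick illustration of how `validate_config` reports problems, using a deliberately sloppy config (the expected messages are paraphrased in the comments; the import path is assumed).

```python
from skill_seekers.cli.doc_scraper import validate_config  # import path assumed

bad = {"name": "my skill!", "base_url": "ftp://example.com", "rate_limit": "fast"}
errors, warnings = validate_config(bad)

for msg in errors:
    print("ERROR:", msg)   # invalid name, non-http(s) base_url, non-numeric rate_limit
for msg in warnings:
    print("WARN:", msg)    # missing 'selectors' section (recommended)
```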
+ def load_config(config_path: str) -> dict[str, Any]:
1752
+ """Load and validate configuration from JSON file.
1753
+
1754
+ Args:
1755
+ config_path (str): Path to JSON configuration file
1756
+
1757
+ Returns:
1758
+ dict: Validated configuration dictionary
1759
+
1760
+ Raises:
1761
+ SystemExit: If config is invalid or file not found
1762
+
1763
+ Example:
1764
+ >>> config = load_config('configs/react.json')
1765
+ >>> print(config['name'])
1766
+ react
1767
+ """
1768
+ try:
1769
+ with open(config_path, encoding="utf-8") as f:
1770
+ config = json.load(f)
1771
+ except json.JSONDecodeError as e:
1772
+ logger.error("❌ Error: Invalid JSON in config file: %s", config_path)
1773
+ logger.error(" Details: %s", e)
1774
+ logger.error(" Suggestion: Check syntax at line %d, column %d", e.lineno, e.colno)
1775
+ sys.exit(1)
1776
+ except FileNotFoundError:
1777
+ logger.error("❌ Error: Config file not found: %s", config_path)
1778
+ logger.error(" Suggestion: Create a config file or use an existing one from configs/")
1779
+ logger.error(" Available configs: react.json, vue.json, django.json, godot.json")
1780
+ sys.exit(1)
1781
+
1782
+ # Validate config
1783
+ errors, warnings = validate_config(config)
1784
+
1785
+ # Show warnings (non-blocking)
1786
+ if warnings:
1787
+ logger.warning("⚠️ Configuration warnings in %s:", config_path)
1788
+ for warning in warnings:
1789
+ logger.warning(" - %s", warning)
1790
+ logger.info("")
1791
+
1792
+ # Show errors (blocking)
1793
+ if errors:
1794
+ logger.error("❌ Configuration validation errors in %s:", config_path)
1795
+ for error in errors:
1796
+ logger.error(" - %s", error)
1797
+ logger.error("\n Suggestion: Fix the above errors or check configs/ for working examples")
1798
+ sys.exit(1)
1799
+
1800
+ return config
1801
+
1802
+
1803
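`load_config` exits the process on bad input rather than raising, which matters if it is called from other code or from tests. A sketch of the invalid-JSON path, assuming the function is importable from this module.

```python
import tempfile

from skill_seekers.cli.doc_scraper import load_config  # import path assumed

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    f.write('{"name": "demo", "base_url": "https://example.com/",')  # truncated JSON
    broken_path = f.name

try:
    load_config(broken_path)
except SystemExit as exc:
    print("load_config aborted with exit code", exc.code)  # 1, after logging the JSON error
```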
+ def interactive_config() -> dict[str, Any]:
1804
+ """Interactive configuration wizard for creating new configs.
1805
+
1806
+ Prompts user for all required configuration fields step-by-step
1807
+ and returns a complete configuration dictionary.
1808
+
1809
+ Returns:
1810
+ dict: Complete configuration dictionary with user-provided values
1811
+
1812
+ Example:
1813
+ >>> config = interactive_config()
1814
+ # User enters: name=react, url=https://react.dev, etc.
1815
+ >>> config['name']
1816
+ 'react'
1817
+ """
1818
+ logger.info("\n" + "=" * 60)
1819
+ logger.info("Documentation to Skill Converter")
1820
+ logger.info("=" * 60 + "\n")
1821
+
1822
+ config: dict[str, Any] = {}
1823
+
1824
+ # Basic info
1825
+ config["name"] = input("Skill name (e.g., 'react', 'godot'): ").strip()
1826
+ config["description"] = input("Skill description: ").strip()
1827
+ config["base_url"] = input("Base URL (e.g., https://docs.example.com/): ").strip()
1828
+
1829
+ if not config["base_url"].endswith("/"):
1830
+ config["base_url"] += "/"
1831
+
1832
+ # Selectors
1833
+ logger.info("\nCSS Selectors (press Enter for defaults):")
1834
+ selectors = {}
1835
+ selectors["main_content"] = (
1836
+ input(" Main content [div[role='main']]: ").strip() or "div[role='main']"
1837
+ )
1838
+ selectors["title"] = input(" Title [title]: ").strip() or "title"
1839
+ selectors["code_blocks"] = input(" Code blocks [pre code]: ").strip() or "pre code"
1840
+ config["selectors"] = selectors
1841
+
1842
+ # URL patterns
1843
+ logger.info("\nURL Patterns (comma-separated, optional):")
1844
+ include = input(" Include: ").strip()
1845
+ exclude = input(" Exclude: ").strip()
1846
+ config["url_patterns"] = {
1847
+ "include": [p.strip() for p in include.split(",") if p.strip()],
1848
+ "exclude": [p.strip() for p in exclude.split(",") if p.strip()],
1849
+ }
1850
+
1851
+ # Settings
1852
+ rate = input(f"\nRate limit (seconds) [{DEFAULT_RATE_LIMIT}]: ").strip()
1853
+ config["rate_limit"] = float(rate) if rate else DEFAULT_RATE_LIMIT
1854
+
1855
+ max_p = input(f"Max pages [{DEFAULT_MAX_PAGES}]: ").strip()
1856
+ config["max_pages"] = int(max_p) if max_p else DEFAULT_MAX_PAGES
1857
+
1858
+ return config
1859
+
1860
+
1861
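Because `interactive_config` reads everything through `input()`, it can be driven non-interactively (for example in tests) by stubbing `input`. A sketch using `unittest.mock`; the answers are placeholders.

```python
import builtins
from unittest import mock

from skill_seekers.cli.doc_scraper import interactive_config  # import path assumed

answers = iter([
    "react",                        # skill name
    "Use when working with React",  # description
    "https://react.dev",            # base URL (trailing slash is added automatically)
    "", "", "",                     # accept the three selector defaults
    "", "",                         # no include/exclude URL patterns
    "", "",                         # default rate limit and max pages
])

with mock.patch.object(builtins, "input", lambda prompt="": next(answers)):
    config = interactive_config()

assert config["base_url"] == "https://react.dev/"
assert config["selectors"]["code_blocks"] == "pre code"
```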
+ def check_existing_data(name: str) -> tuple[bool, int]:
1862
+ """Check if scraped data already exists for a skill.
1863
+
1864
+ Args:
1865
+ name (str): Skill name to check
1866
+
1867
+ Returns:
1868
+ tuple: (exists, page_count) where exists is bool and page_count is int
1869
+
1870
+ Example:
1871
+ >>> exists, count = check_existing_data('react')
1872
+ >>> if exists:
1873
+ ... print(f"Found {count} existing pages")
1874
+ """
1875
+ data_dir = f"output/{name}_data"
1876
+ if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"):
1877
+ with open(f"{data_dir}/summary.json", encoding="utf-8") as f:
1878
+ summary = json.load(f)
1879
+ return True, summary.get("total_pages", 0)
1880
+ return False, 0
1881
+
1882
+
1883
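A typical gate before an expensive re-scrape, following the `output/<name>_data` layout that `check_existing_data` inspects; the skill name is a placeholder.

```python
from skill_seekers.cli.doc_scraper import check_existing_data  # import path assumed

exists, count = check_existing_data("example-docs")
if exists:
    print(f"Reusing {count} previously scraped pages from output/example-docs_data/")
else:
    print("No cached data found - a fresh scrape is required")
```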
+ def setup_argument_parser() -> argparse.ArgumentParser:
1884
+ """Setup and configure command-line argument parser.
1885
+
1886
+ Creates an ArgumentParser with all CLI options for the doc scraper tool,
1887
+ including configuration, scraping, enhancement, and performance options.
1888
+
1889
+ Returns:
1890
+ argparse.ArgumentParser: Configured argument parser
1891
+
1892
+ Example:
1893
+ >>> parser = setup_argument_parser()
1894
+ >>> args = parser.parse_args(['--config', 'configs/react.json'])
1895
+ >>> print(args.config)
1896
+ configs/react.json
1897
+ """
1898
+ parser = argparse.ArgumentParser(
1899
+ description="Convert documentation websites to Claude skills",
1900
+ formatter_class=argparse.RawDescriptionHelpFormatter,
1901
+ )
1902
+
1903
+ # Positional URL argument (optional, for quick scraping)
1904
+ parser.add_argument(
1905
+ "url",
1906
+ nargs="?",
1907
+ type=str,
1908
+ help="Base documentation URL (alternative to --url)",
1909
+ )
1910
+
1911
+ parser.add_argument(
1912
+ "--interactive",
1913
+ "-i",
1914
+ action="store_true",
1915
+ help="Interactive configuration mode",
1916
+ )
1917
+ parser.add_argument(
1918
+ "--config",
1919
+ "-c",
1920
+ type=str,
1921
+ help="Load configuration from file (e.g., configs/godot.json)",
1922
+ )
1923
+ parser.add_argument("--name", type=str, help="Skill name")
1924
+ parser.add_argument("--url", type=str, help="Base documentation URL (alternative to positional URL)")
1925
+ parser.add_argument("--description", "-d", type=str, help="Skill description")
1926
+ parser.add_argument(
1927
+ "--max-pages",
1928
+ type=int,
1929
+ metavar="N",
1930
+ help="Maximum pages to scrape (overrides config). Use with caution - for testing/prototyping only.",
1931
+ )
1932
+ parser.add_argument(
1933
+ "--skip-scrape", action="store_true", help="Skip scraping, use existing data"
1934
+ )
1935
+ parser.add_argument(
1936
+ "--dry-run",
1937
+ action="store_true",
1938
+ help="Preview what will be scraped without actually scraping",
1939
+ )
1940
+ parser.add_argument(
1941
+ "--enhance",
1942
+ action="store_true",
1943
+ help="Enhance SKILL.md using Claude API after building (requires API key)",
1944
+ )
1945
+ parser.add_argument(
1946
+ "--enhance-local",
1947
+ action="store_true",
1948
+ help="Enhance SKILL.md using Claude Code (no API key needed, runs in background)",
1949
+ )
1950
+ parser.add_argument(
1951
+ "--interactive-enhancement",
1952
+ action="store_true",
1953
+ help="Open terminal window for enhancement (use with --enhance-local)",
1954
+ )
1955
+ parser.add_argument(
1956
+ "--api-key",
1957
+ type=str,
1958
+ help="Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)",
1959
+ )
1960
+ parser.add_argument(
1961
+ "--resume",
1962
+ action="store_true",
1963
+ help="Resume from last checkpoint (for interrupted scrapes)",
1964
+ )
1965
+ parser.add_argument("--fresh", action="store_true", help="Clear checkpoint and start fresh")
1966
+ parser.add_argument(
1967
+ "--rate-limit",
1968
+ "-r",
1969
+ type=float,
1970
+ metavar="SECONDS",
1971
+ help=f"Override rate limit in seconds (default: from config or {DEFAULT_RATE_LIMIT}). Use 0 for no delay.",
1972
+ )
1973
+ parser.add_argument(
1974
+ "--workers",
1975
+ "-w",
1976
+ type=int,
1977
+ metavar="N",
1978
+ help="Number of parallel workers for faster scraping (default: 1, max: 10)",
1979
+ )
1980
+ parser.add_argument(
1981
+ "--async",
1982
+ dest="async_mode",
1983
+ action="store_true",
1984
+ help="Enable async mode for better parallel performance (2-3x faster than threads)",
1985
+ )
1986
+ parser.add_argument(
1987
+ "--no-rate-limit",
1988
+ action="store_true",
1989
+ help="Disable rate limiting completely (same as --rate-limit 0)",
1990
+ )
1991
+ parser.add_argument(
1992
+ "--verbose",
1993
+ "-v",
1994
+ action="store_true",
1995
+ help="Enable verbose output (DEBUG level logging)",
1996
+ )
1997
+ parser.add_argument(
1998
+ "--quiet",
1999
+ "-q",
2000
+ action="store_true",
2001
+ help="Minimize output (WARNING level logging only)",
2002
+ )
2003
+
2004
+ return parser
2005
+
2006
+
2007
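The parser can be exercised without touching the network; the flags below all correspond to `add_argument` calls above.

```python
from skill_seekers.cli.doc_scraper import setup_argument_parser  # import path assumed

parser = setup_argument_parser()
args = parser.parse_args([
    "--config", "configs/react.json",
    "--workers", "4",
    "--async",
    "--rate-limit", "0.25",
    "--verbose",
])
print(args.config, args.workers, args.async_mode, args.rate_limit, args.verbose)
# configs/react.json 4 True 0.25 True
```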
+ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
2008
+ """Load or create configuration from command-line arguments.
2009
+
2010
+ Handles three configuration modes:
2011
+ 1. Load from JSON file (--config)
2012
+ 2. Interactive configuration wizard (--interactive or missing args)
2013
+ 3. Quick mode from command-line arguments (--name, --url)
2014
+
2015
+ Also applies CLI overrides for rate limiting and worker count.
2016
+
2017
+ Args:
2018
+ args: Parsed command-line arguments from argparse
2019
+
2020
+ Returns:
2021
+ dict: Configuration dictionary with all required fields
2022
+
2023
+ Example:
2024
+ >>> args = parser.parse_args(['--name', 'react', '--url', 'https://react.dev'])
2025
+ >>> config = get_configuration(args)
2026
+ >>> print(config['name'])
2027
+ react
2028
+ """
2029
+ # Handle URL from either positional argument or --url flag
2030
+ # Both the positional argument and --url write to args.url, so use whichever was supplied
2031
+ effective_url = getattr(args, 'url', None)
2032
+
2033
+ # Get base configuration
2034
+ if args.config:
2035
+ config = load_config(args.config)
2036
+ elif args.interactive or not (args.name and effective_url):
2037
+ config = interactive_config()
2038
+ else:
2039
+ config = {
2040
+ "name": args.name,
2041
+ "description": args.description or f"Use when working with {args.name}",
2042
+ "base_url": effective_url,
2043
+ "selectors": {
2044
+ "main_content": "div[role='main']",
2045
+ "title": "title",
2046
+ "code_blocks": "pre code",
2047
+ },
2048
+ "url_patterns": {"include": [], "exclude": []},
2049
+ "rate_limit": DEFAULT_RATE_LIMIT,
2050
+ "max_pages": DEFAULT_MAX_PAGES,
2051
+ }
2052
+
2053
+ # Apply CLI overrides for rate limiting
2054
+ if args.no_rate_limit:
2055
+ config["rate_limit"] = 0
2056
+ logger.info("⚡ Rate limiting disabled")
2057
+ elif args.rate_limit is not None:
2058
+ config["rate_limit"] = args.rate_limit
2059
+ if args.rate_limit == 0:
2060
+ logger.info("⚡ Rate limiting disabled")
2061
+ else:
2062
+ logger.info("⚡ Rate limit override: %ss per page", args.rate_limit)
2063
+
2064
+ # Apply CLI overrides for worker count
2065
+ if args.workers:
2066
+ # Validate workers count
2067
+ if args.workers < 1:
2068
+ logger.error("❌ Error: --workers must be at least 1 (got %d)", args.workers)
2069
+ logger.error(" Suggestion: Use --workers 1 (default) or omit the flag")
2070
+ sys.exit(1)
2071
+ if args.workers > 10:
2072
+ logger.warning("⚠️ Warning: --workers capped at 10 (requested %d)", args.workers)
2073
+ args.workers = 10
2074
+ config["workers"] = args.workers
2075
+ if args.workers > 1:
2076
+ logger.info("🚀 Parallel scraping enabled: %d workers", args.workers)
2077
+
2078
+ # Apply CLI override for async mode
2079
+ if args.async_mode:
2080
+ config["async_mode"] = True
2081
+ if config.get("workers", 1) > 1:
2082
+ logger.info("⚡ Async mode enabled (2-3x faster than threads)")
2083
+ else:
2084
+ logger.warning(
2085
+ "⚠️ Async mode enabled but workers=1. Consider using --workers 4 for better performance"
2086
+ )
2087
+
2088
+ # Apply CLI override for max_pages
2089
+ if args.max_pages is not None:
2090
+ old_max = config.get("max_pages", DEFAULT_MAX_PAGES)
2091
+ config["max_pages"] = args.max_pages
2092
+
2093
+ # Warnings for --max-pages usage
2094
+ if args.max_pages > 1000:
2095
+ logger.warning(
2096
+ "⚠️ --max-pages=%d is very high - scraping may take hours", args.max_pages
2097
+ )
2098
+ logger.warning(
2099
+ " Recommendation: Use configs with reasonable limits for production"
2100
+ )
2101
+ elif args.max_pages < 10:
2102
+ logger.warning(
2103
+ "⚠️ --max-pages=%d is very low - may result in incomplete skill", args.max_pages
2104
+ )
2105
+
2106
+ if old_max and old_max != args.max_pages:
2107
+ logger.info(
2108
+ "📊 Max pages override: %d → %d (from --max-pages flag)", old_max, args.max_pages
2109
+ )
2110
+ else:
2111
+ logger.info("📊 Max pages set to: %d (from --max-pages flag)", args.max_pages)
2112
+
2113
+ return config
2114
+
2115
+
2116
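CLI overrides layer on top of the loaded config exactly as the code above describes: `--no-rate-limit` zeroes the file's `rate_limit` and `--max-pages` replaces its page limit. A sketch assuming `configs/react.json` exists and is valid.

```python
from skill_seekers.cli.doc_scraper import get_configuration, setup_argument_parser  # import path assumed

parser = setup_argument_parser()
args = parser.parse_args(["--config", "configs/react.json",
                          "--no-rate-limit", "--max-pages", "50"])
config = get_configuration(args)

assert config["rate_limit"] == 0   # --no-rate-limit beats the file's value
assert config["max_pages"] == 50   # --max-pages replaces the file's limit
```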
+ def execute_scraping_and_building(
2117
+ config: dict[str, Any], args: argparse.Namespace
2118
+ ) -> Optional["DocToSkillConverter"]:
2119
+ """Execute the scraping and skill building process.
2120
+
2121
+ Handles dry run mode, existing data checks, scraping with checkpoints,
2122
+ keyboard interrupts, and skill building. This is the core workflow
2123
+ orchestration for the scraping phase.
2124
+
2125
+ Args:
2126
+ config (dict): Configuration dictionary with scraping parameters
2127
+ args: Parsed command-line arguments
2128
+
2129
+ Returns:
2130
+ DocToSkillConverter: The converter instance after scraping/building,
2131
+ or None if process was aborted
2132
+
2133
+ Example:
2134
+ >>> config = {'name': 'react', 'base_url': 'https://react.dev'}
2135
+ >>> converter = execute_scraping_and_building(config, args)
2136
+ >>> if converter:
2137
+ ... print("Scraping complete!")
2138
+ """
2139
+ # Dry run mode - preview only
2140
+ if args.dry_run:
2141
+ logger.info("\n" + "=" * 60)
2142
+ logger.info("DRY RUN MODE")
2143
+ logger.info("=" * 60)
2144
+ logger.info("This will show what would be scraped without saving anything.\n")
2145
+
2146
+ converter = DocToSkillConverter(config, dry_run=True)
2147
+ converter.scrape_all()
2148
+
2149
+ logger.info("\n📋 Configuration Summary:")
2150
+ logger.info(" Name: %s", config["name"])
2151
+ logger.info(" Base URL: %s", config["base_url"])
2152
+ logger.info(" Max pages: %d", config.get("max_pages", DEFAULT_MAX_PAGES))
2153
+ logger.info(" Rate limit: %ss", config.get("rate_limit", DEFAULT_RATE_LIMIT))
2154
+ logger.info(" Categories: %d", len(config.get("categories", {})))
2155
+ return None
2156
+
2157
+ # Check for existing data
2158
+ exists, page_count = check_existing_data(config["name"])
2159
+
2160
+ if exists and not args.skip_scrape and not args.fresh:
2161
+ # Check force_rescrape flag from config
2162
+ if config.get("force_rescrape", False):
2163
+ # Auto-delete cached data and rescrape
2164
+ logger.info("\n✓ Found existing data: %d pages", page_count)
2165
+ logger.info(" force_rescrape enabled - deleting cached data and re-scraping")
2166
+ import shutil
2167
+
2168
+ data_dir = f"output/{config['name']}_data"
2169
+ if os.path.exists(data_dir):
2170
+ shutil.rmtree(data_dir)
2171
+ logger.info(f" Deleted: {data_dir}")
2172
+ else:
2173
+ # Only prompt if force_rescrape is False
2174
+ logger.info("\n✓ Found existing data: %d pages", page_count)
2175
+ response = input("Use existing data? (y/n): ").strip().lower()
2176
+ if response == "y":
2177
+ args.skip_scrape = True
2178
+ elif exists and args.fresh:
2179
+ logger.info("\n✓ Found existing data: %d pages", page_count)
2180
+ logger.info(" --fresh flag set, will re-scrape from scratch")
2181
+
2182
+ # Create converter
2183
+ converter = DocToSkillConverter(config, resume=args.resume)
2184
+
2185
+ # Handle fresh start (clear checkpoint)
2186
+ if args.fresh:
2187
+ converter.clear_checkpoint()
2188
+
2189
+ # Scrape or skip
2190
+ if not args.skip_scrape:
2191
+ try:
2192
+ converter.scrape_all()
2193
+ # Save final checkpoint
2194
+ if converter.checkpoint_enabled:
2195
+ converter.save_checkpoint()
2196
+ logger.info("\n💾 Final checkpoint saved")
2197
+ # Clear checkpoint after successful completion
2198
+ converter.clear_checkpoint()
2199
+ logger.info("✅ Scraping complete - checkpoint cleared")
2200
+ except KeyboardInterrupt:
2201
+ logger.warning("\n\nScraping interrupted.")
2202
+ if converter.checkpoint_enabled:
2203
+ converter.save_checkpoint()
2204
+ logger.info("💾 Progress saved to checkpoint")
2205
+ logger.info(
2206
+ " Resume with: --config %s --resume",
2207
+ args.config if args.config else "config.json",
2208
+ )
2209
+ response = input("Continue with skill building? (y/n): ").strip().lower()
2210
+ if response != "y":
2211
+ return None
2212
+ else:
2213
+ logger.info("\n⏭️ Skipping scrape, using existing data")
2214
+
2215
+ # Build skill
2216
+ success = converter.build_skill()
2217
+
2218
+ if not success:
2219
+ sys.exit(1)
2220
+
2221
+ return converter
2222
+
2223
+
2224
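The checkpoint handling above reduces to a simple pattern: save on interrupt, clear on clean completion. A condensed sketch using the same class and method names; the config path is a placeholder.

```python
from skill_seekers.cli.doc_scraper import DocToSkillConverter, load_config  # import path assumed

config = load_config("configs/example-docs.json")   # placeholder path
converter = DocToSkillConverter(config, resume=True)
try:
    converter.scrape_all()
except KeyboardInterrupt:
    if converter.checkpoint_enabled:
        converter.save_checkpoint()    # progress survives the interrupt
    raise
else:
    if converter.checkpoint_enabled:
        converter.clear_checkpoint()   # clean finish, no stale checkpoint left behind
```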
+ def execute_enhancement(config: dict[str, Any], args: argparse.Namespace) -> None:
2225
+ """Execute optional SKILL.md enhancement with Claude.
2226
+
2227
+ Supports two enhancement modes:
2228
+ 1. API-based enhancement (requires ANTHROPIC_API_KEY)
2229
+ 2. Local enhancement using Claude Code (no API key needed)
2230
+
2231
+ Prints appropriate messages and suggestions based on whether
2232
+ enhancement was requested and whether it succeeded.
2233
+
2234
+ Args:
2235
+ config (dict): Configuration dictionary with skill name
2236
+ args: Parsed command-line arguments with enhancement flags
2237
+
2238
+ Example:
2239
+ >>> execute_enhancement(config, args)
2240
+ # Runs enhancement if --enhance or --enhance-local flag is set
2241
+ """
2242
+ import subprocess
2243
+
2244
+ # Optional enhancement with Claude API
2245
+ if args.enhance:
2246
+ logger.info("\n" + "=" * 60)
2247
+ logger.info("ENHANCING SKILL.MD WITH CLAUDE API")
2248
+ logger.info("=" * 60 + "\n")
2249
+
2250
+ try:
2251
+ enhance_cmd = [
2252
+ "python3",
2253
+ "cli/enhance_skill.py",
2254
+ f"output/{config['name']}/",
2255
+ ]
2256
+ if args.api_key:
2257
+ enhance_cmd.extend(["--api-key", args.api_key])
2258
+
2259
+ result = subprocess.run(enhance_cmd, check=True)
2260
+ if result.returncode == 0:
2261
+ logger.info("\n✅ Enhancement complete!")
2262
+ except subprocess.CalledProcessError:
2263
+ logger.warning("\n⚠ Enhancement failed, but skill was still built")
2264
+ except FileNotFoundError:
2265
+ logger.warning("\n⚠ enhance_skill.py not found. Run manually:")
2266
+ logger.info(" skill-seekers-enhance output/%s/", config["name"])
2267
+
2268
+ # Optional enhancement with Claude Code (local, no API key)
2269
+ if args.enhance_local:
2270
+ logger.info("\n" + "=" * 60)
2271
+ if args.interactive_enhancement:
2272
+ logger.info("ENHANCING SKILL.MD WITH CLAUDE CODE (INTERACTIVE)")
2273
+ else:
2274
+ logger.info("ENHANCING SKILL.MD WITH CLAUDE CODE (HEADLESS)")
2275
+ logger.info("=" * 60 + "\n")
2276
+
2277
+ try:
2278
+ enhance_cmd = ["skill-seekers-enhance", f"output/{config['name']}/"]
2279
+ if args.interactive_enhancement:
2280
+ enhance_cmd.append("--interactive-enhancement")
2281
+
2282
+ result = subprocess.run(enhance_cmd, check=True)
2283
+
2284
+ if result.returncode == 0:
2285
+ logger.info("\n✅ Enhancement complete!")
2286
+ except subprocess.CalledProcessError:
2287
+ logger.warning("\n⚠ Enhancement failed, but skill was still built")
2288
+ except FileNotFoundError:
2289
+ logger.warning("\n⚠ skill-seekers-enhance command not found. Run manually:")
2290
+ logger.info(" skill-seekers-enhance output/%s/", config["name"])
2291
+
2292
+ # Print packaging instructions
2293
+ logger.info("\n📦 Package your skill:")
2294
+ logger.info(" skill-seekers-package output/%s/", config["name"])
2295
+
2296
+ # Suggest enhancement if not done
2297
+ if not args.enhance and not args.enhance_local:
2298
+ logger.info("\n💡 Optional: Enhance SKILL.md with Claude:")
2299
+ logger.info(" Local (recommended): skill-seekers-enhance output/%s/", config["name"])
2300
+ logger.info(" or re-run with: --enhance-local")
2301
+ logger.info(
2302
+ " API-based: skill-seekers-enhance-api output/%s/",
2303
+ config["name"],
2304
+ )
2305
+ logger.info(" or re-run with: --enhance")
2306
+ logger.info(
2307
+ "\n💡 Tip: Use --interactive-enhancement with --enhance-local to open terminal window"
2308
+ )
2309
+
2310
+
2311
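The `FileNotFoundError` fallback above can also be made proactive by checking `PATH` first with `shutil.which`. A sketch, assuming `config` is the dict built earlier in the run.

```python
import shutil
import subprocess

skill_dir = f"output/{config['name']}/"              # config as built by get_configuration()
if shutil.which("skill-seekers-enhance"):
    subprocess.run(["skill-seekers-enhance", skill_dir], check=True)
else:
    print(f"Run manually: skill-seekers-enhance {skill_dir}")
```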
+ def main() -> None:
2312
+ parser = setup_argument_parser()
2313
+ args = parser.parse_args()
2314
+
2315
+ # Setup logging based on verbosity flags
2316
+ setup_logging(verbose=args.verbose, quiet=args.quiet)
2317
+
2318
+ config = get_configuration(args)
2319
+
2320
+ # Execute scraping and building
2321
+ converter = execute_scraping_and_building(config, args)
2322
+
2323
+ # Exit if dry run or aborted
2324
+ if converter is None:
2325
+ return
2326
+
2327
+ # Execute enhancement and print instructions
2328
+ execute_enhancement(config, args)
2329
+
2330
+
2331
+ if __name__ == "__main__":
2332
+ main()
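Finally, `main()` can be invoked from Python (for example in tests) by substituting `sys.argv`. A sketch assuming `configs/react.json` exists and `output/react_data/` already holds scraped pages.

```python
import sys
from unittest import mock

from skill_seekers.cli.doc_scraper import main  # import path assumed

argv = ["doc_scraper", "--config", "configs/react.json", "--skip-scrape"]
with mock.patch.object(sys, "argv", argv):
    main()   # builds the skill from the cached pages; exits non-zero if none are found
```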