@techwavedev/agi-agent-kit 1.1.3

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (196)
  1. package/CHANGELOG.md +59 -0
  2. package/README.md +147 -0
  3. package/bin/init.js +471 -0
  4. package/package.json +36 -0
  5. package/templates/.agent/agents/backend-specialist.md +263 -0
  6. package/templates/.agent/agents/code-archaeologist.md +106 -0
  7. package/templates/.agent/agents/database-architect.md +226 -0
  8. package/templates/.agent/agents/debugger.md +225 -0
  9. package/templates/.agent/agents/devops-engineer.md +242 -0
  10. package/templates/.agent/agents/documentation-writer.md +104 -0
  11. package/templates/.agent/agents/explorer-agent.md +73 -0
  12. package/templates/.agent/agents/frontend-specialist.md +556 -0
  13. package/templates/.agent/agents/game-developer.md +162 -0
  14. package/templates/.agent/agents/mobile-developer.md +377 -0
  15. package/templates/.agent/agents/orchestrator.md +416 -0
  16. package/templates/.agent/agents/penetration-tester.md +188 -0
  17. package/templates/.agent/agents/performance-optimizer.md +187 -0
  18. package/templates/.agent/agents/product-manager.md +112 -0
  19. package/templates/.agent/agents/project-planner.md +403 -0
  20. package/templates/.agent/agents/qa-automation-engineer.md +109 -0
  21. package/templates/.agent/agents/security-auditor.md +170 -0
  22. package/templates/.agent/agents/seo-specialist.md +111 -0
  23. package/templates/.agent/agents/test-engineer.md +158 -0
  24. package/templates/.agent/rules/GEMINI.md +253 -0
  25. package/templates/.agent/workflows/brainstorm.md +113 -0
  26. package/templates/.agent/workflows/create.md +59 -0
  27. package/templates/.agent/workflows/debug.md +103 -0
  28. package/templates/.agent/workflows/deploy.md +176 -0
  29. package/templates/.agent/workflows/enhance.md +63 -0
  30. package/templates/.agent/workflows/orchestrate.md +237 -0
  31. package/templates/.agent/workflows/plan.md +89 -0
  32. package/templates/.agent/workflows/preview.md +81 -0
  33. package/templates/.agent/workflows/status.md +86 -0
  34. package/templates/.agent/workflows/test.md +144 -0
  35. package/templates/.agent/workflows/ui-ux-pro-max.md +296 -0
  36. package/templates/base/.env.example +54 -0
  37. package/templates/base/AGENTS.md +463 -0
  38. package/templates/base/requirements.txt +6 -0
  39. package/templates/base/skill-creator/LICENSE.txt +202 -0
  40. package/templates/base/skill-creator/SKILL_skillcreator.md +389 -0
  41. package/templates/base/skill-creator/references/output-patterns.md +82 -0
  42. package/templates/base/skill-creator/references/workflows.md +28 -0
  43. package/templates/base/skill-creator/scripts/init_skill.py +304 -0
  44. package/templates/base/skill-creator/scripts/package_skill.py +110 -0
  45. package/templates/base/skill-creator/scripts/quick_validate.py +95 -0
  46. package/templates/base/skill-creator/scripts/update_catalog.py +371 -0
  47. package/templates/skills/core/README.md +21 -0
  48. package/templates/skills/core/documentation/SKILL.md +351 -0
  49. package/templates/skills/core/documentation/references/best_practices.md +201 -0
  50. package/templates/skills/core/documentation/scripts/analyze_code.py +307 -0
  51. package/templates/skills/core/documentation/scripts/detect_changes.py +460 -0
  52. package/templates/skills/core/documentation/scripts/generate_changelog.py +312 -0
  53. package/templates/skills/core/documentation/scripts/sync_docs.py +272 -0
  54. package/templates/skills/core/documentation/scripts/update_skill_docs.py +366 -0
  55. package/templates/skills/core/pdf-reader/SKILL.md +104 -0
  56. package/templates/skills/core/pdf-reader/references/pdf_libraries.md +83 -0
  57. package/templates/skills/core/pdf-reader/scripts/extract_text.py +295 -0
  58. package/templates/skills/core/qdrant-memory/SKILL.md +435 -0
  59. package/templates/skills/core/qdrant-memory/references/advanced_patterns.md +375 -0
  60. package/templates/skills/core/qdrant-memory/references/collection_schemas.md +229 -0
  61. package/templates/skills/core/qdrant-memory/references/complete_guide.md +724 -0
  62. package/templates/skills/core/qdrant-memory/references/embedding_models.md +325 -0
  63. package/templates/skills/core/qdrant-memory/scripts/benchmark_token_savings.py +640 -0
  64. package/templates/skills/core/qdrant-memory/scripts/embedding_utils.py +323 -0
  65. package/templates/skills/core/qdrant-memory/scripts/hybrid_search.py +214 -0
  66. package/templates/skills/core/qdrant-memory/scripts/init_collection.py +193 -0
  67. package/templates/skills/core/qdrant-memory/scripts/memory_retrieval.py +345 -0
  68. package/templates/skills/core/qdrant-memory/scripts/semantic_cache.py +282 -0
  69. package/templates/skills/core/qdrant-memory/scripts/test_skill.py +655 -0
  70. package/templates/skills/core/webcrawler/SKILL.md +292 -0
  71. package/templates/skills/core/webcrawler/references/advanced_crawling.md +181 -0
  72. package/templates/skills/core/webcrawler/scripts/crawl_docs.py +532 -0
  73. package/templates/skills/core/webcrawler/scripts/extract_page.py +189 -0
  74. package/templates/skills/core/webcrawler/scripts/filter_docs.py +200 -0
  75. package/templates/skills/knowledge/api-patterns/SKILL.md +81 -0
  76. package/templates/skills/knowledge/api-patterns/api-style.md +42 -0
  77. package/templates/skills/knowledge/api-patterns/auth.md +24 -0
  78. package/templates/skills/knowledge/api-patterns/documentation.md +26 -0
  79. package/templates/skills/knowledge/api-patterns/graphql.md +41 -0
  80. package/templates/skills/knowledge/api-patterns/rate-limiting.md +31 -0
  81. package/templates/skills/knowledge/api-patterns/response.md +37 -0
  82. package/templates/skills/knowledge/api-patterns/rest.md +40 -0
  83. package/templates/skills/knowledge/api-patterns/scripts/api_validator.py +211 -0
  84. package/templates/skills/knowledge/api-patterns/security-testing.md +122 -0
  85. package/templates/skills/knowledge/api-patterns/trpc.md +41 -0
  86. package/templates/skills/knowledge/api-patterns/versioning.md +22 -0
  87. package/templates/skills/knowledge/app-builder/SKILL.md +75 -0
  88. package/templates/skills/knowledge/app-builder/agent-coordination.md +71 -0
  89. package/templates/skills/knowledge/app-builder/feature-building.md +53 -0
  90. package/templates/skills/knowledge/app-builder/project-detection.md +34 -0
  91. package/templates/skills/knowledge/app-builder/scaffolding.md +118 -0
  92. package/templates/skills/knowledge/app-builder/tech-stack.md +40 -0
  93. package/templates/skills/knowledge/app-builder/templates/SKILL.md +39 -0
  94. package/templates/skills/knowledge/app-builder/templates/astro-static/TEMPLATE.md +76 -0
  95. package/templates/skills/knowledge/app-builder/templates/chrome-extension/TEMPLATE.md +92 -0
  96. package/templates/skills/knowledge/app-builder/templates/cli-tool/TEMPLATE.md +88 -0
  97. package/templates/skills/knowledge/app-builder/templates/electron-desktop/TEMPLATE.md +88 -0
  98. package/templates/skills/knowledge/app-builder/templates/express-api/TEMPLATE.md +83 -0
  99. package/templates/skills/knowledge/app-builder/templates/flutter-app/TEMPLATE.md +90 -0
  100. package/templates/skills/knowledge/app-builder/templates/monorepo-turborepo/TEMPLATE.md +90 -0
  101. package/templates/skills/knowledge/app-builder/templates/nextjs-fullstack/TEMPLATE.md +82 -0
  102. package/templates/skills/knowledge/app-builder/templates/nextjs-saas/TEMPLATE.md +100 -0
  103. package/templates/skills/knowledge/app-builder/templates/nextjs-static/TEMPLATE.md +106 -0
  104. package/templates/skills/knowledge/app-builder/templates/nuxt-app/TEMPLATE.md +101 -0
  105. package/templates/skills/knowledge/app-builder/templates/python-fastapi/TEMPLATE.md +83 -0
  106. package/templates/skills/knowledge/app-builder/templates/react-native-app/TEMPLATE.md +93 -0
  107. package/templates/skills/knowledge/architecture/SKILL.md +55 -0
  108. package/templates/skills/knowledge/architecture/context-discovery.md +43 -0
  109. package/templates/skills/knowledge/architecture/examples.md +94 -0
  110. package/templates/skills/knowledge/architecture/pattern-selection.md +68 -0
  111. package/templates/skills/knowledge/architecture/patterns-reference.md +50 -0
  112. package/templates/skills/knowledge/architecture/trade-off-analysis.md +77 -0
  113. package/templates/skills/knowledge/bash-linux/SKILL.md +199 -0
  114. package/templates/skills/knowledge/behavioral-modes/SKILL.md +242 -0
  115. package/templates/skills/knowledge/brainstorming/SKILL.md +163 -0
  116. package/templates/skills/knowledge/brainstorming/dynamic-questioning.md +350 -0
  117. package/templates/skills/knowledge/clean-code/SKILL.md +201 -0
  118. package/templates/skills/knowledge/code-review-checklist/SKILL.md +109 -0
  119. package/templates/skills/knowledge/database-design/SKILL.md +52 -0
  120. package/templates/skills/knowledge/database-design/database-selection.md +43 -0
  121. package/templates/skills/knowledge/database-design/indexing.md +39 -0
  122. package/templates/skills/knowledge/database-design/migrations.md +48 -0
  123. package/templates/skills/knowledge/database-design/optimization.md +36 -0
  124. package/templates/skills/knowledge/database-design/orm-selection.md +30 -0
  125. package/templates/skills/knowledge/database-design/schema-design.md +56 -0
  126. package/templates/skills/knowledge/database-design/scripts/schema_validator.py +172 -0
  127. package/templates/skills/knowledge/deployment-procedures/SKILL.md +241 -0
  128. package/templates/skills/knowledge/doc.md +177 -0
  129. package/templates/skills/knowledge/documentation-templates/SKILL.md +194 -0
  130. package/templates/skills/knowledge/frontend-design/SKILL.md +396 -0
  131. package/templates/skills/knowledge/frontend-design/animation-guide.md +331 -0
  132. package/templates/skills/knowledge/frontend-design/color-system.md +311 -0
  133. package/templates/skills/knowledge/frontend-design/decision-trees.md +418 -0
  134. package/templates/skills/knowledge/frontend-design/motion-graphics.md +306 -0
  135. package/templates/skills/knowledge/frontend-design/scripts/accessibility_checker.py +183 -0
  136. package/templates/skills/knowledge/frontend-design/scripts/ux_audit.py +722 -0
  137. package/templates/skills/knowledge/frontend-design/typography-system.md +345 -0
  138. package/templates/skills/knowledge/frontend-design/ux-psychology.md +541 -0
  139. package/templates/skills/knowledge/frontend-design/visual-effects.md +383 -0
  140. package/templates/skills/knowledge/game-development/2d-games/SKILL.md +119 -0
  141. package/templates/skills/knowledge/game-development/3d-games/SKILL.md +135 -0
  142. package/templates/skills/knowledge/game-development/SKILL.md +167 -0
  143. package/templates/skills/knowledge/game-development/game-art/SKILL.md +185 -0
  144. package/templates/skills/knowledge/game-development/game-audio/SKILL.md +190 -0
  145. package/templates/skills/knowledge/game-development/game-design/SKILL.md +129 -0
  146. package/templates/skills/knowledge/game-development/mobile-games/SKILL.md +108 -0
  147. package/templates/skills/knowledge/game-development/multiplayer/SKILL.md +132 -0
  148. package/templates/skills/knowledge/game-development/pc-games/SKILL.md +144 -0
  149. package/templates/skills/knowledge/game-development/vr-ar/SKILL.md +123 -0
  150. package/templates/skills/knowledge/game-development/web-games/SKILL.md +150 -0
  151. package/templates/skills/knowledge/geo-fundamentals/SKILL.md +156 -0
  152. package/templates/skills/knowledge/geo-fundamentals/scripts/geo_checker.py +289 -0
  153. package/templates/skills/knowledge/i18n-localization/SKILL.md +154 -0
  154. package/templates/skills/knowledge/i18n-localization/scripts/i18n_checker.py +241 -0
  155. package/templates/skills/knowledge/intelligent-routing/SKILL.md +334 -0
  156. package/templates/skills/knowledge/lint-and-validate/SKILL.md +45 -0
  157. package/templates/skills/knowledge/lint-and-validate/scripts/lint_runner.py +172 -0
  158. package/templates/skills/knowledge/lint-and-validate/scripts/type_coverage.py +173 -0
  159. package/templates/skills/knowledge/mcp-builder/SKILL.md +176 -0
  160. package/templates/skills/knowledge/mobile-design/SKILL.md +394 -0
  161. package/templates/skills/knowledge/mobile-design/decision-trees.md +516 -0
  162. package/templates/skills/knowledge/mobile-design/mobile-backend.md +491 -0
  163. package/templates/skills/knowledge/mobile-design/mobile-color-system.md +420 -0
  164. package/templates/skills/knowledge/mobile-design/mobile-debugging.md +122 -0
  165. package/templates/skills/knowledge/mobile-design/mobile-design-thinking.md +357 -0
  166. package/templates/skills/knowledge/mobile-design/mobile-navigation.md +458 -0
  167. package/templates/skills/knowledge/mobile-design/mobile-performance.md +767 -0
  168. package/templates/skills/knowledge/mobile-design/mobile-testing.md +356 -0
  169. package/templates/skills/knowledge/mobile-design/mobile-typography.md +433 -0
  170. package/templates/skills/knowledge/mobile-design/platform-android.md +666 -0
  171. package/templates/skills/knowledge/mobile-design/platform-ios.md +561 -0
  172. package/templates/skills/knowledge/mobile-design/scripts/mobile_audit.py +670 -0
  173. package/templates/skills/knowledge/mobile-design/touch-psychology.md +537 -0
  174. package/templates/skills/knowledge/nextjs-best-practices/SKILL.md +203 -0
  175. package/templates/skills/knowledge/nodejs-best-practices/SKILL.md +333 -0
  176. package/templates/skills/knowledge/parallel-agents/SKILL.md +175 -0
  177. package/templates/skills/knowledge/performance-profiling/SKILL.md +143 -0
  178. package/templates/skills/knowledge/performance-profiling/scripts/lighthouse_audit.py +76 -0
  179. package/templates/skills/knowledge/plan-writing/SKILL.md +152 -0
  180. package/templates/skills/knowledge/powershell-windows/SKILL.md +167 -0
  181. package/templates/skills/knowledge/python-patterns/SKILL.md +441 -0
  182. package/templates/skills/knowledge/react-patterns/SKILL.md +198 -0
  183. package/templates/skills/knowledge/red-team-tactics/SKILL.md +199 -0
  184. package/templates/skills/knowledge/seo-fundamentals/SKILL.md +129 -0
  185. package/templates/skills/knowledge/seo-fundamentals/scripts/seo_checker.py +219 -0
  186. package/templates/skills/knowledge/server-management/SKILL.md +161 -0
  187. package/templates/skills/knowledge/systematic-debugging/SKILL.md +109 -0
  188. package/templates/skills/knowledge/tailwind-patterns/SKILL.md +269 -0
  189. package/templates/skills/knowledge/tdd-workflow/SKILL.md +149 -0
  190. package/templates/skills/knowledge/testing-patterns/SKILL.md +178 -0
  191. package/templates/skills/knowledge/testing-patterns/scripts/test_runner.py +219 -0
  192. package/templates/skills/knowledge/vulnerability-scanner/SKILL.md +276 -0
  193. package/templates/skills/knowledge/vulnerability-scanner/checklists.md +121 -0
  194. package/templates/skills/knowledge/vulnerability-scanner/scripts/security_scan.py +458 -0
  195. package/templates/skills/knowledge/webapp-testing/SKILL.md +187 -0
  196. package/templates/skills/knowledge/webapp-testing/scripts/playwright_runner.py +173 -0
@@ -0,0 +1,532 @@
+ #!/usr/bin/env python3
+ """
+ Script: crawl_docs.py
+ Purpose: Recursively crawl documentation websites and extract content as markdown/JSON.
+
+ Usage:
+     python crawl_docs.py --url <base-url> --subject <topic> [options]
+
+ Arguments:
+     --url, -u        Starting URL (required)
+     --subject, -s    Subject focus for filtering (required)
+     --output, -o     Output directory (default: .tmp/crawled/)
+     --depth, -d      Max crawl depth (default: 2)
+     --filter, -f     URL path filter pattern (optional)
+     --delay          Delay between requests in seconds (default: 0.5)
+     --max-pages      Maximum pages to crawl (default: 100)
+     --same-domain    Stay within same domain (default: true)
+     --include-code   Preserve code blocks (default: true)
+     --format         Output format: md, json, or both (default: both)
+     --ignore-robots  Ignore robots.txt (default: false)
+     --verbose, -v    Verbose output
+
+ Exit Codes:
+     0 - Success
+     1 - Invalid arguments
+     2 - Network error
+     3 - No content found
+     4 - Processing error
+ """
+
+ import argparse
+ import json
+ import os
+ import re
+ import sys
+ import time
+ import hashlib
+ from datetime import datetime
+ from pathlib import Path
+ from urllib.parse import urljoin, urlparse, urlunparse
+ from urllib.robotparser import RobotFileParser
+
+ try:
+     import requests
+     from bs4 import BeautifulSoup
+     import html2text
+ except ImportError as e:
+     print(json.dumps({
+         "status": "error",
+         "message": f"Missing dependency: {e}. Install with: pip install requests beautifulsoup4 html2text lxml"
+     }), file=sys.stderr)
+     sys.exit(1)
+
+
+ class DocumentationCrawler:
+     """Intelligent documentation crawler with content extraction."""
+
+     def __init__(self, base_url: str, subject: str, config: dict):
+         self.base_url = base_url
+         self.subject = subject
+         self.config = config
+         self.visited = set()
+         self.pages = []
+         self.domain = urlparse(base_url).netloc
+         self.base_path = urlparse(base_url).path
+         self.robot_parser = None
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'DocumentationHarvester/1.0 (+https://github.com/techwavedev/agi)',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+         })
+
+         # Initialize the html2text converter
+         self.converter = html2text.HTML2Text()
+         self.converter.ignore_links = False
+         self.converter.ignore_images = False
+         self.converter.ignore_emphasis = False
+         self.converter.body_width = 0  # No wrapping
+         self.converter.unicode_snob = True
+         self.converter.skip_internal_links = False
+
+         # Load robots.txt unless explicitly ignored
+         if not config.get('ignore_robots', False):
+             self._load_robots_txt()
+
+     def _load_robots_txt(self):
+         """Load and parse robots.txt."""
+         try:
+             robots_url = f"{urlparse(self.base_url).scheme}://{self.domain}/robots.txt"
+             self.robot_parser = RobotFileParser()
+             self.robot_parser.set_url(robots_url)
+             self.robot_parser.read()
+         except Exception:
+             self.robot_parser = None
+
+     def _can_fetch(self, url: str) -> bool:
+         """Check whether robots.txt allows fetching this URL."""
+         if self.robot_parser is None:
+             return True
+         try:
+             return self.robot_parser.can_fetch('*', url)
+         except Exception:
+             return True
+
+     def _normalize_url(self, url: str) -> str:
+         """Normalize a URL to prevent duplicate crawling."""
+         parsed = urlparse(url)
+         # Drop the fragment and normalize the path
+         normalized = urlunparse((
+             parsed.scheme,
+             parsed.netloc.lower(),
+             parsed.path.rstrip('/') or '/',
+             '',  # params
+             parsed.query,
+             ''   # fragment
+         ))
+         return normalized
+
+     def _is_valid_url(self, url: str) -> bool:
+         """Check whether a URL should be crawled."""
+         parsed = urlparse(url)
+
+         # Must be HTTP(S)
+         if parsed.scheme not in ('http', 'https'):
+             return False
+
+         # Check domain restriction
+         if self.config.get('same_domain', True):
+             if parsed.netloc.lower() != self.domain.lower():
+                 return False
+
+         # Check path filter
+         path_filter = self.config.get('filter')
+         if path_filter and path_filter not in parsed.path:
+             return False
+
+         # Skip non-documentation links
+         skip_extensions = ('.pdf', '.zip', '.tar', '.gz', '.exe', '.dmg',
+                            '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico',
+                            '.css', '.js', '.woff', '.woff2', '.ttf')
+         if any(parsed.path.lower().endswith(ext) for ext in skip_extensions):
+             return False
+
+         return True
+
+     def _extract_links(self, soup: BeautifulSoup, current_url: str) -> list:
+         """Extract valid documentation links from a page."""
+         links = []
+         for anchor in soup.find_all('a', href=True):
+             href = anchor['href']
+             # Skip in-page anchors
+             if href.startswith('#'):
+                 continue
+             # Resolve relative URLs
+             absolute_url = urljoin(current_url, href)
+             normalized = self._normalize_url(absolute_url)
+             if self._is_valid_url(normalized) and normalized not in self.visited:
+                 links.append(normalized)
+         # Deduplicate while preserving discovery order (set() would randomize it)
+         return list(dict.fromkeys(links))
+
+     def _extract_main_content(self, soup: BeautifulSoup) -> BeautifulSoup:
+         """Extract the main content area, removing navigation/sidebars."""
+         # Try common content containers
+         content_selectors = [
+             'main',
+             'article',
+             '[role="main"]',
+             '.main-content',
+             '.content',
+             '.documentation',
+             '.docs-content',
+             '.markdown-body',
+             '#content',
+             '#main-content',
+             '.post-content',
+         ]
+
+         for selector in content_selectors:
+             content = soup.select_one(selector)
+             if content:
+                 return content
+
+         # Fallback: return the body after removing known non-content elements
+         body = soup.find('body')
+         if body:
+             for selector in ['nav', 'header', 'footer', 'aside', '.sidebar',
+                              '.navigation', '.nav', '.toc', '.menu']:
+                 for element in body.select(selector):
+                     element.decompose()
+             return body
+
+         return soup
+
+     def _preserve_code_blocks(self, soup: BeautifulSoup) -> None:
+         """Ensure code blocks are properly preserved."""
+         # Mark code blocks to prevent conversion issues
+         for pre in soup.find_all('pre'):
+             code = pre.find('code')
+             if code:
+                 # Try to get the language from the class list
+                 classes = code.get('class', [])
+                 lang = ''
+                 for cls in classes:
+                     if cls.startswith('language-') or cls.startswith('lang-'):
+                         lang = cls.split('-', 1)[1]
+                         break
+                 if lang:
+                     code['data-language'] = lang
+
+     def _html_to_markdown(self, soup: BeautifulSoup, url: str) -> str:
+         """Convert HTML content to clean markdown."""
+         # Preserve code blocks
+         if self.config.get('include_code', True):
+             self._preserve_code_blocks(soup)
+
+         # Convert to markdown
+         html_content = str(soup)
+         markdown = self.converter.handle(html_content)
+
+         # Collapse runs of three or more newlines
+         markdown = re.sub(r'\n{3,}', '\n\n', markdown)
+
+         # Add the source URL as front-matter metadata
+         header = f"---\nsource: {url}\nsubject: {self.subject}\ncrawled: {datetime.now().isoformat()}\n---\n\n"
+
+         return header + markdown.strip()
+
+     def _extract_title(self, soup: BeautifulSoup) -> str:
+         """Extract the page title."""
+         # Try h1 first
+         h1 = soup.find('h1')
+         if h1:
+             return h1.get_text(strip=True)
+         # Fall back to the <title> tag
+         title = soup.find('title')
+         if title:
+             text = title.get_text(strip=True)
+             # Remove common suffixes
+             for sep in [' |', ' -', ' ::']:
+                 if sep in text:
+                     text = text.split(sep)[0].strip()
+             return text
+         return 'Untitled'
+
+     def _is_relevant(self, content: str, title: str) -> bool:
+         """Check whether content is relevant to the subject."""
+         subject_lower = self.subject.lower()
+         subject_words = subject_lower.split()
+
+         text = (title + ' ' + content).lower()
+
+         # Relevant if any subject word appears in the content
+         for word in subject_words:
+             if len(word) > 2 and word in text:
+                 return True
+
+         return False
+
+     def _fetch_page(self, url: str) -> tuple:
+         """Fetch a page and return (soup, status)."""
+         try:
+             if not self._can_fetch(url):
+                 if self.config.get('verbose'):
+                     print(f" ⛔ Blocked by robots.txt: {url}")
+                 return None, 'robots_blocked'
+
+             response = self.session.get(url, timeout=30)
+             response.raise_for_status()
+
+             # Check content type
+             content_type = response.headers.get('Content-Type', '')
+             if 'text/html' not in content_type and 'application/xhtml' not in content_type:
+                 return None, 'not_html'
+
+             soup = BeautifulSoup(response.content, 'lxml')
+             return soup, 'ok'
+
+         except requests.exceptions.RequestException as e:
+             if self.config.get('verbose'):
+                 print(f" ❌ Error fetching {url}: {e}")
+             return None, 'error'
+
+     def _url_to_filename(self, url: str) -> str:
+         """Convert a URL to a safe filename."""
+         parsed = urlparse(url)
+         path = parsed.path.strip('/').replace('/', '_') or 'index'
+         # Sanitize
+         path = re.sub(r'[^\w\-_.]', '_', path)
+         # Limit length
+         if len(path) > 100:
+             path = path[:80] + '_' + hashlib.md5(path.encode()).hexdigest()[:8]
+         return path + '.md'
+
+     def crawl(self) -> dict:
+         """Execute the crawl and return results."""
+         max_depth = self.config.get('depth', 2)
+         max_pages = self.config.get('max_pages', 100)
+         delay = self.config.get('delay', 0.5)
+         verbose = self.config.get('verbose', False)
+
+         # Queue of (url, depth) pairs, processed breadth-first
+         queue = [(self._normalize_url(self.base_url), 0)]
+
+         print(f"🕷️ Starting crawl: {self.base_url}")
+         print(f" Subject: {self.subject}")
+         print(f" Max depth: {max_depth}, Max pages: {max_pages}")
+         print()
+
+         while queue and len(self.pages) < max_pages:
+             url, depth = queue.pop(0)
+
+             if url in self.visited:
+                 continue
+
+             self.visited.add(url)
+
+             if verbose:
+                 print(f" 📄 [{depth}] {url}")
+
+             # Fetch page
+             soup, status = self._fetch_page(url)
+
+             if soup is None:
+                 continue
+
+             # Extract content
+             title = self._extract_title(soup)
+             main_content = self._extract_main_content(soup)
+             markdown = self._html_to_markdown(main_content, url)
+
+             # Check relevance
+             if not self._is_relevant(markdown, title):
+                 if verbose:
+                     print(" ↳ Skipped (not relevant)")
+                 continue
+
+             # Store page
+             page_data = {
+                 'url': url,
+                 'title': title,
+                 'depth': depth,
+                 'content': markdown,
+                 'filename': self._url_to_filename(url),
+                 'word_count': len(markdown.split()),
+             }
+             self.pages.append(page_data)
+
+             if verbose:
+                 print(f" ↳ ✅ {title} ({page_data['word_count']} words)")
+
+             # Extract and queue links if not at max depth
+             if depth < max_depth:
+                 links = self._extract_links(soup, url)
+                 for link in links:
+                     if link not in self.visited:
+                         queue.append((link, depth + 1))
+
+             # Polite delay
+             time.sleep(delay)
+
+         print()
+         print(f"✅ Crawl complete: {len(self.pages)} pages harvested")
+
+         return {
+             'base_url': self.base_url,
+             'subject': self.subject,
+             'pages_crawled': len(self.visited),
+             'pages_harvested': len(self.pages),
+             'pages': self.pages,
+             'timestamp': datetime.now().isoformat(),
+         }
+
+     def save(self, output_dir: str, output_format: str = 'both') -> dict:
+         """Save crawled content to disk."""
+         output_path = Path(output_dir)
+         output_path.mkdir(parents=True, exist_ok=True)
+         pages_path = output_path / 'pages'
+         pages_path.mkdir(exist_ok=True)
+
+         # Save individual markdown files
+         if output_format in ('md', 'both'):
+             for page in self.pages:
+                 filepath = pages_path / page['filename']
+                 filepath.write_text(page['content'], encoding='utf-8')
+
+             # Generate index
+             index_content = self._generate_index()
+             (output_path / 'index.md').write_text(index_content, encoding='utf-8')
+
+         # Save metadata
+         metadata = {
+             'base_url': self.base_url,
+             'subject': self.subject,
+             'pages_crawled': len(self.visited),
+             'pages_harvested': len(self.pages),
+             'timestamp': datetime.now().isoformat(),
+             'config': self.config,
+             'pages': [{k: v for k, v in p.items() if k != 'content'} for p in self.pages]
+         }
+         (output_path / 'metadata.json').write_text(
+             json.dumps(metadata, indent=2), encoding='utf-8'
+         )
+
+         # Save JSON content
+         if output_format in ('json', 'both'):
+             content_data = {
+                 'subject': self.subject,
+                 'base_url': self.base_url,
+                 'timestamp': datetime.now().isoformat(),
+                 'pages': [{
+                     'url': p['url'],
+                     'title': p['title'],
+                     'content': p['content'],
+                     'word_count': p['word_count'],
+                 } for p in self.pages]
+             }
+             (output_path / 'content.json').write_text(
+                 json.dumps(content_data, indent=2, ensure_ascii=False), encoding='utf-8'
+             )
+
+         return {
+             'output_dir': str(output_path),
+             'files_created': {
+                 'index': str(output_path / 'index.md'),
+                 'metadata': str(output_path / 'metadata.json'),
+                 'pages_dir': str(pages_path),
+                 'page_count': len(self.pages),
+             }
+         }
+
+     def _generate_index(self) -> str:
+         """Generate a master index file."""
+         lines = [
+             f"# {self.subject} Documentation",
+             "",
+             f"> Crawled from: [{self.base_url}]({self.base_url})",
+             f"> Pages: {len(self.pages)}",
+             f"> Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+             "",
+             "## Table of Contents",
+             "",
+         ]
+
+         # Group by depth for visual hierarchy
+         for page in sorted(self.pages, key=lambda p: (p['depth'], p['title'])):
+             indent = "  " * page['depth']
+             lines.append(f"{indent}- [{page['title']}](pages/{page['filename']})")
+
+         lines.extend([
+             "",
+             "---",
+             "",
+             "*Generated by Documentation Webcrawler*",
+         ])
+
+         return '\n'.join(lines)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description='Crawl documentation websites and extract content.',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog=__doc__
+     )
+
+     parser.add_argument('--url', '-u', required=True, help='Starting URL')
+     parser.add_argument('--subject', '-s', required=True, help='Subject focus for filtering')
+     parser.add_argument('--output', '-o', default='.tmp/crawled/', help='Output directory')
+     parser.add_argument('--depth', '-d', type=int, default=2, help='Max crawl depth')
+     parser.add_argument('--filter', '-f', help='URL path filter pattern')
+     parser.add_argument('--delay', type=float, default=0.5, help='Delay between requests')
+     parser.add_argument('--max-pages', type=int, default=100, help='Maximum pages to crawl')
+     # BooleanOptionalAction (Python 3.9+) also provides --no-same-domain and
+     # --no-include-code, so these default-on options can actually be disabled.
+     parser.add_argument('--same-domain', action=argparse.BooleanOptionalAction, default=True, help='Stay within same domain')
+     parser.add_argument('--include-code', action=argparse.BooleanOptionalAction, default=True, help='Preserve code blocks')
+     parser.add_argument('--format', choices=['md', 'json', 'both'], default='both', help='Output format')
+     parser.add_argument('--ignore-robots', action='store_true', help='Ignore robots.txt')
+     parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
+
+     args = parser.parse_args()
+
+     config = {
+         'depth': args.depth,
+         'filter': args.filter,
+         'delay': args.delay,
+         'max_pages': args.max_pages,
+         'same_domain': args.same_domain,
+         'include_code': args.include_code,
+         'ignore_robots': args.ignore_robots,
+         'verbose': args.verbose,
+     }
+
+     try:
+         crawler = DocumentationCrawler(args.url, args.subject, config)
+         results = crawler.crawl()
+
+         if not results['pages']:
+             print(json.dumps({
+                 "status": "error",
+                 "message": "No relevant pages found"
+             }), file=sys.stderr)
+             sys.exit(3)
+
+         save_result = crawler.save(args.output, args.format)
+
+         print()
+         print(json.dumps({
+             "status": "success",
+             "pages_harvested": results['pages_harvested'],
+             "output": save_result
+         }, indent=2))
+         sys.exit(0)
+
+     except requests.exceptions.RequestException as e:
+         print(json.dumps({
+             "status": "error",
+             "type": "network_error",
+             "message": str(e)
+         }), file=sys.stderr)
+         sys.exit(2)
+
+     except Exception as e:
+         print(json.dumps({
+             "status": "error",
+             "type": type(e).__name__,
+             "message": str(e)
+         }), file=sys.stderr)
+         sys.exit(4)
+
+
+ if __name__ == '__main__':
+     main()
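
For reference, the class in this file can also be driven programmatically rather than through the CLI. The following is a minimal sketch, not part of the package: it assumes the script is importable as crawl_docs from the working directory, that requests, beautifulsoup4, html2text, and lxml are installed, and the URL, subject, and output directory are placeholder values.

# Minimal programmatic usage sketch (assumptions: crawl_docs.py is on sys.path;
# the URL/subject/output values below are examples, not defaults of the package).
from crawl_docs import DocumentationCrawler

config = {
    'depth': 1,            # follow links one level below the start page
    'filter': None,        # no URL path filter
    'delay': 0.5,          # polite delay between requests, in seconds
    'max_pages': 10,       # stop after ten harvested pages
    'same_domain': True,   # do not leave the starting domain
    'include_code': True,  # keep code blocks in the markdown output
    'ignore_robots': False,
    'verbose': True,
}

crawler = DocumentationCrawler('https://example.com/docs/', 'example topic', config)
results = crawler.crawl()  # dict with 'pages', 'pages_crawled', 'pages_harvested', ...
if results['pages']:
    summary = crawler.save('.tmp/crawled/', 'both')  # writes pages/, index.md, metadata.json, content.json
    print(summary['files_created']['page_count'], 'pages written')

Because config values are read with config.get(...) and sensible defaults, a partial config dict works too; the full set of keys above mirrors what main() builds from the CLI arguments.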