@techwavedev/agi-agent-kit 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. package/CHANGELOG.md +59 -0
  2. package/README.md +147 -0
  3. package/bin/init.js +471 -0
  4. package/package.json +36 -0
  5. package/templates/.agent/agents/backend-specialist.md +263 -0
  6. package/templates/.agent/agents/code-archaeologist.md +106 -0
  7. package/templates/.agent/agents/database-architect.md +226 -0
  8. package/templates/.agent/agents/debugger.md +225 -0
  9. package/templates/.agent/agents/devops-engineer.md +242 -0
  10. package/templates/.agent/agents/documentation-writer.md +104 -0
  11. package/templates/.agent/agents/explorer-agent.md +73 -0
  12. package/templates/.agent/agents/frontend-specialist.md +556 -0
  13. package/templates/.agent/agents/game-developer.md +162 -0
  14. package/templates/.agent/agents/mobile-developer.md +377 -0
  15. package/templates/.agent/agents/orchestrator.md +416 -0
  16. package/templates/.agent/agents/penetration-tester.md +188 -0
  17. package/templates/.agent/agents/performance-optimizer.md +187 -0
  18. package/templates/.agent/agents/product-manager.md +112 -0
  19. package/templates/.agent/agents/project-planner.md +403 -0
  20. package/templates/.agent/agents/qa-automation-engineer.md +109 -0
  21. package/templates/.agent/agents/security-auditor.md +170 -0
  22. package/templates/.agent/agents/seo-specialist.md +111 -0
  23. package/templates/.agent/agents/test-engineer.md +158 -0
  24. package/templates/.agent/rules/GEMINI.md +253 -0
  25. package/templates/.agent/workflows/brainstorm.md +113 -0
  26. package/templates/.agent/workflows/create.md +59 -0
  27. package/templates/.agent/workflows/debug.md +103 -0
  28. package/templates/.agent/workflows/deploy.md +176 -0
  29. package/templates/.agent/workflows/enhance.md +63 -0
  30. package/templates/.agent/workflows/orchestrate.md +237 -0
  31. package/templates/.agent/workflows/plan.md +89 -0
  32. package/templates/.agent/workflows/preview.md +81 -0
  33. package/templates/.agent/workflows/status.md +86 -0
  34. package/templates/.agent/workflows/test.md +144 -0
  35. package/templates/.agent/workflows/ui-ux-pro-max.md +296 -0
  36. package/templates/base/.env.example +54 -0
  37. package/templates/base/AGENTS.md +463 -0
  38. package/templates/base/requirements.txt +6 -0
  39. package/templates/base/skill-creator/LICENSE.txt +202 -0
  40. package/templates/base/skill-creator/SKILL_skillcreator.md +389 -0
  41. package/templates/base/skill-creator/references/output-patterns.md +82 -0
  42. package/templates/base/skill-creator/references/workflows.md +28 -0
  43. package/templates/base/skill-creator/scripts/init_skill.py +304 -0
  44. package/templates/base/skill-creator/scripts/package_skill.py +110 -0
  45. package/templates/base/skill-creator/scripts/quick_validate.py +95 -0
  46. package/templates/base/skill-creator/scripts/update_catalog.py +371 -0
  47. package/templates/skills/core/README.md +21 -0
  48. package/templates/skills/core/documentation/SKILL.md +351 -0
  49. package/templates/skills/core/documentation/references/best_practices.md +201 -0
  50. package/templates/skills/core/documentation/scripts/analyze_code.py +307 -0
  51. package/templates/skills/core/documentation/scripts/detect_changes.py +460 -0
  52. package/templates/skills/core/documentation/scripts/generate_changelog.py +312 -0
  53. package/templates/skills/core/documentation/scripts/sync_docs.py +272 -0
  54. package/templates/skills/core/documentation/scripts/update_skill_docs.py +366 -0
  55. package/templates/skills/core/pdf-reader/SKILL.md +104 -0
  56. package/templates/skills/core/pdf-reader/references/pdf_libraries.md +83 -0
  57. package/templates/skills/core/pdf-reader/scripts/extract_text.py +295 -0
  58. package/templates/skills/core/qdrant-memory/SKILL.md +435 -0
  59. package/templates/skills/core/qdrant-memory/references/advanced_patterns.md +375 -0
  60. package/templates/skills/core/qdrant-memory/references/collection_schemas.md +229 -0
  61. package/templates/skills/core/qdrant-memory/references/complete_guide.md +724 -0
  62. package/templates/skills/core/qdrant-memory/references/embedding_models.md +325 -0
  63. package/templates/skills/core/qdrant-memory/scripts/benchmark_token_savings.py +640 -0
  64. package/templates/skills/core/qdrant-memory/scripts/embedding_utils.py +323 -0
  65. package/templates/skills/core/qdrant-memory/scripts/hybrid_search.py +214 -0
  66. package/templates/skills/core/qdrant-memory/scripts/init_collection.py +193 -0
  67. package/templates/skills/core/qdrant-memory/scripts/memory_retrieval.py +345 -0
  68. package/templates/skills/core/qdrant-memory/scripts/semantic_cache.py +282 -0
  69. package/templates/skills/core/qdrant-memory/scripts/test_skill.py +655 -0
  70. package/templates/skills/core/webcrawler/SKILL.md +292 -0
  71. package/templates/skills/core/webcrawler/references/advanced_crawling.md +181 -0
  72. package/templates/skills/core/webcrawler/scripts/crawl_docs.py +532 -0
  73. package/templates/skills/core/webcrawler/scripts/extract_page.py +189 -0
  74. package/templates/skills/core/webcrawler/scripts/filter_docs.py +200 -0
  75. package/templates/skills/knowledge/api-patterns/SKILL.md +81 -0
  76. package/templates/skills/knowledge/api-patterns/api-style.md +42 -0
  77. package/templates/skills/knowledge/api-patterns/auth.md +24 -0
  78. package/templates/skills/knowledge/api-patterns/documentation.md +26 -0
  79. package/templates/skills/knowledge/api-patterns/graphql.md +41 -0
  80. package/templates/skills/knowledge/api-patterns/rate-limiting.md +31 -0
  81. package/templates/skills/knowledge/api-patterns/response.md +37 -0
  82. package/templates/skills/knowledge/api-patterns/rest.md +40 -0
  83. package/templates/skills/knowledge/api-patterns/scripts/api_validator.py +211 -0
  84. package/templates/skills/knowledge/api-patterns/security-testing.md +122 -0
  85. package/templates/skills/knowledge/api-patterns/trpc.md +41 -0
  86. package/templates/skills/knowledge/api-patterns/versioning.md +22 -0
  87. package/templates/skills/knowledge/app-builder/SKILL.md +75 -0
  88. package/templates/skills/knowledge/app-builder/agent-coordination.md +71 -0
  89. package/templates/skills/knowledge/app-builder/feature-building.md +53 -0
  90. package/templates/skills/knowledge/app-builder/project-detection.md +34 -0
  91. package/templates/skills/knowledge/app-builder/scaffolding.md +118 -0
  92. package/templates/skills/knowledge/app-builder/tech-stack.md +40 -0
  93. package/templates/skills/knowledge/app-builder/templates/SKILL.md +39 -0
  94. package/templates/skills/knowledge/app-builder/templates/astro-static/TEMPLATE.md +76 -0
  95. package/templates/skills/knowledge/app-builder/templates/chrome-extension/TEMPLATE.md +92 -0
  96. package/templates/skills/knowledge/app-builder/templates/cli-tool/TEMPLATE.md +88 -0
  97. package/templates/skills/knowledge/app-builder/templates/electron-desktop/TEMPLATE.md +88 -0
  98. package/templates/skills/knowledge/app-builder/templates/express-api/TEMPLATE.md +83 -0
  99. package/templates/skills/knowledge/app-builder/templates/flutter-app/TEMPLATE.md +90 -0
  100. package/templates/skills/knowledge/app-builder/templates/monorepo-turborepo/TEMPLATE.md +90 -0
  101. package/templates/skills/knowledge/app-builder/templates/nextjs-fullstack/TEMPLATE.md +82 -0
  102. package/templates/skills/knowledge/app-builder/templates/nextjs-saas/TEMPLATE.md +100 -0
  103. package/templates/skills/knowledge/app-builder/templates/nextjs-static/TEMPLATE.md +106 -0
  104. package/templates/skills/knowledge/app-builder/templates/nuxt-app/TEMPLATE.md +101 -0
  105. package/templates/skills/knowledge/app-builder/templates/python-fastapi/TEMPLATE.md +83 -0
  106. package/templates/skills/knowledge/app-builder/templates/react-native-app/TEMPLATE.md +93 -0
  107. package/templates/skills/knowledge/architecture/SKILL.md +55 -0
  108. package/templates/skills/knowledge/architecture/context-discovery.md +43 -0
  109. package/templates/skills/knowledge/architecture/examples.md +94 -0
  110. package/templates/skills/knowledge/architecture/pattern-selection.md +68 -0
  111. package/templates/skills/knowledge/architecture/patterns-reference.md +50 -0
  112. package/templates/skills/knowledge/architecture/trade-off-analysis.md +77 -0
  113. package/templates/skills/knowledge/bash-linux/SKILL.md +199 -0
  114. package/templates/skills/knowledge/behavioral-modes/SKILL.md +242 -0
  115. package/templates/skills/knowledge/brainstorming/SKILL.md +163 -0
  116. package/templates/skills/knowledge/brainstorming/dynamic-questioning.md +350 -0
  117. package/templates/skills/knowledge/clean-code/SKILL.md +201 -0
  118. package/templates/skills/knowledge/code-review-checklist/SKILL.md +109 -0
  119. package/templates/skills/knowledge/database-design/SKILL.md +52 -0
  120. package/templates/skills/knowledge/database-design/database-selection.md +43 -0
  121. package/templates/skills/knowledge/database-design/indexing.md +39 -0
  122. package/templates/skills/knowledge/database-design/migrations.md +48 -0
  123. package/templates/skills/knowledge/database-design/optimization.md +36 -0
  124. package/templates/skills/knowledge/database-design/orm-selection.md +30 -0
  125. package/templates/skills/knowledge/database-design/schema-design.md +56 -0
  126. package/templates/skills/knowledge/database-design/scripts/schema_validator.py +172 -0
  127. package/templates/skills/knowledge/deployment-procedures/SKILL.md +241 -0
  128. package/templates/skills/knowledge/doc.md +177 -0
  129. package/templates/skills/knowledge/documentation-templates/SKILL.md +194 -0
  130. package/templates/skills/knowledge/frontend-design/SKILL.md +396 -0
  131. package/templates/skills/knowledge/frontend-design/animation-guide.md +331 -0
  132. package/templates/skills/knowledge/frontend-design/color-system.md +311 -0
  133. package/templates/skills/knowledge/frontend-design/decision-trees.md +418 -0
  134. package/templates/skills/knowledge/frontend-design/motion-graphics.md +306 -0
  135. package/templates/skills/knowledge/frontend-design/scripts/accessibility_checker.py +183 -0
  136. package/templates/skills/knowledge/frontend-design/scripts/ux_audit.py +722 -0
  137. package/templates/skills/knowledge/frontend-design/typography-system.md +345 -0
  138. package/templates/skills/knowledge/frontend-design/ux-psychology.md +541 -0
  139. package/templates/skills/knowledge/frontend-design/visual-effects.md +383 -0
  140. package/templates/skills/knowledge/game-development/2d-games/SKILL.md +119 -0
  141. package/templates/skills/knowledge/game-development/3d-games/SKILL.md +135 -0
  142. package/templates/skills/knowledge/game-development/SKILL.md +167 -0
  143. package/templates/skills/knowledge/game-development/game-art/SKILL.md +185 -0
  144. package/templates/skills/knowledge/game-development/game-audio/SKILL.md +190 -0
  145. package/templates/skills/knowledge/game-development/game-design/SKILL.md +129 -0
  146. package/templates/skills/knowledge/game-development/mobile-games/SKILL.md +108 -0
  147. package/templates/skills/knowledge/game-development/multiplayer/SKILL.md +132 -0
  148. package/templates/skills/knowledge/game-development/pc-games/SKILL.md +144 -0
  149. package/templates/skills/knowledge/game-development/vr-ar/SKILL.md +123 -0
  150. package/templates/skills/knowledge/game-development/web-games/SKILL.md +150 -0
  151. package/templates/skills/knowledge/geo-fundamentals/SKILL.md +156 -0
  152. package/templates/skills/knowledge/geo-fundamentals/scripts/geo_checker.py +289 -0
  153. package/templates/skills/knowledge/i18n-localization/SKILL.md +154 -0
  154. package/templates/skills/knowledge/i18n-localization/scripts/i18n_checker.py +241 -0
  155. package/templates/skills/knowledge/intelligent-routing/SKILL.md +334 -0
  156. package/templates/skills/knowledge/lint-and-validate/SKILL.md +45 -0
  157. package/templates/skills/knowledge/lint-and-validate/scripts/lint_runner.py +172 -0
  158. package/templates/skills/knowledge/lint-and-validate/scripts/type_coverage.py +173 -0
  159. package/templates/skills/knowledge/mcp-builder/SKILL.md +176 -0
  160. package/templates/skills/knowledge/mobile-design/SKILL.md +394 -0
  161. package/templates/skills/knowledge/mobile-design/decision-trees.md +516 -0
  162. package/templates/skills/knowledge/mobile-design/mobile-backend.md +491 -0
  163. package/templates/skills/knowledge/mobile-design/mobile-color-system.md +420 -0
  164. package/templates/skills/knowledge/mobile-design/mobile-debugging.md +122 -0
  165. package/templates/skills/knowledge/mobile-design/mobile-design-thinking.md +357 -0
  166. package/templates/skills/knowledge/mobile-design/mobile-navigation.md +458 -0
  167. package/templates/skills/knowledge/mobile-design/mobile-performance.md +767 -0
  168. package/templates/skills/knowledge/mobile-design/mobile-testing.md +356 -0
  169. package/templates/skills/knowledge/mobile-design/mobile-typography.md +433 -0
  170. package/templates/skills/knowledge/mobile-design/platform-android.md +666 -0
  171. package/templates/skills/knowledge/mobile-design/platform-ios.md +561 -0
  172. package/templates/skills/knowledge/mobile-design/scripts/mobile_audit.py +670 -0
  173. package/templates/skills/knowledge/mobile-design/touch-psychology.md +537 -0
  174. package/templates/skills/knowledge/nextjs-best-practices/SKILL.md +203 -0
  175. package/templates/skills/knowledge/nodejs-best-practices/SKILL.md +333 -0
  176. package/templates/skills/knowledge/parallel-agents/SKILL.md +175 -0
  177. package/templates/skills/knowledge/performance-profiling/SKILL.md +143 -0
  178. package/templates/skills/knowledge/performance-profiling/scripts/lighthouse_audit.py +76 -0
  179. package/templates/skills/knowledge/plan-writing/SKILL.md +152 -0
  180. package/templates/skills/knowledge/powershell-windows/SKILL.md +167 -0
  181. package/templates/skills/knowledge/python-patterns/SKILL.md +441 -0
  182. package/templates/skills/knowledge/react-patterns/SKILL.md +198 -0
  183. package/templates/skills/knowledge/red-team-tactics/SKILL.md +199 -0
  184. package/templates/skills/knowledge/seo-fundamentals/SKILL.md +129 -0
  185. package/templates/skills/knowledge/seo-fundamentals/scripts/seo_checker.py +219 -0
  186. package/templates/skills/knowledge/server-management/SKILL.md +161 -0
  187. package/templates/skills/knowledge/systematic-debugging/SKILL.md +109 -0
  188. package/templates/skills/knowledge/tailwind-patterns/SKILL.md +269 -0
  189. package/templates/skills/knowledge/tdd-workflow/SKILL.md +149 -0
  190. package/templates/skills/knowledge/testing-patterns/SKILL.md +178 -0
  191. package/templates/skills/knowledge/testing-patterns/scripts/test_runner.py +219 -0
  192. package/templates/skills/knowledge/vulnerability-scanner/SKILL.md +276 -0
  193. package/templates/skills/knowledge/vulnerability-scanner/checklists.md +121 -0
  194. package/templates/skills/knowledge/vulnerability-scanner/scripts/security_scan.py +458 -0
  195. package/templates/skills/knowledge/webapp-testing/SKILL.md +187 -0
  196. package/templates/skills/knowledge/webapp-testing/scripts/playwright_runner.py +173 -0
@@ -0,0 +1,292 @@
1
+ ---
2
+ name: webcrawler
3
+ description: "Documentation harvesting agent for crawling and extracting content from documentation websites. Use for crawling documentation sites and extracting all pages about a subject, building offline knowledge bases from online docs, harvesting API references, tutorials, or guides from documentation portals, creating structured markdown exports from multi-page documentation, and downloading and organizing technical docs for embedding or RAG pipelines. Supports recursive crawling with depth control, content filtering, and structured output."
4
+ ---
5
+
6
+ # Webcrawler Skill
7
+
8
+ Intelligent documentation harvesting agent that recursively crawls documentation websites and extracts structured content about specific subjects.
9
+
10
+ > **Last Updated:** 2026-01-23
11
+
12
+ ---
13
+
14
+ ## Quick Start
15
+
16
+ ```bash
17
+ # Crawl Python documentation about async/await
18
+ python skills/webcrawler/scripts/crawl_docs.py \
19
+ --url "https://docs.python.org/3/library/asyncio.html" \
20
+ --subject "asyncio" \
21
+ --depth 2 \
22
+ --output .tmp/docs/python-asyncio/
23
+
24
+ # Crawl React documentation
25
+ python skills/webcrawler/scripts/crawl_docs.py \
26
+ --url "https://react.dev/" \
27
+ --subject "React" \
28
+ --depth 3 \
29
+ --output .tmp/docs/react/
30
+
31
+ # Extract only API reference pages
32
+ python skills/webcrawler/scripts/crawl_docs.py \
33
+ --url "https://expressjs.com/en/4x/api.html" \
34
+ --subject "Express API" \
35
+ --filter "api" \
36
+ --output .tmp/docs/express-api/
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Core Workflow
42
+
43
+ 1. **Initialize Crawl** — Provide base URL and subject focus
44
+ 2. **Discover Pages** — Recursively find all linked documentation pages
45
+ 3. **Filter Content** — Keep only pages matching the subject criteria
46
+ 4. **Extract Content** — Convert HTML to clean markdown
47
+ 5. **Organize Output** — Structure files in a navigable hierarchy
48
+ 6. **Generate Index** — Create a master index with all harvested pages
49
+
50
+ ---
51
+
52
+ ## Scripts
53
+
54
+ ### `crawl_docs.py` — Main Documentation Crawler
55
+
56
+ The primary crawling script that handles recursive page discovery and content extraction.
57
+
58
+ ```bash
59
+ python skills/webcrawler/scripts/crawl_docs.py \
60
+ --url <base-url> # Starting URL (required)
61
+ --subject <topic> # Subject focus for filtering (required)
62
+ --output <directory> # Output directory (default: .tmp/crawled/)
63
+ --depth <n> # Max crawl depth (default: 2)
64
+ --filter <pattern> # URL path filter pattern (optional)
65
+ --delay <seconds> # Delay between requests (default: 0.5)
66
+ --max-pages <n> # Maximum pages to crawl (default: 100)
67
+ --same-domain # Stay within same domain (default: true)
68
+ --include-code # Preserve code blocks (default: true)
69
+ --format <md|json|both> # Output format (default: both)
70
+ ```
71
+
72
+ **Outputs:**
73
+
74
+ - `index.md` — Master index with links to all pages
75
+ - `pages/*.md` — Individual markdown files per page
76
+ - `metadata.json` — Crawl metadata and page inventory
77
+ - `content.json` — Structured JSON with all extracted content
78
+
79
+ ### `extract_page.py` — Single Page Extractor
80
+
81
+ Extract content from a single documentation page.
82
+
83
+ ```bash
84
+ python skills/webcrawler/scripts/extract_page.py \
85
+ --url <page-url> # Page to extract (required)
86
+ --output <file> # Output file (default: stdout)
87
+ --format <md|json> # Output format (default: md)
88
+ --include-links # Include internal links (default: true)
89
+ ```
90
+
91
+ ### `filter_docs.py` — Post-Crawl Filtering
92
+
93
+ Filter already-crawled documentation by subject or pattern.
94
+
95
+ ```bash
96
+ python skills/webcrawler/scripts/filter_docs.py \
97
+ --input <crawl-dir> # Crawled docs directory (required)
98
+ --subject <topic> # Subject to filter for (required)
99
+ --output <directory> # Filtered output directory (required)
100
+ --threshold <0.0-1.0> # Relevance threshold (default: 0.3)
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Configuration
106
+
107
+ ### Rate Limiting & Politeness
108
+
109
+ The crawler respects `robots.txt` and implements polite crawling:
110
+
111
+ - **Default delay**: 0.5s between requests
112
+ - **User-Agent**: Identifies as documentation harvester
113
+ - **robots.txt**: Honored by default (disable with `--ignore-robots`)
114
+
115
+ ### Domain Handling
116
+
117
+ | Mode | Behavior |
118
+ | -------------------- | -------------------------------------------- |
119
+ | `--same-domain` | Only crawl pages on the starting domain |
120
+ | `--same-path` | Only crawl pages under the starting URL path |
121
+ | `--allow-subdomains` | Include subdomains (e.g., api.example.com) |
122
+
123
+ ### Content Extraction
124
+
125
+ The crawler uses intelligent content extraction:
126
+
127
+ 1. **Main content detection** — Finds `<main>`, `<article>`, or content containers
128
+ 2. **Navigation removal** — Strips headers, footers, sidebars
129
+ 3. **Code preservation** — Maintains code blocks with language hints
130
+ 4. **Link normalization** — Converts relative links to absolute
131
+ 5. **Image handling** — Optionally downloads and references images
132
+
133
+ ---
134
+
135
+ ## Output Structure
136
+
137
+ ```
138
+ .tmp/docs/<subject>/
139
+ ├── index.md # Master index with TOC
140
+ ├── metadata.json # Crawl metadata
141
+ ├── content.json # Structured JSON export
142
+ └── pages/
143
+ ├── getting-started.md
144
+ ├── installation.md
145
+ ├── api-reference.md
146
+ ├── configuration/
147
+ │ ├── basic.md
148
+ │ └── advanced.md
149
+ └── troubleshooting.md
150
+ ```
151
+
152
+ ### Index Format
153
+
154
+ ```markdown
155
+ # <Subject> Documentation
156
+
157
+ > Crawled from: <base-url>
158
+ > Pages: <count>
159
+ > Date: <timestamp>
160
+
161
+ ## Table of Contents
162
+
163
+ - [Getting Started](pages/getting-started.md)
164
+ - [Installation](pages/installation.md)
165
+ - [API Reference](pages/api-reference.md)
166
+ - Configuration
167
+ - [Basic](pages/configuration/basic.md)
168
+ - [Advanced](pages/configuration/advanced.md)
169
+ - [Troubleshooting](pages/troubleshooting.md)
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Common Workflows
175
+
176
+ ### 1. Harvest API Documentation
177
+
178
+ ```bash
179
+ # Crawl API docs with deep recursion
180
+ python skills/webcrawler/scripts/crawl_docs.py \
181
+ --url "https://api.example.com/docs" \
182
+ --subject "Example API" \
183
+ --depth 4 \
184
+ --filter "/api/" \
185
+ --output .tmp/docs/example-api/
186
+ ```
187
+
188
+ ### 2. Build RAG Knowledge Base
189
+
190
+ ```bash
191
+ # Crawl and export as JSON for embedding
192
+ python skills/webcrawler/scripts/crawl_docs.py \
193
+ --url "https://docs.example.com" \
194
+ --subject "Example Docs" \
195
+ --depth 3 \
196
+ --format json \
197
+ --output .tmp/rag/example/
198
+
199
+ # The content.json can be fed directly to embedding pipelines
200
+ ```
201
+
202
+ ### 3. Offline Documentation Mirror
203
+
204
+ ```bash
205
+ # Full documentation harvest
206
+ python skills/webcrawler/scripts/crawl_docs.py \
207
+ --url "https://docs.kubernetes.io/docs/concepts/" \
208
+ --subject "Kubernetes Concepts" \
209
+ --depth 5 \
210
+ --max-pages 500 \
211
+ --include-images \
212
+ --output .tmp/docs/k8s-concepts/
213
+ ```
214
+
215
+ ### 4. Focused Topic Extraction
216
+
217
+ ```bash
218
+ # Crawl, then filter to specific topic
219
+ python skills/webcrawler/scripts/crawl_docs.py \
220
+ --url "https://developer.hashicorp.com/terraform/docs" \
221
+ --subject "Terraform" \
222
+ --depth 3 \
223
+ --output .tmp/docs/terraform-full/
224
+
225
+ # Filter to AWS provider only
226
+ python skills/webcrawler/scripts/filter_docs.py \
227
+ --input .tmp/docs/terraform-full/ \
228
+ --subject "AWS Provider" \
229
+ --output .tmp/docs/terraform-aws/
230
+ ```
231
+
232
+ ---
233
+
234
+ ## Best Practices
235
+
236
+ ### Crawling
237
+
238
+ 1. **Start shallow** — Begin with `--depth 1` to test, then increase
239
+ 2. **Use filters** — Narrow scope with `--filter` patterns
240
+ 3. **Set page limits** — Use `--max-pages` to prevent runaway crawls
241
+ 4. **Respect rate limits** — Increase `--delay` for slower servers
242
+
243
+ ### Content Quality
244
+
245
+ 1. **Subject focus** — Be specific with `--subject` for better filtering
246
+ 2. **Review index** — Check `index.md` to verify crawl coverage
247
+ 3. **Post-filter** — Use `filter_docs.py` to refine results
248
+
249
+ ### Storage
250
+
251
+ 1. **Use `.tmp/`** — Store crawled docs in the temp directory
252
+ 2. **Organize by subject** — Create subdirectories per topic
253
+ 3. **Version with dates** — Add timestamps for recurring crawls
254
+
255
+ ---
256
+
257
+ ## Troubleshooting
258
+
259
+ | Issue | Cause | Solution |
260
+ | ----------------------- | --------------------------- | --------------------------------------- |
261
+ | **403 Forbidden** | Blocked by server | Increase delay, check robots.txt |
262
+ | **Empty pages** | JavaScript-rendered content | Render with Playwright (`--render-js` is a planned flag) |
263
+ | **Too many pages** | Unbounded crawl | Lower depth, use filters |
264
+ | **Duplicate content** | Same page via multiple URLs | Deduplicated by default via URL normalization |
265
+ | **Missing code blocks** | Extraction issue | Check `--include-code` is enabled |
266
+
267
+ ---
268
+
269
+ ## Dependencies
270
+
271
+ Required Python packages:
272
+
273
+ ```bash
274
+ pip install requests beautifulsoup4 html2text lxml
275
+ # Optional for JavaScript rendering:
276
+ pip install playwright && playwright install
277
+ ```
278
+
279
+ ---
280
+
281
+ ## Related Skills
282
+
283
+ - **[qdrant-memory](../qdrant-memory/SKILL.md)** — Store crawled docs in vector database for RAG
284
+ - **[pdf-reader](../pdf-reader/SKILL.md)** — Extract text from PDF documentation
285
+
286
+ ---
287
+
288
+ ## External Resources
289
+
290
+ - [Scrapy Documentation](https://docs.scrapy.org/) — For complex crawling needs
291
+ - [html2text](https://github.com/Alir3z4/html2text) — HTML to Markdown conversion
292
+ - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) — HTML parsing
@@ -0,0 +1,181 @@
1
+ # Advanced Crawling Reference
2
+
3
+ ## JavaScript-Rendered Pages
4
+
5
+ Some documentation sites render content with JavaScript. For these, use Playwright:
6
+
7
+ ```bash
8
+ # Install Playwright
9
+ pip install playwright
10
+ playwright install chromium
11
+
12
+ # The --render-js flag for crawl_docs.py is a planned feature; until it lands, use the manual extraction below
13
+ ```
14
+
15
+ ### Manual Extraction with Playwright
16
+
17
+ ```python
18
+ from playwright.sync_api import sync_playwright
19
+
20
+ def extract_js_rendered(url: str) -> str:
21
+ with sync_playwright() as p:
22
+ browser = p.chromium.launch()
23
+ page = browser.new_page()
24
+ page.goto(url, wait_until='networkidle')
25
+ content = page.content()
26
+ browser.close()
27
+ return content
28
+ ```
29
+
30
+ ---
31
+
32
+ ## Rate Limiting Strategies
33
+
34
+ ### Exponential Backoff
35
+
36
+ ```python
37
+ import time
38
+ import random
39
+
40
+ def fetch_with_backoff(url, max_retries=3):
41
+ for attempt in range(max_retries):
42
+ try:
43
+ response = requests.get(url)
44
+ if response.status_code == 429: # Too Many Requests
45
+ wait = (2 ** attempt) + random.uniform(0, 1)
46
+ time.sleep(wait)
47
+ continue
48
+ return response
49
+ except requests.exceptions.RequestException:
50
+ if attempt == max_retries - 1:
51
+ raise
52
+ time.sleep(2 ** attempt)
53
+ ```
54
+
55
+ ### Respecting Crawl-Delay
56
+
57
+ ```python
58
+ from urllib.robotparser import RobotFileParser
59
+
60
+ rp = RobotFileParser()
61
+ rp.set_url("https://example.com/robots.txt")
62
+ rp.read()
63
+
64
+ crawl_delay = rp.crawl_delay("*")
65
+ if crawl_delay:
66
+ time.sleep(crawl_delay)
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Content Extraction Patterns
72
+
73
+ ### Documentation Site Patterns
74
+
75
+ | Site Type | Content Selector | Notes |
76
+ | --------------- | ---------------------- | ------------------------- |
77
+ | **ReadTheDocs** | `.document`, `.body` | Standard Sphinx output |
78
+ | **GitBook** | `.page-inner` | Modern docs platform |
79
+ | **Docusaurus** | `.markdown`, `article` | React-based docs |
80
+ | **MkDocs** | `.md-content` | Python-based docs |
81
+ | **Notion** | `.notion-page-content` | Requires special handling |
82
+ | **Confluence** | `#main-content` | Enterprise wiki |
83
+
84
+ ### Handling Dynamic Navigation
85
+
86
+ Some sites use JavaScript for navigation. Strategy:
87
+
88
+ 1. Extract sitemap from `sitemap.xml` if available
89
+ 2. Parse navigation elements for all page links
90
+ 3. Follow `next`/`prev` pagination links
91
+
92
+ ```python
93
+ def get_sitemap_urls(base_url: str) -> list:
94
+ sitemap_url = f"{base_url}/sitemap.xml"
95
+ response = requests.get(sitemap_url)
96
+ soup = BeautifulSoup(response.content, 'lxml-xml')
97
+ return [loc.text for loc in soup.find_all('loc')]
98
+ ```
99
+
100
+ ---
101
+
102
+ ## Large Documentation Sets
103
+
104
+ For documentation with 500+ pages:
105
+
106
+ 1. **Use depth limits** — Start with `--depth 1` to get main sections
107
+ 2. **Section by section** — Crawl each major section separately
108
+ 3. **Resume capability** — Check `metadata.json` for already-crawled pages
109
+ 4. **Parallel crawling** — Use async requests (not implemented in base script)
110
+
111
+ ### Memory-Efficient Streaming
112
+
113
+ ```python
114
+ # For very large crawls, write pages immediately instead of buffering
115
+ def crawl_streaming(url, output_dir):
116
+ for page in discover_pages(url):
117
+ content = extract_page(page)
118
+ save_immediately(content, output_dir)
119
+ # Page content is not kept in memory
120
+ ```
121
+
122
+ ---
123
+
124
+ ## Integration with RAG Pipelines
125
+
126
+ ### Chunking Strategy
127
+
128
+ After crawling, chunk documents for embedding:
129
+
130
+ ```python
131
+ def chunk_document(content: str, chunk_size: int = 500) -> list:
132
+ """Split document into overlapping chunks."""
133
+ words = content.split()
134
+ chunks = []
135
+ overlap = chunk_size // 4
136
+
137
+ for i in range(0, len(words), chunk_size - overlap):
138
+ chunk = ' '.join(words[i:i + chunk_size])
139
+ if chunk:
140
+ chunks.append(chunk)
141
+
142
+ return chunks
143
+ ```
144
+
145
+ ### Metadata Preservation
146
+
147
+ Keep source URLs with chunks for citation:
148
+
149
+ ```python
150
+ {
151
+ "text": "chunk content...",
152
+ "metadata": {
153
+ "source_url": "https://docs.example.com/page",
154
+ "title": "Page Title",
155
+ "section": "Getting Started"
156
+ }
157
+ }
158
+ ```
159
+
160
+ ---
161
+
162
+ ## Troubleshooting
163
+
164
+ ### Common Issues
165
+
166
+ | Problem | Solution |
167
+ | ----------------------- | -------------------------------------------------- |
168
+ | **403 Forbidden** | Add realistic User-Agent, increase delay |
169
+ | **Cloudflare blocking** | Use Playwright with stealth plugin |
170
+ | **CAPTCHA** | Cannot bypass; manual intervention required |
171
+ | **Session-based auth** | Export cookies, use `--cookies` option |
172
+ | **Infinite scroll** | Use Playwright to scroll and wait for content |
173
+ | **Rate limiting (429)** | Implement exponential backoff, respect Retry-After |
174
+
175
+ ### Debugging
176
+
177
+ Enable verbose mode to trace crawl behavior:
178
+
179
+ ```bash
180
+ python crawl_docs.py --url "..." --subject "..." -v 2>&1 | tee crawl.log
181
+ ```