aidevops 2.52.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (329)
  1. package/.agent/AGENTS.md +614 -0
  2. package/.agent/accounts.md +65 -0
  3. package/.agent/aidevops/add-new-mcp-to-aidevops.md +456 -0
  4. package/.agent/aidevops/api-integrations.md +335 -0
  5. package/.agent/aidevops/architecture.md +510 -0
  6. package/.agent/aidevops/configs.md +274 -0
  7. package/.agent/aidevops/docs.md +244 -0
  8. package/.agent/aidevops/extension.md +311 -0
  9. package/.agent/aidevops/mcp-integrations.md +340 -0
  10. package/.agent/aidevops/mcp-troubleshooting.md +162 -0
  11. package/.agent/aidevops/memory-patterns.md +172 -0
  12. package/.agent/aidevops/providers.md +217 -0
  13. package/.agent/aidevops/recommendations.md +321 -0
  14. package/.agent/aidevops/requirements.md +301 -0
  15. package/.agent/aidevops/resources.md +214 -0
  16. package/.agent/aidevops/security-requirements.md +174 -0
  17. package/.agent/aidevops/security.md +350 -0
  18. package/.agent/aidevops/service-links.md +400 -0
  19. package/.agent/aidevops/services.md +357 -0
  20. package/.agent/aidevops/setup.md +153 -0
  21. package/.agent/aidevops/troubleshooting.md +389 -0
  22. package/.agent/aidevops.md +124 -0
  23. package/.agent/build-plus.md +244 -0
  24. package/.agent/content/guidelines.md +109 -0
  25. package/.agent/content.md +87 -0
  26. package/.agent/health.md +59 -0
  27. package/.agent/legal.md +59 -0
  28. package/.agent/loop-state/full-loop.local.md +16 -0
  29. package/.agent/loop-state/ralph-loop.local.md +10 -0
  30. package/.agent/marketing.md +440 -0
  31. package/.agent/memory/README.md +260 -0
  32. package/.agent/onboarding.md +796 -0
  33. package/.agent/plan-plus.md +245 -0
  34. package/.agent/research.md +100 -0
  35. package/.agent/sales.md +333 -0
  36. package/.agent/scripts/101domains-helper.sh +701 -0
  37. package/.agent/scripts/add-missing-returns.sh +140 -0
  38. package/.agent/scripts/agent-browser-helper.sh +311 -0
  39. package/.agent/scripts/agno-setup.sh +712 -0
  40. package/.agent/scripts/ahrefs-mcp-wrapper.js +168 -0
  41. package/.agent/scripts/aidevops-update-check.sh +71 -0
  42. package/.agent/scripts/ampcode-cli.sh +522 -0
  43. package/.agent/scripts/auto-version-bump.sh +156 -0
  44. package/.agent/scripts/autogen-helper.sh +512 -0
  45. package/.agent/scripts/beads-sync-helper.sh +596 -0
  46. package/.agent/scripts/closte-helper.sh +5 -0
  47. package/.agent/scripts/cloudron-helper.sh +321 -0
  48. package/.agent/scripts/codacy-cli-chunked.sh +581 -0
  49. package/.agent/scripts/codacy-cli.sh +442 -0
  50. package/.agent/scripts/code-audit-helper.sh +5 -0
  51. package/.agent/scripts/coderabbit-cli.sh +417 -0
  52. package/.agent/scripts/coderabbit-pro-analysis.sh +238 -0
  53. package/.agent/scripts/commands/code-simplifier.md +86 -0
  54. package/.agent/scripts/commands/full-loop.md +246 -0
  55. package/.agent/scripts/commands/postflight-loop.md +103 -0
  56. package/.agent/scripts/commands/recall.md +182 -0
  57. package/.agent/scripts/commands/remember.md +132 -0
  58. package/.agent/scripts/commands/save-todo.md +175 -0
  59. package/.agent/scripts/commands/session-review.md +154 -0
  60. package/.agent/scripts/comprehensive-quality-fix.sh +106 -0
  61. package/.agent/scripts/context-builder-helper.sh +522 -0
  62. package/.agent/scripts/coolify-cli-helper.sh +674 -0
  63. package/.agent/scripts/coolify-helper.sh +380 -0
  64. package/.agent/scripts/crawl4ai-examples.sh +401 -0
  65. package/.agent/scripts/crawl4ai-helper.sh +1078 -0
  66. package/.agent/scripts/crewai-helper.sh +681 -0
  67. package/.agent/scripts/dev-browser-helper.sh +513 -0
  68. package/.agent/scripts/dns-helper.sh +396 -0
  69. package/.agent/scripts/domain-research-helper.sh +917 -0
  70. package/.agent/scripts/dspy-helper.sh +285 -0
  71. package/.agent/scripts/dspyground-helper.sh +291 -0
  72. package/.agent/scripts/eeat-score-helper.sh +1242 -0
  73. package/.agent/scripts/efficient-return-fix.sh +92 -0
  74. package/.agent/scripts/extract-opencode-prompts.sh +128 -0
  75. package/.agent/scripts/find-missing-returns.sh +113 -0
  76. package/.agent/scripts/fix-auth-headers.sh +104 -0
  77. package/.agent/scripts/fix-common-strings.sh +254 -0
  78. package/.agent/scripts/fix-content-type.sh +100 -0
  79. package/.agent/scripts/fix-error-messages.sh +130 -0
  80. package/.agent/scripts/fix-misplaced-returns.sh +74 -0
  81. package/.agent/scripts/fix-remaining-literals.sh +152 -0
  82. package/.agent/scripts/fix-return-statements.sh +41 -0
  83. package/.agent/scripts/fix-s131-default-cases.sh +249 -0
  84. package/.agent/scripts/fix-sc2155-simple.sh +102 -0
  85. package/.agent/scripts/fix-shellcheck-critical.sh +187 -0
  86. package/.agent/scripts/fix-string-literals.sh +273 -0
  87. package/.agent/scripts/full-loop-helper.sh +773 -0
  88. package/.agent/scripts/generate-opencode-agents.sh +497 -0
  89. package/.agent/scripts/generate-opencode-commands.sh +1629 -0
  90. package/.agent/scripts/generate-skills.sh +366 -0
  91. package/.agent/scripts/git-platforms-helper.sh +640 -0
  92. package/.agent/scripts/gitea-cli-helper.sh +743 -0
  93. package/.agent/scripts/github-cli-helper.sh +702 -0
  94. package/.agent/scripts/gitlab-cli-helper.sh +682 -0
  95. package/.agent/scripts/gsc-add-user-helper.sh +325 -0
  96. package/.agent/scripts/gsc-sitemap-helper.sh +678 -0
  97. package/.agent/scripts/hetzner-helper.sh +485 -0
  98. package/.agent/scripts/hostinger-helper.sh +229 -0
  99. package/.agent/scripts/keyword-research-helper.sh +1815 -0
  100. package/.agent/scripts/langflow-helper.sh +544 -0
  101. package/.agent/scripts/linkedin-automation.py +241 -0
  102. package/.agent/scripts/linter-manager.sh +599 -0
  103. package/.agent/scripts/linters-local.sh +434 -0
  104. package/.agent/scripts/list-keys-helper.sh +488 -0
  105. package/.agent/scripts/local-browser-automation.py +339 -0
  106. package/.agent/scripts/localhost-helper.sh +744 -0
  107. package/.agent/scripts/loop-common.sh +806 -0
  108. package/.agent/scripts/mainwp-helper.sh +728 -0
  109. package/.agent/scripts/markdown-formatter.sh +338 -0
  110. package/.agent/scripts/markdown-lint-fix.sh +311 -0
  111. package/.agent/scripts/mass-fix-returns.sh +58 -0
  112. package/.agent/scripts/mcp-diagnose.sh +167 -0
  113. package/.agent/scripts/mcp-inspector-helper.sh +449 -0
  114. package/.agent/scripts/memory-helper.sh +650 -0
  115. package/.agent/scripts/monitor-code-review.sh +255 -0
  116. package/.agent/scripts/onboarding-helper.sh +706 -0
  117. package/.agent/scripts/opencode-github-setup-helper.sh +797 -0
  118. package/.agent/scripts/opencode-test-helper.sh +213 -0
  119. package/.agent/scripts/pagespeed-helper.sh +464 -0
  120. package/.agent/scripts/pandoc-helper.sh +362 -0
  121. package/.agent/scripts/postflight-check.sh +555 -0
  122. package/.agent/scripts/pre-commit-hook.sh +259 -0
  123. package/.agent/scripts/pre-edit-check.sh +169 -0
  124. package/.agent/scripts/qlty-cli.sh +356 -0
  125. package/.agent/scripts/quality-cli-manager.sh +525 -0
  126. package/.agent/scripts/quality-feedback-helper.sh +462 -0
  127. package/.agent/scripts/quality-fix.sh +263 -0
  128. package/.agent/scripts/quality-loop-helper.sh +1108 -0
  129. package/.agent/scripts/ralph-loop-helper.sh +836 -0
  130. package/.agent/scripts/ralph-upstream-check.sh +341 -0
  131. package/.agent/scripts/secretlint-helper.sh +847 -0
  132. package/.agent/scripts/servers-helper.sh +241 -0
  133. package/.agent/scripts/ses-helper.sh +619 -0
  134. package/.agent/scripts/session-review-helper.sh +404 -0
  135. package/.agent/scripts/setup-linters-wizard.sh +379 -0
  136. package/.agent/scripts/setup-local-api-keys.sh +330 -0
  137. package/.agent/scripts/setup-mcp-integrations.sh +472 -0
  138. package/.agent/scripts/shared-constants.sh +246 -0
  139. package/.agent/scripts/site-crawler-helper.sh +1487 -0
  140. package/.agent/scripts/snyk-helper.sh +940 -0
  141. package/.agent/scripts/sonarcloud-autofix.sh +193 -0
  142. package/.agent/scripts/sonarcloud-cli.sh +191 -0
  143. package/.agent/scripts/sonarscanner-cli.sh +455 -0
  144. package/.agent/scripts/spaceship-helper.sh +747 -0
  145. package/.agent/scripts/stagehand-helper.sh +321 -0
  146. package/.agent/scripts/stagehand-python-helper.sh +321 -0
  147. package/.agent/scripts/stagehand-python-setup.sh +441 -0
  148. package/.agent/scripts/stagehand-setup.sh +439 -0
  149. package/.agent/scripts/system-cleanup.sh +340 -0
  150. package/.agent/scripts/terminal-title-helper.sh +388 -0
  151. package/.agent/scripts/terminal-title-setup.sh +549 -0
  152. package/.agent/scripts/test-stagehand-both-integration.sh +317 -0
  153. package/.agent/scripts/test-stagehand-integration.sh +309 -0
  154. package/.agent/scripts/test-stagehand-python-integration.sh +341 -0
  155. package/.agent/scripts/todo-ready.sh +263 -0
  156. package/.agent/scripts/tool-version-check.sh +362 -0
  157. package/.agent/scripts/toon-helper.sh +469 -0
  158. package/.agent/scripts/twilio-helper.sh +917 -0
  159. package/.agent/scripts/updown-helper.sh +279 -0
  160. package/.agent/scripts/validate-mcp-integrations.sh +250 -0
  161. package/.agent/scripts/validate-version-consistency.sh +131 -0
  162. package/.agent/scripts/vaultwarden-helper.sh +597 -0
  163. package/.agent/scripts/vercel-cli-helper.sh +816 -0
  164. package/.agent/scripts/verify-mirrors.sh +169 -0
  165. package/.agent/scripts/version-manager.sh +831 -0
  166. package/.agent/scripts/webhosting-helper.sh +471 -0
  167. package/.agent/scripts/webhosting-verify.sh +238 -0
  168. package/.agent/scripts/wordpress-mcp-helper.sh +508 -0
  169. package/.agent/scripts/worktree-helper.sh +595 -0
  170. package/.agent/scripts/worktree-sessions.sh +577 -0
  171. package/.agent/seo/dataforseo.md +215 -0
  172. package/.agent/seo/domain-research.md +532 -0
  173. package/.agent/seo/eeat-score.md +659 -0
  174. package/.agent/seo/google-search-console.md +366 -0
  175. package/.agent/seo/gsc-sitemaps.md +282 -0
  176. package/.agent/seo/keyword-research.md +521 -0
  177. package/.agent/seo/serper.md +278 -0
  178. package/.agent/seo/site-crawler.md +387 -0
  179. package/.agent/seo.md +236 -0
  180. package/.agent/services/accounting/quickfile.md +159 -0
  181. package/.agent/services/communications/telfon.md +470 -0
  182. package/.agent/services/communications/twilio.md +569 -0
  183. package/.agent/services/crm/fluentcrm.md +449 -0
  184. package/.agent/services/email/ses.md +399 -0
  185. package/.agent/services/hosting/101domains.md +378 -0
  186. package/.agent/services/hosting/closte.md +177 -0
  187. package/.agent/services/hosting/cloudflare.md +251 -0
  188. package/.agent/services/hosting/cloudron.md +478 -0
  189. package/.agent/services/hosting/dns-providers.md +335 -0
  190. package/.agent/services/hosting/domain-purchasing.md +344 -0
  191. package/.agent/services/hosting/hetzner.md +327 -0
  192. package/.agent/services/hosting/hostinger.md +287 -0
  193. package/.agent/services/hosting/localhost.md +419 -0
  194. package/.agent/services/hosting/spaceship.md +353 -0
  195. package/.agent/services/hosting/webhosting.md +330 -0
  196. package/.agent/social-media.md +69 -0
  197. package/.agent/templates/plans-template.md +114 -0
  198. package/.agent/templates/prd-template.md +129 -0
  199. package/.agent/templates/tasks-template.md +108 -0
  200. package/.agent/templates/todo-template.md +89 -0
  201. package/.agent/tools/ai-assistants/agno.md +471 -0
  202. package/.agent/tools/ai-assistants/capsolver.md +326 -0
  203. package/.agent/tools/ai-assistants/configuration.md +221 -0
  204. package/.agent/tools/ai-assistants/overview.md +209 -0
  205. package/.agent/tools/ai-assistants/status.md +171 -0
  206. package/.agent/tools/ai-assistants/windsurf.md +193 -0
  207. package/.agent/tools/ai-orchestration/autogen.md +406 -0
  208. package/.agent/tools/ai-orchestration/crewai.md +445 -0
  209. package/.agent/tools/ai-orchestration/langflow.md +405 -0
  210. package/.agent/tools/ai-orchestration/openprose.md +487 -0
  211. package/.agent/tools/ai-orchestration/overview.md +362 -0
  212. package/.agent/tools/ai-orchestration/packaging.md +647 -0
  213. package/.agent/tools/browser/agent-browser.md +464 -0
  214. package/.agent/tools/browser/browser-automation.md +400 -0
  215. package/.agent/tools/browser/chrome-devtools.md +282 -0
  216. package/.agent/tools/browser/crawl4ai-integration.md +422 -0
  217. package/.agent/tools/browser/crawl4ai-resources.md +277 -0
  218. package/.agent/tools/browser/crawl4ai-usage.md +416 -0
  219. package/.agent/tools/browser/crawl4ai.md +585 -0
  220. package/.agent/tools/browser/dev-browser.md +341 -0
  221. package/.agent/tools/browser/pagespeed.md +260 -0
  222. package/.agent/tools/browser/playwright.md +266 -0
  223. package/.agent/tools/browser/playwriter.md +310 -0
  224. package/.agent/tools/browser/stagehand-examples.md +456 -0
  225. package/.agent/tools/browser/stagehand-python.md +483 -0
  226. package/.agent/tools/browser/stagehand.md +421 -0
  227. package/.agent/tools/build-agent/agent-review.md +224 -0
  228. package/.agent/tools/build-agent/build-agent.md +784 -0
  229. package/.agent/tools/build-mcp/aidevops-plugin.md +476 -0
  230. package/.agent/tools/build-mcp/api-wrapper.md +445 -0
  231. package/.agent/tools/build-mcp/build-mcp.md +240 -0
  232. package/.agent/tools/build-mcp/deployment.md +401 -0
  233. package/.agent/tools/build-mcp/server-patterns.md +632 -0
  234. package/.agent/tools/build-mcp/transports.md +366 -0
  235. package/.agent/tools/code-review/auditing.md +383 -0
  236. package/.agent/tools/code-review/automation.md +219 -0
  237. package/.agent/tools/code-review/best-practices.md +203 -0
  238. package/.agent/tools/code-review/codacy.md +151 -0
  239. package/.agent/tools/code-review/code-simplifier.md +174 -0
  240. package/.agent/tools/code-review/code-standards.md +309 -0
  241. package/.agent/tools/code-review/coderabbit.md +101 -0
  242. package/.agent/tools/code-review/management.md +155 -0
  243. package/.agent/tools/code-review/qlty.md +248 -0
  244. package/.agent/tools/code-review/secretlint.md +565 -0
  245. package/.agent/tools/code-review/setup.md +250 -0
  246. package/.agent/tools/code-review/snyk.md +563 -0
  247. package/.agent/tools/code-review/tools.md +230 -0
  248. package/.agent/tools/content/summarize.md +353 -0
  249. package/.agent/tools/context/augment-context-engine.md +468 -0
  250. package/.agent/tools/context/context-builder-agent.md +76 -0
  251. package/.agent/tools/context/context-builder.md +375 -0
  252. package/.agent/tools/context/context7.md +371 -0
  253. package/.agent/tools/context/dspy.md +302 -0
  254. package/.agent/tools/context/dspyground.md +374 -0
  255. package/.agent/tools/context/llm-tldr.md +219 -0
  256. package/.agent/tools/context/osgrep.md +488 -0
  257. package/.agent/tools/context/prompt-optimization.md +338 -0
  258. package/.agent/tools/context/toon.md +292 -0
  259. package/.agent/tools/conversion/pandoc.md +304 -0
  260. package/.agent/tools/credentials/api-key-management.md +154 -0
  261. package/.agent/tools/credentials/api-key-setup.md +224 -0
  262. package/.agent/tools/credentials/environment-variables.md +180 -0
  263. package/.agent/tools/credentials/vaultwarden.md +382 -0
  264. package/.agent/tools/data-extraction/outscraper.md +974 -0
  265. package/.agent/tools/deployment/coolify-cli.md +388 -0
  266. package/.agent/tools/deployment/coolify-setup.md +353 -0
  267. package/.agent/tools/deployment/coolify.md +345 -0
  268. package/.agent/tools/deployment/vercel.md +390 -0
  269. package/.agent/tools/git/authentication.md +132 -0
  270. package/.agent/tools/git/gitea-cli.md +193 -0
  271. package/.agent/tools/git/github-actions.md +207 -0
  272. package/.agent/tools/git/github-cli.md +223 -0
  273. package/.agent/tools/git/gitlab-cli.md +190 -0
  274. package/.agent/tools/git/opencode-github-security.md +350 -0
  275. package/.agent/tools/git/opencode-github.md +328 -0
  276. package/.agent/tools/git/opencode-gitlab.md +252 -0
  277. package/.agent/tools/git/security.md +196 -0
  278. package/.agent/tools/git.md +207 -0
  279. package/.agent/tools/opencode/oh-my-opencode.md +375 -0
  280. package/.agent/tools/opencode/opencode-anthropic-auth.md +446 -0
  281. package/.agent/tools/opencode/opencode.md +651 -0
  282. package/.agent/tools/social-media/bird.md +437 -0
  283. package/.agent/tools/task-management/beads.md +336 -0
  284. package/.agent/tools/terminal/terminal-title.md +251 -0
  285. package/.agent/tools/ui/shadcn.md +196 -0
  286. package/.agent/tools/ui/ui-skills.md +115 -0
  287. package/.agent/tools/wordpress/localwp.md +311 -0
  288. package/.agent/tools/wordpress/mainwp.md +391 -0
  289. package/.agent/tools/wordpress/scf.md +527 -0
  290. package/.agent/tools/wordpress/wp-admin.md +729 -0
  291. package/.agent/tools/wordpress/wp-dev.md +940 -0
  292. package/.agent/tools/wordpress/wp-preferred.md +398 -0
  293. package/.agent/tools/wordpress.md +95 -0
  294. package/.agent/workflows/branch/bugfix.md +63 -0
  295. package/.agent/workflows/branch/chore.md +95 -0
  296. package/.agent/workflows/branch/experiment.md +115 -0
  297. package/.agent/workflows/branch/feature.md +59 -0
  298. package/.agent/workflows/branch/hotfix.md +98 -0
  299. package/.agent/workflows/branch/refactor.md +92 -0
  300. package/.agent/workflows/branch/release.md +96 -0
  301. package/.agent/workflows/branch.md +347 -0
  302. package/.agent/workflows/bug-fixing.md +267 -0
  303. package/.agent/workflows/changelog.md +129 -0
  304. package/.agent/workflows/code-audit-remote.md +279 -0
  305. package/.agent/workflows/conversation-starter.md +69 -0
  306. package/.agent/workflows/error-feedback.md +578 -0
  307. package/.agent/workflows/feature-development.md +355 -0
  308. package/.agent/workflows/git-workflow.md +702 -0
  309. package/.agent/workflows/multi-repo-workspace.md +268 -0
  310. package/.agent/workflows/plans.md +709 -0
  311. package/.agent/workflows/postflight.md +604 -0
  312. package/.agent/workflows/pr.md +571 -0
  313. package/.agent/workflows/preflight.md +278 -0
  314. package/.agent/workflows/ralph-loop.md +773 -0
  315. package/.agent/workflows/release.md +498 -0
  316. package/.agent/workflows/session-manager.md +254 -0
  317. package/.agent/workflows/session-review.md +311 -0
  318. package/.agent/workflows/sql-migrations.md +631 -0
  319. package/.agent/workflows/version-bump.md +283 -0
  320. package/.agent/workflows/wiki-update.md +333 -0
  321. package/.agent/workflows/worktree.md +477 -0
  322. package/LICENSE +21 -0
  323. package/README.md +1446 -0
  324. package/VERSION +1 -0
  325. package/aidevops.sh +1746 -0
  326. package/bin/aidevops +21 -0
  327. package/package.json +75 -0
  328. package/scripts/npm-postinstall.js +60 -0
  329. package/setup.sh +2366 -0
package/.agent/scripts/site-crawler-helper.sh
@@ -0,0 +1,1487 @@
1
+ #!/bin/bash
2
+ # shellcheck disable=SC2034,SC2155,SC2317,SC2329,SC2016,SC2181,SC1091,SC2154,SC2015,SC2086,SC2129,SC2030,SC2031,SC2119,SC2120,SC2001,SC2162,SC2088,SC2089,SC2090,SC2029,SC2006,SC2153
3
+
4
+ # Site Crawler Helper Script
5
+ # SEO site auditing with Screaming Frog-like capabilities
6
+ # Uses Crawl4AI when available, falls back to lightweight Python crawler
7
+ #
8
+ # Usage: ./site-crawler-helper.sh [command] [url] [options]
9
+ # Commands:
10
+ # crawl - Full site crawl with SEO data extraction
11
+ # audit-links - Check for broken links (4XX/5XX)
12
+ # audit-meta - Audit page titles and meta descriptions
13
+ # audit-redirects - Analyze redirects and chains
14
+ # generate-sitemap - Generate XML sitemap from crawl
15
+ # compare - Compare two crawls
16
+ # status - Check crawler dependencies
17
+ # help - Show this help message
18
+ #
19
+ # Author: AI DevOps Framework
20
+ # Version: 2.0.0
21
+ # License: MIT
22
+
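+ # Example invocations (illustrative URL and option values):
+ #   ./site-crawler-helper.sh crawl https://example.com --depth 2 --max-urls 50
+ #   ./site-crawler-helper.sh audit-links https://example.com
+ #   ./site-crawler-helper.sh status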
23
+ set -euo pipefail
24
+
25
+ # Colors for output
26
+ readonly GREEN='\033[0;32m'
27
+ readonly BLUE='\033[0;34m'
28
+ readonly YELLOW='\033[1;33m'
29
+ readonly RED='\033[0;31m'
30
+ readonly PURPLE='\033[0;35m'
31
+ readonly NC='\033[0m'
32
+
33
+ # Constants
34
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit
35
+ readonly SCRIPT_DIR
36
+ readonly CONFIG_DIR="${HOME}/.config/aidevops"
37
+ readonly CONFIG_FILE="${CONFIG_DIR}/site-crawler.json"
38
+ readonly DEFAULT_OUTPUT_DIR="${HOME}/Downloads"
39
+ readonly CRAWL4AI_PORT="11235"
40
+ readonly CRAWL4AI_URL="http://localhost:${CRAWL4AI_PORT}"
41
+
42
+ # Default configuration
43
+ DEFAULT_DEPTH=3
44
+ DEFAULT_MAX_URLS=100
45
+ DEFAULT_DELAY=100
46
+ DEFAULT_FORMAT="xlsx"
47
+ RESPECT_ROBOTS=true
48
+ USE_CRAWL4AI=false
49
+
50
+ # Detect Python with required packages
51
+ PYTHON_CMD=""
52
+
53
+ # Print functions
54
+ print_success() {
55
+ echo -e "${GREEN}[OK] $1${NC}"
56
+ return 0
57
+ }
58
+
59
+ print_info() {
60
+ echo -e "${BLUE}[INFO] $1${NC}"
61
+ return 0
62
+ }
63
+
64
+ print_warning() {
65
+ echo -e "${YELLOW}[WARN] $1${NC}"
66
+ return 0
67
+ }
68
+
69
+ print_error() {
70
+ echo -e "${RED}[ERROR] $1${NC}"
71
+ return 0
72
+ }
73
+
74
+ print_header() {
75
+ echo -e "${PURPLE}=== $1 ===${NC}"
76
+ return 0
77
+ }
78
+
79
+ # Check if Crawl4AI is available
80
+ check_crawl4ai() {
81
+ if curl -s --connect-timeout 2 "${CRAWL4AI_URL}/health" &> /dev/null; then
82
+ USE_CRAWL4AI=true
83
+ return 0
84
+ fi
85
+ return 1
86
+ }
87
+
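+ # The same health probe can be run by hand to confirm the backend is reachable:
+ #   curl -s --connect-timeout 2 http://localhost:11235/health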
88
+ # Find working Python with dependencies
89
+ find_python() {
90
+ local pythons=("python3.11" "python3.12" "python3.10" "python3")
91
+ local user_site="${HOME}/Library/Python/3.11/lib/python/site-packages"
92
+
93
+ for py in "${pythons[@]}"; do
94
+ if command -v "$py" &> /dev/null; then
95
+ # Test if it has the required modules
96
+ if PYTHONPATH="${user_site}:${PYTHONPATH:-}" "$py" -c "import aiohttp, bs4" 2>/dev/null; then
97
+ PYTHON_CMD="$py"
98
+ export PYTHONPATH="${user_site}:${PYTHONPATH:-}"
99
+ return 0
100
+ fi
101
+ fi
102
+ done
103
+ return 1
104
+ }
105
+
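+ # Equivalent one-off check for a single interpreter:
+ #   python3 -c "import aiohttp, bs4" && echo "crawler dependencies available"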
106
+ # Install Python dependencies
107
+ install_python_deps() {
108
+ local pythons=("python3.11" "python3.12" "python3.10" "python3")
109
+
110
+ for py in "${pythons[@]}"; do
111
+ if command -v "$py" &> /dev/null; then
112
+ print_info "Installing dependencies with $py..."
113
+ "$py" -m pip install --user aiohttp beautifulsoup4 openpyxl 2>/dev/null && {
114
+ PYTHON_CMD="$py"
115
+ export PYTHONPATH="${HOME}/Library/Python/3.11/lib/python/site-packages:${PYTHONPATH:-}"
116
+ return 0
117
+ }
118
+ fi
119
+ done
120
+ return 1
121
+ }
122
+
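+ # Manual fallback if automatic installation fails:
+ #   pip3 install --user aiohttp beautifulsoup4 openpyxl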
123
+ # Extract domain from URL
124
+ get_domain() {
125
+ local url="$1"
126
+ echo "$url" | sed -E 's|^https?://||' | sed -E 's|/.*||' | sed -E 's|:.*||'
127
+ }
128
+
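+ # Example: get_domain "https://example.com:8080/blog/post?id=1" -> example.com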
129
+ # Create output directory structure
130
+ create_output_dir() {
131
+ local domain="$1"
132
+ local output_base="${2:-$DEFAULT_OUTPUT_DIR}"
133
+ local timestamp
134
+ timestamp=$(date +%Y-%m-%d_%H%M%S)
135
+
136
+ local output_dir="${output_base}/${domain}/${timestamp}"
137
+ mkdir -p "$output_dir"
138
+
139
+ # Update _latest symlink
140
+ local latest_link="${output_base}/${domain}/_latest"
141
+ rm -f "$latest_link"
142
+ ln -sf "$timestamp" "$latest_link"
143
+
144
+ echo "$output_dir"
145
+ return 0
146
+ }
147
+
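+ # Resulting layout (illustrative timestamp):
+ #   ~/Downloads/example.com/2025-01-01_120000/   <- this crawl's output
+ #   ~/Downloads/example.com/_latest              <- symlink to the newest run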
148
+ # Save markdown with rich metadata frontmatter and download images
149
+ save_markdown_with_metadata() {
150
+ local result="$1"
151
+ local full_page_dir="$2"
152
+ local body_only_dir="$3"
153
+ local images_dir="$4"
154
+ local base_domain="$5"
155
+
156
+ # Extract basic info
157
+ local page_url status_code redirected_url success
158
+ page_url=$(printf '%s' "$result" | jq -r '.url // empty')
159
+ status_code=$(printf '%s' "$result" | jq -r '.status_code // 0')
160
+ redirected_url=$(printf '%s' "$result" | jq -r '.redirected_url // empty')
161
+ success=$(printf '%s' "$result" | jq -r '.success // false')
162
+
163
+ # Fix status code: if success=true and we got content, report 200 (final status)
164
+ # Only show redirect status if there was an actual redirect to a different path
165
+ local original_status="$status_code"
166
+ if [[ "$success" == "true" && $status_code -ge 300 && $status_code -lt 400 ]]; then
167
+ # Check if redirect was just trailing slash normalization
168
+ local url_normalized redirect_normalized
169
+ url_normalized=$(echo "$page_url" | sed 's|/$||')
170
+ redirect_normalized=$(echo "$redirected_url" | sed 's|/$||')
171
+ if [[ "$url_normalized" == "$redirect_normalized" ]]; then
172
+ status_code=200 # Trailing slash redirect, content was fetched successfully
173
+ fi
174
+ fi
175
+
176
+ # Extract metadata
177
+ local title meta_desc meta_keywords canonical og_title og_desc og_image
178
+ title=$(printf '%s' "$result" | jq -r '.metadata.title // empty')
179
+ meta_desc=$(printf '%s' "$result" | jq -r '.metadata.description // empty')
180
+ meta_keywords=$(printf '%s' "$result" | jq -r '.metadata.keywords // empty')
181
+ canonical=$(printf '%s' "$result" | jq -r '.metadata."og:url" // empty')
182
+ og_title=$(printf '%s' "$result" | jq -r '.metadata."og:title" // empty')
183
+ og_desc=$(printf '%s' "$result" | jq -r '.metadata."og:description" // empty')
184
+ og_image=$(printf '%s' "$result" | jq -r '.metadata."og:image" // empty')
185
+
186
+ # Extract hreflang (from response headers or HTML - Crawl4AI may have this in metadata)
187
+ local hreflang_json
188
+ hreflang_json=$(printf '%s' "$result" | jq -c '[.metadata | to_entries[] | select(.key | startswith("hreflang")) | {lang: .key, url: .value}]' 2>/dev/null || echo "[]")
189
+
190
+ # Extract JSON-LD schema from HTML
191
+ local schema_json=""
192
+ local html_content
193
+ html_content=$(printf '%s' "$result" | jq -r '.html // empty' 2>/dev/null)
194
+ if [[ -n "$html_content" ]]; then
195
+ # Extract all JSON-LD script blocks
196
+ schema_json=$(echo "$html_content" | grep -o '<script type="application/ld+json"[^>]*>[^<]*</script>' | \
197
+ sed 's/<script type="application\/ld+json"[^>]*>//g' | \
198
+ sed 's/<\/script>//g' | \
199
+ while read -r schema_block; do
200
+ # Pretty print each schema block
201
+ echo "$schema_block" | jq '.' 2>/dev/null
202
+ done)
203
+ fi
204
+
205
+ # Get markdown content
206
+ local markdown_content
207
+ markdown_content=$(printf '%s' "$result" | jq -r '.markdown.raw_markdown // .markdown // empty' 2>/dev/null)
208
+
209
+ [[ -z "$markdown_content" || "$markdown_content" == "null" || "$markdown_content" == "{" ]] && return 0
210
+
211
+ # Generate slug for filename
212
+ local slug
213
+ slug=$(echo "$page_url" | sed -E 's|^https?://[^/]+||' | sed 's|^/||' | sed 's|/$||' | tr '/' '-' | tr '?' '-' | tr '&' '-')
214
+ [[ -z "$slug" ]] && slug="index"
215
+ slug="${slug:0:100}"
216
+
217
+ # Extract and download body images
218
+ local images_json page_images_dir
219
+ images_json=$(printf '%s' "$result" | jq -c '.media.images // []' 2>/dev/null)
220
+ page_images_dir="${images_dir}/${slug}"
221
+
222
+ local downloaded_images
223
+ downloaded_images=()
224
+ local image_count
225
+ image_count=$(echo "$images_json" | jq 'length' 2>/dev/null || echo "0")
226
+
227
+ if [[ $image_count -gt 0 ]]; then
228
+ mkdir -p "$page_images_dir"
229
+
230
+ # Download unique images (skip srcset variants by filtering unique base names)
231
+ local seen_images
232
+ seen_images=()
233
+ for ((j=0; j<image_count && j<20; j++)); do # Limit to 20 images per page
234
+ local img_src img_alt img_filename
235
+ img_src=$(echo "$images_json" | jq -r ".[$j].src // empty")
236
+ img_alt=$(echo "$images_json" | jq -r ".[$j].alt // empty")
237
+
238
+ [[ -z "$img_src" ]] && continue
239
+
240
+ # Skip data URIs (icons and tracking pixels are filtered by the size check after download)
241
+ [[ "$img_src" =~ ^data: ]] && continue
242
+
243
+ # Extract filename from URL
244
+ img_filename=$(basename "$img_src" | sed 's|?.*||' | sed 's|#.*||')
245
+
246
+ # Skip if we've already seen this base image (avoid srcset duplicates)
247
+ local base_img
248
+ base_img=$(echo "$img_filename" | sed -E 's/-[0-9]+x[0-9]+\./\./')
249
+
250
+ local already_seen=false
251
+ if [[ ${#seen_images[@]} -gt 0 ]]; then
252
+ for seen in "${seen_images[@]}"; do
253
+ if [[ "$seen" == "$base_img" ]]; then
254
+ already_seen=true
255
+ break
256
+ fi
257
+ done
258
+ fi
259
+ if [[ "$already_seen" == "true" ]]; then
260
+ continue
261
+ fi
262
+ seen_images+=("$base_img")
263
+
264
+ # Download image (quietly, with timeout)
265
+ if curl -sS -L --max-time 10 -o "${page_images_dir}/${img_filename}" "$img_src" 2>/dev/null; then
266
+ # Only keep if file is > 1KB (skip tracking pixels)
267
+ local file_size
268
+ file_size=$(stat -f%z "${page_images_dir}/${img_filename}" 2>/dev/null || echo "0")
269
+ if [[ $file_size -gt 1024 ]]; then
270
+ downloaded_images+=("${img_filename}|${img_src}|${img_alt}")
271
+ else
272
+ rm -f "${page_images_dir}/${img_filename}"
273
+ fi
274
+ fi
275
+ done
276
+
277
+ # Remove empty directory if no images downloaded
278
+ rmdir "${page_images_dir}" 2>/dev/null || true
279
+ fi
280
+
281
+ # Build YAML frontmatter
282
+ local frontmatter="---
283
+ url: \"${page_url}\"
284
+ status_code: ${status_code}"
285
+
286
+ # Add redirect info only if it was a real redirect (not just trailing slash)
287
+ if [[ $original_status -ge 300 && $original_status -lt 400 && "$status_code" != "$original_status" ]]; then
288
+ frontmatter+="
289
+ redirect_status: ${original_status}
290
+ redirected_to: \"${redirected_url}\""
291
+ elif [[ -n "$redirected_url" && "$redirected_url" != "$page_url" && "$redirected_url" != "null" ]]; then
292
+ frontmatter+="
293
+ redirected_to: \"${redirected_url}\""
294
+ fi
295
+
296
+ # Add SEO metadata (each field is wrapped in an if-guard so a false test does not trip set -e)
297
+ if [[ -n "$title" && "$title" != "null" ]]; then
298
+ frontmatter+="
299
+ title: \"$(echo "$title" | sed 's/"/\\"/g')\""
300
+ fi
301
+
302
+ if [[ -n "$meta_desc" && "$meta_desc" != "null" ]]; then
303
+ frontmatter+="
304
+ description: \"$(echo "$meta_desc" | sed 's/"/\\"/g')\""
305
+ fi
306
+
307
+ if [[ -n "$meta_keywords" && "$meta_keywords" != "null" ]]; then
308
+ frontmatter+="
309
+ keywords: \"$(echo "$meta_keywords" | sed 's/"/\\"/g')\""
310
+ fi
311
+
312
+ if [[ -n "$canonical" && "$canonical" != "null" ]]; then
313
+ frontmatter+="
314
+ canonical: \"${canonical}\""
315
+ fi
316
+
317
+ # Add Open Graph data
318
+ if [[ -n "$og_title" && "$og_title" != "null" && "$og_title" != "$title" ]]; then
319
+ frontmatter+="
320
+ og_title: \"$(echo "$og_title" | sed 's/"/\\"/g')\""
321
+ fi
322
+
323
+ if [[ -n "$og_image" && "$og_image" != "null" ]]; then
324
+ frontmatter+="
325
+ og_image: \"${og_image}\""
326
+ fi
327
+
328
+ # Add hreflang if present
329
+ if [[ "$hreflang_json" != "[]" && "$hreflang_json" != "null" ]]; then
330
+ local hreflang_yaml
331
+ hreflang_yaml=$(echo "$hreflang_json" | jq -r '.[] | " - lang: \"\(.lang)\"\n url: \"\(.url)\""' 2>/dev/null)
332
+ if [[ -n "$hreflang_yaml" ]]; then
333
+ frontmatter+="
334
+ hreflang:
335
+ ${hreflang_yaml}"
336
+ fi
337
+ fi
338
+
339
+ # Add downloaded images list
340
+ if [[ -n "${downloaded_images[*]:-}" ]]; then
341
+ frontmatter+="
342
+ images:"
343
+ for img_info in "${downloaded_images[@]}"; do
344
+ local img_file img_url img_alt_text
345
+ img_file=$(echo "$img_info" | cut -d'|' -f1)
346
+ img_url=$(echo "$img_info" | cut -d'|' -f2)
347
+ img_alt_text=$(echo "$img_info" | cut -d'|' -f3 | sed 's/"/\\"/g')
348
+ frontmatter+="
349
+ - file: \"${img_file}\"
350
+ original_url: \"${img_url}\""
351
+ if [[ -n "$img_alt_text" ]]; then
352
+ frontmatter+="
353
+ alt: \"${img_alt_text}\""
354
+ fi
355
+ done
356
+ fi
357
+
358
+ # Add crawl timestamp
359
+ frontmatter+="
360
+ crawled_at: \"$(date -Iseconds)\"
361
+ ---"
362
+
363
+ # Update markdown image references to point to local files
364
+ local updated_markdown="$markdown_content"
365
+ for img_info in "${downloaded_images[@]+"${downloaded_images[@]}"}"; do
366
+ [[ -z "$img_info" ]] && continue
367
+ local img_file img_url
368
+ img_file=$(echo "$img_info" | cut -d'|' -f1)
369
+ img_url=$(echo "$img_info" | cut -d'|' -f2)
370
+ # Replace remote URL with local path
371
+ updated_markdown=$(echo "$updated_markdown" | sed "s|${img_url}|../images/${slug}/${img_file}|g")
372
+ done
373
+
374
+ # Extract body-only content (remove nav, header, footer, cookie notices)
375
+ local body_markdown
376
+ body_markdown=$(extract_body_content "$updated_markdown")
377
+
378
+ # Write the FULL PAGE markdown file
379
+ {
380
+ echo "$frontmatter"
381
+ echo ""
382
+ echo "$updated_markdown"
383
+
384
+ # Append schema markup if found
385
+ if [[ -n "$schema_json" ]]; then
386
+ echo ""
387
+ echo "---"
388
+ echo ""
389
+ echo "## Structured Data (JSON-LD)"
390
+ echo ""
391
+ echo '```json'
392
+ echo "$schema_json"
393
+ echo '```'
394
+ fi
395
+ } > "${full_page_dir}/${slug}.md"
396
+
397
+ # Write the BODY ONLY markdown file (no schema - just content)
398
+ {
399
+ echo "$frontmatter"
400
+ echo ""
401
+ echo "$body_markdown"
402
+ } > "${body_only_dir}/${slug}.md"
403
+
404
+ return 0
405
+ }
406
+
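+ # Sketch of the frontmatter written above (placeholder values):
+ #   ---
+ #   url: "https://example.com/about/"
+ #   status_code: 200
+ #   title: "About Us"
+ #   description: "Short example meta description"
+ #   crawled_at: "2025-01-01T12:00:00+00:00"
+ #   ---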
407
+ # Extract body content from markdown (remove nav, header, footer, cookie notices)
408
+ # Site-agnostic approach - optimized for performance
409
+ extract_body_content() {
410
+ local markdown="$1"
411
+
412
+ # Use awk for efficient single-pass extraction
413
+ # This is much faster than bash loops with regex
414
+ echo "$markdown" | awk '
415
+ BEGIN {
416
+ in_body = 0
417
+ footer_started = 0
418
+ }
419
+
420
+ # Start at first H1 or H2 heading
421
+ /^#+ / && !in_body {
422
+ in_body = 1
423
+ }
424
+
425
+ # Skip until we find a heading
426
+ !in_body { next }
427
+
428
+ # Detect footer markers
429
+ /^##* *[Ff]ooter/ { footer_started = 1 }
430
+ /©|Copyright|\(c\) *20[0-9][0-9]/ { footer_started = 1 }
431
+ /All rights reserved|Alle Rechte vorbehalten|Tous droits/ { footer_started = 1 }
432
+ /^##* *(References|Références|Referenzen)$/ { footer_started = 1 }
433
+
434
+ # Cookie/GDPR patterns
435
+ /[Cc]ookie.*(consent|settings|preferences|policy)/ { footer_started = 1 }
436
+ /GDPR|CCPA|LGPD/ { footer_started = 1 }
437
+ /[Pp]rivacy [Oo]verview/ { footer_started = 1 }
438
+ /[Ss]trictly [Nn]ecessary [Cc]ookie/ { footer_started = 1 }
439
+
440
+ # Powered by patterns
441
+ /[Pp]owered by|[Bb]uilt with|[Mm]ade with/ { footer_started = 1 }
442
+
443
+ # Skip footer content
444
+ footer_started { next }
445
+
446
+ # Print body content
447
+ { print }
448
+ '
449
+ }
450
+
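+ # Behaviour sketch: text before the first heading (navigation) and anything after
+ # a footer/cookie marker (copyright, GDPR, "Powered by") is dropped, e.g.
+ #   "Menu | Login", "# Page Title", "Body text...", "(c) 2025 Example Ltd"
+ # reduces to the heading plus the body text.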
451
+ # Crawl using Crawl4AI API with multi-page discovery
452
+ crawl_with_crawl4ai() {
453
+ local url="$1"
454
+ local output_dir="$2"
455
+ local max_urls="$3"
456
+ local depth="$4"
457
+
458
+ print_info "Using Crawl4AI backend..."
459
+
460
+ # Create content directories
461
+ local full_page_dir="${output_dir}/content-full-page-md"
462
+ local body_only_dir="${output_dir}/content-body-md"
463
+ local images_dir="${output_dir}/images"
464
+ mkdir -p "$full_page_dir" "$body_only_dir" "$images_dir"
465
+
466
+ # Extract base domain for internal link filtering
467
+ local base_domain
468
+ base_domain=$(echo "$url" | sed -E 's|^https?://||' | sed -E 's|/.*||')
469
+
470
+ # Initialize tracking arrays via temp files
471
+ local visited_file="${output_dir}/.visited_urls"
472
+ local queue_file="${output_dir}/.queue_urls"
473
+ local results_file="${output_dir}/.results.jsonl"
474
+
475
+ echo "$url" > "$queue_file"
476
+ touch "$visited_file"
477
+ touch "$results_file"
478
+
479
+ local crawled_count=0
480
+ local current_depth=0
481
+
482
+ print_info "Starting multi-page crawl (max: $max_urls, depth: $depth)"
483
+
484
+ while [[ $crawled_count -lt $max_urls ]] && [[ -s "$queue_file" ]]; do
485
+ # Get next batch of URLs (up to 5 at a time for efficiency)
486
+ local batch_size=5
487
+ local remaining=$((max_urls - crawled_count))
488
+ [[ $remaining -lt $batch_size ]] && batch_size=$remaining
489
+
490
+ local batch_urls=()
491
+ local batch_count=0
492
+
493
+ while IFS= read -r queue_url && [[ $batch_count -lt $batch_size ]]; do
494
+ # Skip if already visited
495
+ if grep -qxF "$queue_url" "$visited_file" 2>/dev/null; then
496
+ continue
497
+ fi
498
+ batch_urls+=("$queue_url")
499
+ echo "$queue_url" >> "$visited_file"
500
+ ((batch_count++))
501
+ done < "$queue_file"
502
+
503
+ # Remove processed URLs from queue
504
+ if [[ ${#batch_urls[@]} -gt 0 ]]; then
505
+ local new_queue
506
+ new_queue=$(mktemp)
507
+ while IFS= read -r queue_url; do
508
+ if ! grep -qxF "$queue_url" "$visited_file" 2>/dev/null; then
509
+ echo "$queue_url"
510
+ fi
511
+ done < "$queue_file" > "$new_queue"
512
+ mv "$new_queue" "$queue_file"
513
+ fi
514
+
515
+ [[ ${#batch_urls[@]} -eq 0 ]] && break
516
+
517
+ # Build JSON array of URLs
518
+ local urls_json="["
519
+ local first=true
520
+ for batch_url in "${batch_urls[@]}"; do
521
+ [[ "$first" != "true" ]] && urls_json+=","
522
+ urls_json+="\"$batch_url\""
523
+ first=false
524
+ done
525
+ urls_json+="]"
526
+
527
+ print_info "[${crawled_count}/${max_urls}] Crawling batch of ${#batch_urls[@]} URLs..."
528
+
529
+ # Submit crawl job to Crawl4AI
530
+ local response
531
+ response=$(curl -s -X POST "${CRAWL4AI_URL}/crawl" \
532
+ --max-time 120 \
533
+ -H "Content-Type: application/json" \
534
+ -d "{
535
+ \"urls\": $urls_json,
536
+ \"crawler_config\": {
537
+ \"type\": \"CrawlerRunConfig\",
538
+ \"params\": {
539
+ \"cache_mode\": \"bypass\",
540
+ \"word_count_threshold\": 10,
541
+ \"page_timeout\": 30000
542
+ }
543
+ }
544
+ }" 2>/dev/null)
545
+
546
+ if [[ -z "$response" ]]; then
547
+ print_warning "No response from Crawl4AI for batch, skipping..."
548
+ continue
549
+ fi
550
+
551
+ # Process results
552
+ if command -v jq &> /dev/null; then
553
+ # Extract each result
554
+ local result_count
555
+ result_count=$(echo "$response" | jq -r '.results | length' 2>/dev/null || echo "0")
556
+
557
+ for ((i=0; i<result_count; i++)); do
558
+ local result
559
+ result=$(echo "$response" | jq -c ".results[$i]" 2>/dev/null)
560
+ [[ -z "$result" || "$result" == "null" ]] && continue
561
+
562
+ # Append to results file
563
+ echo "$result" >> "$results_file"
564
+ ((crawled_count++))
565
+
566
+ local page_url status_code
567
+ page_url=$(printf '%s' "$result" | jq -r '.url // empty')
568
+ status_code=$(printf '%s' "$result" | jq -r '.status_code // 0')
569
+
570
+ print_info " [${crawled_count}] ${status_code} ${page_url:0:60}"
571
+
572
+ # Save markdown content with rich metadata frontmatter (non-fatal if it fails)
573
+ save_markdown_with_metadata "$result" "$full_page_dir" "$body_only_dir" "$images_dir" "$base_domain" || true
574
+
575
+ # Extract internal links for queue (if under depth limit)
576
+ if [[ $current_depth -lt $depth ]]; then
577
+ local links
578
+ links=$(printf '%s' "$result" | jq -r '.links.internal[]?.href // empty' 2>/dev/null | head -50)
579
+
580
+ while IFS= read -r link; do
581
+ [[ -z "$link" ]] && continue
582
+ # Normalize URL
583
+ if [[ "$link" =~ ^/ ]]; then
584
+ link="https://${base_domain}${link}"
585
+ elif [[ ! "$link" =~ ^https?:// ]]; then
586
+ continue
587
+ fi
588
+ # Only add internal links
589
+ if [[ "$link" =~ $base_domain ]]; then
590
+ # Remove fragments and normalize
591
+ link=$(echo "$link" | sed 's|#.*||' | sed 's|/$||')
592
+ # Add to queue if not visited
593
+ if ! grep -qxF "$link" "$visited_file" 2>/dev/null; then
594
+ echo "$link" >> "$queue_file"
595
+ fi
596
+ fi
597
+ done <<< "$links"
598
+ fi
599
+ done
600
+ fi
601
+
602
+ ((current_depth++))
603
+ done
604
+
605
+ print_info "Crawl complete. Processing results..."
606
+
607
+ # Generate CSV and XLSX from results
608
+ crawl4ai_generate_reports "$output_dir" "$results_file" "$base_domain"
609
+
610
+ # Cleanup temp files
611
+ rm -f "$visited_file" "$queue_file"
612
+
613
+ # Count markdown files and images
614
+ local full_page_count body_count img_count
615
+ full_page_count=$(find "$full_page_dir" -name "*.md" 2>/dev/null | wc -l | tr -d ' ')
616
+ body_count=$(find "$body_only_dir" -name "*.md" 2>/dev/null | wc -l | tr -d ' ')
617
+ img_count=$(find "$images_dir" -type f \( -name "*.jpg" -o -name "*.jpeg" -o -name "*.png" -o -name "*.gif" -o -name "*.webp" -o -name "*.svg" \) 2>/dev/null | wc -l | tr -d ' ')
618
+
619
+ print_success "Crawl4AI results saved to ${output_dir}"
620
+ print_info " Pages crawled: $crawled_count"
621
+ print_info " Full page markdown: $full_page_count (in content-full-page-md/)"
622
+ print_info " Body-only markdown: $body_count (in content-body-md/)"
623
+ print_info " Images downloaded: $img_count (in images/)"
624
+ return 0
625
+ }
626
+
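+ # Single-URL equivalent of the request the batch loop above posts (assumes the
+ # local Crawl4AI server from CRAWL4AI_URL):
+ #   curl -s -X POST http://localhost:11235/crawl \
+ #     -H "Content-Type: application/json" \
+ #     -d '{"urls": ["https://example.com"],
+ #          "crawler_config": {"type": "CrawlerRunConfig",
+ #                             "params": {"cache_mode": "bypass", "page_timeout": 30000}}}'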
627
+ # Generate reports from Crawl4AI results
628
+ crawl4ai_generate_reports() {
629
+ local output_dir="$1"
630
+ local results_file="$2"
631
+ local base_domain="$3"
632
+
633
+ [[ ! -s "$results_file" ]] && return 0
634
+
635
+ # Generate CSV
636
+ local csv_file="${output_dir}/crawl-data.csv"
637
+ echo "url,status_code,status,title,title_length,meta_description,description_length,h1,h1_count,canonical,meta_robots,word_count,response_time_ms,crawl_depth,internal_links,external_links,images,images_missing_alt" > "$csv_file"
638
+
639
+ local broken_links=()
640
+ local redirects=()
641
+ local meta_issues=()
642
+ local status_codes=()
643
+
644
+ while IFS= read -r result; do
645
+ [[ -z "$result" ]] && continue
646
+
647
+ local url status_code title meta_desc h1 canonical word_count
648
+ url=$(printf '%s' "$result" | jq -r '.url // ""')
649
+ status_code=$(printf '%s' "$result" | jq -r '.status_code // 0')
650
+ title=$(printf '%s' "$result" | jq -r '.metadata.title // .title // ""' | tr ',' ';' | head -c 200)
651
+ meta_desc=$(printf '%s' "$result" | jq -r '.metadata.description // ""' | tr ',' ';' | head -c 300)
652
+ h1=$(printf '%s' "$result" | jq -r '.metadata.h1 // ""' | tr ',' ';' | head -c 200)
653
+ canonical=$(printf '%s' "$result" | jq -r '.metadata.canonical // ""')
654
+ word_count=$(printf '%s' "$result" | jq -r '.word_count // 0')
655
+
656
+ local title_len=${#title}
657
+ local desc_len=${#meta_desc}
658
+ local status="OK"
659
+ [[ $status_code -ge 300 && $status_code -lt 400 ]] && status="Redirect"
660
+ [[ $status_code -ge 400 ]] && status="Error"
661
+
662
+ # Count links
663
+ local internal_links external_links
664
+ internal_links=$(printf '%s' "$result" | jq -r '.links.internal | length // 0' 2>/dev/null || echo "0")
665
+ external_links=$(printf '%s' "$result" | jq -r '.links.external | length // 0' 2>/dev/null || echo "0")
666
+
667
+ # Write CSV row
668
+ echo "\"$url\",$status_code,\"$status\",\"$title\",$title_len,\"$meta_desc\",$desc_len,\"$h1\",1,\"$canonical\",\"\",$word_count,0,0,$internal_links,$external_links,0,0" >> "$csv_file"
669
+
670
+ # Track status codes
671
+ status_codes+=("$status_code")
672
+
673
+ # Track broken links
674
+ if [[ $status_code -ge 400 ]]; then
675
+ broken_links+=("{\"url\":\"$url\",\"status_code\":$status_code,\"source\":\"direct\"}")
676
+ fi
677
+
678
+ # Track meta issues
679
+ local issues=""
680
+ [[ -z "$title" ]] && issues+="Missing title; "
681
+ [[ $title_len -gt 60 ]] && issues+="Title too long; "
682
+ [[ -z "$meta_desc" ]] && issues+="Missing description; "
683
+ [[ $desc_len -gt 160 ]] && issues+="Description too long; "
684
+ [[ -z "$h1" ]] && issues+="Missing H1; "
685
+
686
+ if [[ -n "$issues" ]]; then
687
+ meta_issues+=("{\"url\":\"$url\",\"title\":\"${title:0:50}\",\"h1\":\"${h1:0:50}\",\"issues\":\"${issues%%; }\"}")
688
+ fi
689
+ done < "$results_file"
690
+
691
+ print_info "Generated: $csv_file"
692
+
693
+ # Generate broken-links.csv
694
+ if [[ ${#broken_links[@]} -gt 0 ]]; then
695
+ local broken_file="${output_dir}/broken-links.csv"
696
+ echo "url,status_code,source" > "$broken_file"
697
+ for bl in "${broken_links[@]}"; do
698
+ local bl_url bl_code bl_src
699
+ bl_url=$(echo "$bl" | jq -r '.url')
700
+ bl_code=$(echo "$bl" | jq -r '.status_code')
701
+ bl_src=$(echo "$bl" | jq -r '.source')
702
+ echo "\"$bl_url\",$bl_code,\"$bl_src\"" >> "$broken_file"
703
+ done
704
+ print_info "Generated: $broken_file"
705
+ fi
706
+
707
+ # Generate meta-issues.csv
708
+ if [[ ${#meta_issues[@]} -gt 0 ]]; then
709
+ local issues_file="${output_dir}/meta-issues.csv"
710
+ echo "url,title,h1,issues" > "$issues_file"
711
+ for mi in "${meta_issues[@]}"; do
712
+ local mi_url mi_title mi_h1 mi_issues
713
+ mi_url=$(echo "$mi" | jq -r '.url')
714
+ mi_title=$(echo "$mi" | jq -r '.title')
715
+ mi_h1=$(echo "$mi" | jq -r '.h1')
716
+ mi_issues=$(echo "$mi" | jq -r '.issues')
717
+ echo "\"$mi_url\",\"$mi_title\",\"$mi_h1\",\"$mi_issues\"" >> "$issues_file"
718
+ done
719
+ print_info "Generated: $issues_file"
720
+ fi
721
+
722
+ # Generate summary.json
723
+ local total_pages=${#status_codes[@]}
724
+ local summary_file="${output_dir}/summary.json"
725
+
726
+ # Count status codes
727
+ local code_200=0 code_301=0 code_302=0 code_404=0 code_500=0 code_other=0
728
+ for code in "${status_codes[@]}"; do
729
+ case "$code" in
730
+ 200) ((code_200++)) ;;
731
+ 301) ((code_301++)) ;;
732
+ 302) ((code_302++)) ;;
733
+ 404) ((code_404++)) ;;
734
+ 500) ((code_500++)) ;;
735
+ *) ((code_other++)) ;;
736
+ esac
737
+ done
738
+
739
+ cat > "$summary_file" << EOF
740
+ {
741
+ "crawl_date": "$(date -Iseconds)",
742
+ "base_url": "https://${base_domain}",
743
+ "backend": "crawl4ai",
744
+ "pages_crawled": $total_pages,
745
+ "broken_links": ${#broken_links[@]},
746
+ "redirects": 0,
747
+ "meta_issues": ${#meta_issues[@]},
748
+ "status_codes": {
749
+ "200": $code_200,
750
+ "301": $code_301,
751
+ "302": $code_302,
752
+ "404": $code_404,
753
+ "500": $code_500,
754
+ "other": $code_other
755
+ }
756
+ }
757
+ EOF
758
+ print_info "Generated: $summary_file"
759
+
760
+ # Generate XLSX if Python available
761
+ if find_python && "$PYTHON_CMD" -c "import openpyxl" 2>/dev/null; then
762
+ local xlsx_script
763
+ xlsx_script=$(mktemp /tmp/xlsx_gen_XXXXXX.py)
764
+ cat > "$xlsx_script" << 'PYXLSX'
765
+ import sys
766
+ import csv
767
+ import openpyxl
768
+ from openpyxl.styles import Font, PatternFill
769
+ from pathlib import Path
770
+
771
+ csv_file = Path(sys.argv[1])
772
+ xlsx_file = csv_file.with_suffix('.xlsx')
773
+
774
+ wb = openpyxl.Workbook()
775
+ ws = wb.active
776
+ ws.title = "Crawl Data"
777
+
778
+ with open(csv_file, 'r', encoding='utf-8') as f:
779
+ reader = csv.reader(f)
780
+ for row_num, row in enumerate(reader, 1):
781
+ for col_num, value in enumerate(row, 1):
782
+ cell = ws.cell(row=row_num, column=col_num, value=value)
783
+ if row_num == 1:
784
+ cell.font = Font(bold=True)
785
+ cell.fill = PatternFill(start_color="DAEEF3", end_color="DAEEF3", fill_type="solid")
786
+
787
+ wb.save(xlsx_file)
788
+ print(f"Generated: {xlsx_file}")
789
+ PYXLSX
790
+ "$PYTHON_CMD" "$xlsx_script" "$csv_file" 2>/dev/null || true
791
+ rm -f "$xlsx_script"
792
+ fi
793
+
794
+ return 0
795
+ }
796
+
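+ # Quick check of crawl health from the generated summary (illustrative path):
+ #   jq '.status_codes' ~/Downloads/example.com/_latest/summary.json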
797
+ # Lightweight Python crawler (fallback)
798
+ generate_fallback_crawler() {
799
+ cat << 'PYTHON_CRAWLER'
800
+ #!/usr/bin/env python3
801
+ """
802
+ Lightweight SEO Site Crawler
803
+ Fallback when Crawl4AI is not available
804
+ """
805
+
806
+ import asyncio
807
+ import aiohttp
808
+ import csv
809
+ import json
810
+ import hashlib
811
+ import sys
812
+ from datetime import datetime
813
+ from pathlib import Path
814
+ from urllib.parse import urljoin, urlparse
815
+ from collections import defaultdict
816
+ from dataclasses import dataclass, asdict
817
+ from bs4 import BeautifulSoup
818
+
819
+ try:
820
+ import openpyxl
821
+ from openpyxl.styles import Font, PatternFill, Alignment
822
+ HAS_XLSX = True
823
+ except ImportError:
824
+ HAS_XLSX = False
825
+
826
+
827
+ @dataclass
828
+ class PageData:
829
+ url: str
830
+ status_code: int = 0
831
+ status: str = ""
832
+ title: str = ""
833
+ title_length: int = 0
834
+ meta_description: str = ""
835
+ description_length: int = 0
836
+ h1: str = ""
837
+ h1_count: int = 0
838
+ canonical: str = ""
839
+ meta_robots: str = ""
840
+ word_count: int = 0
841
+ response_time_ms: float = 0.0
842
+ crawl_depth: int = 0
843
+ internal_links: int = 0
844
+ external_links: int = 0
845
+ images: int = 0
846
+ images_missing_alt: int = 0
847
+
848
+
849
+ class SiteCrawler:
850
+ def __init__(self, base_url: str, max_urls: int = 100, max_depth: int = 3, delay_ms: int = 100):
851
+ self.base_url = base_url.rstrip('/')
852
+ self.base_domain = urlparse(base_url).netloc
853
+ self.max_urls = max_urls
854
+ self.max_depth = max_depth
855
+ self.delay = delay_ms / 1000.0
856
+
857
+ self.visited = set()
858
+ self.queue = [(self.base_url, 0)]
859
+ self.pages = []
860
+ self.broken_links = []
861
+ self.redirects = []
862
+
863
+ def is_internal(self, url: str) -> bool:
864
+ parsed = urlparse(url)
865
+ return parsed.netloc == self.base_domain or parsed.netloc == ""
866
+
867
+ def normalize_url(self, url: str, base: str) -> str:
868
+ url = urljoin(base, url)
869
+ parsed = urlparse(url)
870
+ normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
871
+ if parsed.query:
872
+ normalized += f"?{parsed.query}"
873
+ return normalized.rstrip('/')
874
+
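+ # Example (relative href resolved against the current page, trailing slash stripped):
+ #   normalize_url("/about/", "https://example.com/pricing") -> "https://example.com/about"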
875
+ async def fetch_page(self, session: aiohttp.ClientSession, url: str, depth: int) -> PageData:
876
+ page = PageData(url=url, crawl_depth=depth)
877
+
878
+ try:
879
+ start = datetime.now()
880
+ async with session.get(url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=15)) as response:
881
+ page.status_code = response.status
882
+ page.response_time_ms = (datetime.now() - start).total_seconds() * 1000
883
+
884
+ # Track redirects
885
+ if response.history:
886
+ for r in response.history:
887
+ self.redirects.append({
888
+ 'original_url': str(r.url),
889
+ 'status_code': r.status,
890
+ 'redirect_url': str(response.url)
891
+ })
892
+
893
+ page.status = "OK" if response.status < 300 else ("Redirect" if response.status < 400 else "Error")
894
+
895
+ if response.status >= 400:
896
+ self.broken_links.append({'url': url, 'status_code': response.status, 'source': 'direct'})
897
+ return page
898
+
899
+ content_type = response.headers.get('Content-Type', '')
900
+ if 'text/html' not in content_type:
901
+ return page
902
+
903
+ html = await response.text()
904
+ soup = BeautifulSoup(html, 'html.parser')
905
+
906
+ # Title
907
+ if soup.title:
908
+ page.title = soup.title.get_text(strip=True)[:200]
909
+ page.title_length = len(page.title)
910
+
911
+ # Meta description
912
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
913
+ if meta_desc:
914
+ page.meta_description = meta_desc.get('content', '')[:300]
915
+ page.description_length = len(page.meta_description)
916
+
917
+ # Meta robots
918
+ meta_robots = soup.find('meta', attrs={'name': 'robots'})
919
+ if meta_robots:
920
+ page.meta_robots = meta_robots.get('content', '')
921
+
922
+ # Canonical
923
+ canonical = soup.find('link', attrs={'rel': 'canonical'})
924
+ if canonical:
925
+ page.canonical = canonical.get('href', '')
926
+
927
+ # H1
928
+ h1_tags = soup.find_all('h1')
929
+ page.h1_count = len(h1_tags)
930
+ if h1_tags:
931
+ page.h1 = h1_tags[0].get_text(strip=True)[:200]
932
+
933
+ # Word count
934
+ text = soup.get_text(separator=' ', strip=True)
935
+ page.word_count = len(text.split())
936
+
937
+ # Images
938
+ images = soup.find_all('img')
939
+ page.images = len(images)
940
+ page.images_missing_alt = sum(1 for img in images if not img.get('alt'))
941
+
942
+ # Links
943
+ internal_count = 0
944
+ external_count = 0
945
+
946
+ for link in soup.find_all('a', href=True):
947
+ href = link.get('href', '')
948
+ if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
949
+ continue
950
+
951
+ target_url = self.normalize_url(href, url)
952
+
953
+ if self.is_internal(target_url):
954
+ internal_count += 1
955
+ if target_url not in self.visited and depth < self.max_depth:
956
+ self.queue.append((target_url, depth + 1))
957
+ else:
958
+ external_count += 1
959
+
960
+ page.internal_links = internal_count
961
+ page.external_links = external_count
962
+
963
+ except asyncio.TimeoutError:
964
+ page.status = "Timeout"
965
+ except Exception as e:
966
+ page.status = f"Error: {str(e)[:50]}"
967
+
968
+ return page
969
+
970
+ async def crawl(self):
971
+ connector = aiohttp.TCPConnector(limit=5)
972
+ headers = {'User-Agent': 'AIDevOps-SiteCrawler/2.0'}
973
+
974
+ async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
975
+ while self.queue and len(self.visited) < self.max_urls:
976
+ url, depth = self.queue.pop(0)
977
+
978
+ if url in self.visited:
979
+ continue
980
+
981
+ self.visited.add(url)
982
+ page = await self.fetch_page(session, url, depth)
983
+ self.pages.append(page)
984
+
985
+ print(f"[{len(self.pages)}/{self.max_urls}] {page.status_code or 'ERR'} {url[:70]}")
986
+
987
+ await asyncio.sleep(self.delay)
988
+
989
+ return self.pages
990
+
991
+ def export(self, output_dir: Path, domain: str, fmt: str = "xlsx"):
992
+ output_dir = Path(output_dir)
993
+
994
+ # CSV export
995
+ csv_file = output_dir / "crawl-data.csv"
996
+ fieldnames = list(PageData.__dataclass_fields__.keys())
997
+
998
+ with open(csv_file, 'w', newline='', encoding='utf-8') as f:
999
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
1000
+ writer.writeheader()
1001
+ for page in self.pages:
1002
+ writer.writerow(asdict(page))
1003
+ print(f"Exported: {csv_file}")
1004
+
1005
+ # XLSX export
1006
+ if fmt in ("xlsx", "all") and HAS_XLSX:
1007
+ xlsx_file = output_dir / "crawl-data.xlsx"
1008
+ wb = openpyxl.Workbook()
1009
+ ws = wb.active
1010
+ ws.title = "Crawl Data"
1011
+
1012
+ # Headers
1013
+ for col, field in enumerate(fieldnames, 1):
1014
+ cell = ws.cell(row=1, column=col, value=field.replace('_', ' ').title())
1015
+ cell.font = Font(bold=True)
1016
+
1017
+ # Data
1018
+ for row, page in enumerate(self.pages, 2):
1019
+ for col, field in enumerate(fieldnames, 1):
1020
+ ws.cell(row=row, column=col, value=getattr(page, field))
1021
+
1022
+ wb.save(xlsx_file)
1023
+ print(f"Exported: {xlsx_file}")
1024
+
1025
+ # Broken links
1026
+ if self.broken_links:
1027
+ broken_file = output_dir / "broken-links.csv"
1028
+ with open(broken_file, 'w', newline='', encoding='utf-8') as f:
1029
+ writer = csv.DictWriter(f, fieldnames=['url', 'status_code', 'source'])
1030
+ writer.writeheader()
1031
+ writer.writerows(self.broken_links)
1032
+ print(f"Exported: {broken_file}")
1033
+
1034
+ # Redirects
1035
+ if self.redirects:
1036
+ redirects_file = output_dir / "redirects.csv"
1037
+ with open(redirects_file, 'w', newline='', encoding='utf-8') as f:
1038
+ writer = csv.DictWriter(f, fieldnames=['original_url', 'status_code', 'redirect_url'])
1039
+ writer.writeheader()
1040
+ writer.writerows(self.redirects)
1041
+ print(f"Exported: {redirects_file}")
1042
+
1043
+ # Meta issues
1044
+ meta_issues = []
1045
+ for page in self.pages:
1046
+ issues = []
1047
+ if not page.title:
1048
+ issues.append("Missing title")
1049
+ elif page.title_length > 60:
1050
+ issues.append("Title too long")
1051
+ if not page.meta_description:
1052
+ issues.append("Missing description")
1053
+ elif page.description_length > 160:
1054
+ issues.append("Description too long")
1055
+ if page.h1_count == 0:
1056
+ issues.append("Missing H1")
1057
+ elif page.h1_count > 1:
1058
+ issues.append("Multiple H1s")
1059
+
1060
+ if issues:
1061
+ meta_issues.append({
1062
+ 'url': page.url,
1063
+ 'title': page.title[:50],
1064
+ 'h1': page.h1[:50],
1065
+ 'issues': '; '.join(issues)
1066
+ })
1067
+
1068
+ if meta_issues:
1069
+ issues_file = output_dir / "meta-issues.csv"
1070
+ with open(issues_file, 'w', newline='', encoding='utf-8') as f:
1071
+ writer = csv.DictWriter(f, fieldnames=['url', 'title', 'h1', 'issues'])
1072
+ writer.writeheader()
1073
+ writer.writerows(meta_issues)
1074
+ print(f"Exported: {issues_file}")
1075
+
1076
+ # Summary
1077
+ summary = {
1078
+ 'crawl_date': datetime.now().isoformat(),
1079
+ 'base_url': self.base_url,
1080
+ 'pages_crawled': len(self.pages),
1081
+ 'broken_links': len(self.broken_links),
1082
+ 'redirects': len(self.redirects),
1083
+ 'meta_issues': len(meta_issues),
1084
+ 'status_codes': {}
1085
+ }
1086
+
1087
+ for page in self.pages:
1088
+ code = str(page.status_code)
1089
+ summary['status_codes'][code] = summary['status_codes'].get(code, 0) + 1
1090
+
1091
+ with open(output_dir / "summary.json", 'w') as f:
1092
+ json.dump(summary, f, indent=2)
1093
+ print(f"Exported: {output_dir / 'summary.json'}")
1094
+
1095
+ return summary
1096
+
1097
+
1098
+ async def main():
1099
+ if len(sys.argv) < 4:
1100
+ print("Usage: crawler.py <url> <output_dir> <max_urls> [depth] [format]")
1101
+ sys.exit(1)
1102
+
1103
+ url = sys.argv[1]
1104
+ output_dir = sys.argv[2]
1105
+ max_urls = int(sys.argv[3])
1106
+ depth = int(sys.argv[4]) if len(sys.argv) > 4 else 3
1107
+ fmt = sys.argv[5] if len(sys.argv) > 5 else "xlsx"
1108
+
1109
+ domain = urlparse(url).netloc
1110
+
1111
+ print(f"Starting crawl: {url}")
1112
+ print(f"Max URLs: {max_urls}, Max depth: {depth}")
1113
+ print()
1114
+
1115
+ crawler = SiteCrawler(url, max_urls=max_urls, max_depth=depth)
1116
+ await crawler.crawl()
1117
+
1118
+ summary = crawler.export(Path(output_dir), domain, fmt)
1119
+
1120
+ print()
1121
+ print("=== Crawl Summary ===")
1122
+ print(f"Pages crawled: {summary['pages_crawled']}")
1123
+ print(f"Broken links: {summary['broken_links']}")
1124
+ print(f"Redirects: {summary['redirects']}")
1125
+ print(f"Meta issues: {summary['meta_issues']}")
1126
+
1127
+
1128
+ if __name__ == "__main__":
1129
+ asyncio.run(main())
1130
+ PYTHON_CRAWLER
1131
+ }
1132
+
1133
+ # Run crawl
1134
+ do_crawl() {
1135
+ local url="$1"
1136
+ shift
1137
+
1138
+ # Parse options
1139
+ local depth="$DEFAULT_DEPTH"
1140
+ local max_urls="$DEFAULT_MAX_URLS"
1141
+ local format="$DEFAULT_FORMAT"
1142
+ local output_base="$DEFAULT_OUTPUT_DIR"
1143
+ local force_fallback=false
1144
+
1145
+ while [[ $# -gt 0 ]]; do
1146
+ case "$1" in
1147
+ --depth)
1148
+ depth="$2"
1149
+ shift 2
1150
+ ;;
1151
+ --max-urls)
1152
+ max_urls="$2"
1153
+ shift 2
1154
+ ;;
1155
+ --format)
1156
+ format="$2"
1157
+ shift 2
1158
+ ;;
1159
+ --output)
1160
+ output_base="$2"
1161
+ shift 2
1162
+ ;;
1163
+ --fallback)
1164
+ force_fallback=true
1165
+ shift
1166
+ ;;
1167
+ *)
1168
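+ # Unrecognised options and stray arguments are silently skipped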
+ shift
1169
+ ;;
1170
+ esac
1171
+ done
1172
+
1173
+ local domain
1174
+ domain=$(get_domain "$url")
1175
+
1176
+ local output_dir
1177
+ output_dir=$(create_output_dir "$domain" "$output_base")
1178
+
1179
+ print_header "Site Crawler - SEO Audit"
1180
+ print_info "URL: $url"
1181
+ print_info "Output: $output_dir"
1182
+ print_info "Depth: $depth, Max URLs: $max_urls"
1183
+
1184
+ # Try Crawl4AI first (unless forced fallback)
1185
+ if [[ "$force_fallback" != "true" ]] && check_crawl4ai; then
1186
+ print_success "Crawl4AI detected at ${CRAWL4AI_URL}"
1187
+ crawl_with_crawl4ai "$url" "$output_dir" "$max_urls" "$depth"
1188
+ print_success "Crawl complete!"
1189
+ print_info "Results: $output_dir"
1190
+ print_info "Latest: ${output_base}/${domain}/_latest"
1191
+ return 0
1192
+ fi
1193
+
1194
+ # Fallback to Python crawler
1195
+ print_info "Using lightweight Python crawler..."
1196
+
1197
+ # Find or install Python
1198
+ if ! find_python; then
1199
+ print_warning "Installing Python dependencies..."
1200
+ if ! install_python_deps; then
1201
+ print_error "Could not find or install Python with required packages"
1202
+ print_info "Install manually: pip3 install aiohttp beautifulsoup4 openpyxl"
1203
+ return 1
1204
+ fi
1205
+ fi
1206
+
1207
+ print_info "Using: $PYTHON_CMD"
1208
+
1209
+ # Generate and run crawler
1210
+ local crawler_script
1211
+ crawler_script=$(mktemp "${TMPDIR:-/tmp}/site_crawler_XXXXXX")  # no .py suffix: BSD/macOS mktemp needs the template to end in X's
1212
+ generate_fallback_crawler > "$crawler_script"
1213
+
1214
+ "$PYTHON_CMD" "$crawler_script" "$url" "$output_dir" "$max_urls" "$depth" "$format"
1215
+ local exit_code=$?
1216
+
1217
+ rm -f "$crawler_script"
1218
+
1219
+ if [[ $exit_code -eq 0 ]]; then
1220
+ print_success "Crawl complete!"
1221
+ print_info "Results: $output_dir"
1222
+ print_info "Latest: ${output_base}/${domain}/_latest"
1223
+ else
1224
+ print_error "Crawl failed with exit code $exit_code"
1225
+ fi
1226
+
1227
+ return $exit_code
1228
+ }
1229
+
1230
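+ # The audit-* commands below are thin wrappers: each runs the same capped crawl,
+ # and the specific reports (broken-links.csv, meta-issues.csv, redirects.csv)
+ # all come from the crawler's export step.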
+ # Audit broken links
1231
+ audit_links() {
1232
+ local url="$1"
1233
+ shift
1234
+ print_info "Running broken link audit..."
1235
+ do_crawl "$url" --max-urls 200 "$@"
1236
+ return 0
1237
+ }
1238
+
1239
+ # Audit meta data
1240
+ audit_meta() {
1241
+ local url="$1"
1242
+ shift
1243
+ print_info "Running meta data audit..."
1244
+ do_crawl "$url" --max-urls 200 "$@"
1245
+ return 0
1246
+ }
1247
+
1248
+ # Audit redirects
1249
+ audit_redirects() {
1250
+ local url="$1"
1251
+ shift
1252
+ print_info "Running redirect audit..."
1253
+ do_crawl "$url" --max-urls 200 "$@"
1254
+ return 0
1255
+ }
1256
+
1257
+ # Generate XML sitemap
1258
+ generate_sitemap() {
1259
+ local url="$1"
1260
+ local domain
1261
+ domain=$(get_domain "$url")
1262
+ local output_dir="${DEFAULT_OUTPUT_DIR}/${domain}/_latest"
1263
+
1264
+ if [[ ! -d "$output_dir" ]]; then
1265
+ print_error "No crawl data found. Run 'crawl' first."
1266
+ return 1
1267
+ fi
1268
+
1269
+ local crawl_data="${output_dir}/crawl-data.csv"
1270
+ if [[ ! -f "$crawl_data" ]]; then
1271
+ print_error "Crawl data not found: $crawl_data"
1272
+ return 1
1273
+ fi
1274
+
1275
+ print_header "Generating XML Sitemap"
1276
+
1277
+ local sitemap="${output_dir}/sitemap.xml"
1278
+
1279
+ {
1280
+ echo '<?xml version="1.0" encoding="UTF-8"?>'
1281
+ echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
1282
+
1283
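+ # Naive CSV parse: assumes url and status_code are the first two columns
+ # and that URLs contain no embedded commas.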
+ tail -n +2 "$crawl_data" | while IFS=, read -r page_url status_code rest; do
1284
+ if [[ "$status_code" == "200" ]]; then
1285
+ page_url="${page_url//\"/}"
1286
+ echo " <url>"
1287
+ echo " <loc>$page_url</loc>"
1288
+ echo " <changefreq>weekly</changefreq>"
1289
+ echo " <priority>0.5</priority>"
1290
+ echo " </url>"
1291
+ fi
1292
+ done
1293
+
1294
+ echo '</urlset>'
1295
+ } > "$sitemap"
1296
+
1297
+ print_success "Sitemap generated: $sitemap"
1298
+ return 0
1299
+ }
1300
+
1301
+ # Compare crawls
1302
+ compare_crawls() {
1303
+ local arg1="${1:-}"
1304
+ local arg2="${2:-}"
1305
+
1306
+ print_header "Comparing Crawls"
1307
+
1308
+ if [[ -z "$arg2" ]] && [[ -n "$arg1" ]]; then
1309
+ local domain
1310
+ domain=$(get_domain "$arg1")
1311
+ local domain_dir="${DEFAULT_OUTPUT_DIR}/${domain}"
1312
+
1313
+ if [[ ! -d "$domain_dir" ]]; then
1314
+ print_error "No crawl data found for domain"
1315
+ return 1
1316
+ fi
1317
+
1318
+ local crawls
1319
+ crawls=$(find "$domain_dir" -maxdepth 1 -type d -name "20*" | sort -r | head -2)
1320
+ local count
1321
+ count=$(echo "$crawls" | wc -l | tr -d ' ')
1322
+
1323
+ if [[ $count -lt 2 ]]; then
1324
+ print_error "Need at least 2 crawls to compare"
1325
+ return 1
1326
+ fi
1327
+
1328
+ arg1=$(echo "$crawls" | head -1)
1329
+ arg2=$(echo "$crawls" | tail -1)
1330
+ fi
1331
+
1332
+ print_info "Crawl 1: $arg1"
1333
+ print_info "Crawl 2: $arg2"
1334
+
1335
+ if [[ -f "${arg1}/crawl-data.csv" ]] && [[ -f "${arg2}/crawl-data.csv" ]]; then
1336
+ local urls1 urls2
1337
+ urls1=$(cut -d, -f1 "${arg1}/crawl-data.csv" | tail -n +2 | sort -u | wc -l | tr -d ' ')
1338
+ urls2=$(cut -d, -f1 "${arg2}/crawl-data.csv" | tail -n +2 | sort -u | wc -l | tr -d ' ')
1339
+
1340
+ print_info "Crawl 1 URLs: $urls1"
1341
+ print_info "Crawl 2 URLs: $urls2"
1342
+ fi
1343
+
1344
+ return 0
1345
+ }
1346
+
1347
+ # Check status
1348
+ check_status() {
1349
+ print_header "Site Crawler Status"
1350
+
1351
+ # Check Crawl4AI
1352
+ print_info "Checking Crawl4AI..."
1353
+ if check_crawl4ai; then
1354
+ print_success "Crawl4AI: Running at ${CRAWL4AI_URL}"
1355
+ else
1356
+ print_warning "Crawl4AI: Not running (will use fallback crawler)"
1357
+ fi
1358
+
1359
+ # Check Python
1360
+ print_info "Checking Python..."
1361
+ if find_python; then
1362
+ print_success "Python: $PYTHON_CMD with required packages"
1363
+ else
1364
+ print_warning "Python: Dependencies not installed"
1365
+ print_info " Install with: pip3 install aiohttp beautifulsoup4 openpyxl"
1366
+ fi
1367
+
1368
+ # Check dependencies
1369
+ if command -v jq &> /dev/null; then
1370
+ print_success "jq: installed"
1371
+ else
1372
+ print_warning "jq: not installed (optional, for JSON processing)"
1373
+ fi
1374
+
1375
+ if command -v curl &> /dev/null; then
1376
+ print_success "curl: installed"
1377
+ else
1378
+ print_error "curl: not installed (required)"
1379
+ fi
1380
+
1381
+ return 0
1382
+ }
1383
+
1384
+ # Show help
1385
+ show_help() {
1386
+ cat << 'EOF'
1387
+ Site Crawler Helper - SEO Spider Tool
1388
+
1389
+ Usage: site-crawler-helper.sh [command] [url] [options]
1390
+
1391
+ Commands:
1392
+ crawl <url> Full site crawl with SEO data extraction
1393
+ audit-links <url> Check for broken links (4XX/5XX errors)
1394
+ audit-meta <url> Audit page titles and meta descriptions
1395
+ audit-redirects <url> Analyze redirects and chains
1396
+ generate-sitemap <url> Generate XML sitemap from crawl
1397
+ compare [url|dir1] [dir2] Compare two crawls
1398
+ status Check crawler dependencies
1399
+ help Show this help message
1400
+
1401
+ Options:
1402
+ --depth <n> Max crawl depth (default: 3)
1403
+ --max-urls <n> Max URLs to crawl (default: 100)
1404
+ --format <fmt> Output format: csv, xlsx, all (default: xlsx)
1405
+ --output <dir> Output directory (default: ~/Downloads)
1406
+ --fallback Force use of fallback crawler (skip Crawl4AI)
1407
+
1408
+ Examples:
1409
+ # Full site crawl
1410
+ site-crawler-helper.sh crawl https://example.com
1411
+
1412
+ # Limited crawl
1413
+ site-crawler-helper.sh crawl https://example.com --depth 2 --max-urls 50
1414
+
1415
+ # Quick broken link check
1416
+ site-crawler-helper.sh audit-links https://example.com
1417
+
1418
+ # Generate sitemap from existing crawl
1419
+ site-crawler-helper.sh generate-sitemap https://example.com
1420
+
1421
+ # Check status
1422
+ site-crawler-helper.sh status
1423
+
1424
+ Output Structure:
1425
+ ~/Downloads/{domain}/{timestamp}/
1426
+ - crawl-data.xlsx Full crawl data
1427
+ - crawl-data.csv Full crawl data (CSV)
1428
+ - broken-links.csv 4XX/5XX errors
1429
+ - redirects.csv Redirect chains
1430
+ - meta-issues.csv Title/description issues
1431
+ - summary.json Crawl statistics
1432
+
1433
+ ~/Downloads/{domain}/_latest -> symlink to latest crawl
1434
+
1435
+ Backends:
1436
+ - Crawl4AI (preferred): Uses Docker-based Crawl4AI when available
1437
+ - Fallback: Lightweight async Python crawler
1438
+
1439
+ Related:
1440
+ - E-E-A-T scoring: eeat-score-helper.sh
1441
+ - Crawl4AI setup: crawl4ai-helper.sh
1442
+ - PageSpeed: pagespeed-helper.sh
1443
+ EOF
1444
+ return 0
1445
+ }
1446
+
1447
+ # Main function
1448
+ main() {
1449
+ local command="${1:-help}"
1450
+ shift || true
1451
+
1452
+ case "$command" in
1453
+ crawl)
1454
+ do_crawl "$@"
1455
+ ;;
1456
+ audit-links)
1457
+ audit_links "$@"
1458
+ ;;
1459
+ audit-meta)
1460
+ audit_meta "$@"
1461
+ ;;
1462
+ audit-redirects)
1463
+ audit_redirects "$@"
1464
+ ;;
1465
+ generate-sitemap)
1466
+ generate_sitemap "$@"
1467
+ ;;
1468
+ compare)
1469
+ compare_crawls "$@"
1470
+ ;;
1471
+ status)
1472
+ check_status
1473
+ ;;
1474
+ help|-h|--help|"")
1475
+ show_help
1476
+ ;;
1477
+ *)
1478
+ print_error "Unknown command: $command"
1479
+ show_help
1480
+ return 1
1481
+ ;;
1482
+ esac
1483
+
1484
+ return 0
1485
+ }
1486
+
1487
+ main "$@"