aidevops 2.52.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/AGENTS.md +614 -0
- package/.agent/accounts.md +65 -0
- package/.agent/aidevops/add-new-mcp-to-aidevops.md +456 -0
- package/.agent/aidevops/api-integrations.md +335 -0
- package/.agent/aidevops/architecture.md +510 -0
- package/.agent/aidevops/configs.md +274 -0
- package/.agent/aidevops/docs.md +244 -0
- package/.agent/aidevops/extension.md +311 -0
- package/.agent/aidevops/mcp-integrations.md +340 -0
- package/.agent/aidevops/mcp-troubleshooting.md +162 -0
- package/.agent/aidevops/memory-patterns.md +172 -0
- package/.agent/aidevops/providers.md +217 -0
- package/.agent/aidevops/recommendations.md +321 -0
- package/.agent/aidevops/requirements.md +301 -0
- package/.agent/aidevops/resources.md +214 -0
- package/.agent/aidevops/security-requirements.md +174 -0
- package/.agent/aidevops/security.md +350 -0
- package/.agent/aidevops/service-links.md +400 -0
- package/.agent/aidevops/services.md +357 -0
- package/.agent/aidevops/setup.md +153 -0
- package/.agent/aidevops/troubleshooting.md +389 -0
- package/.agent/aidevops.md +124 -0
- package/.agent/build-plus.md +244 -0
- package/.agent/content/guidelines.md +109 -0
- package/.agent/content.md +87 -0
- package/.agent/health.md +59 -0
- package/.agent/legal.md +59 -0
- package/.agent/loop-state/full-loop.local.md +16 -0
- package/.agent/loop-state/ralph-loop.local.md +10 -0
- package/.agent/marketing.md +440 -0
- package/.agent/memory/README.md +260 -0
- package/.agent/onboarding.md +796 -0
- package/.agent/plan-plus.md +245 -0
- package/.agent/research.md +100 -0
- package/.agent/sales.md +333 -0
- package/.agent/scripts/101domains-helper.sh +701 -0
- package/.agent/scripts/add-missing-returns.sh +140 -0
- package/.agent/scripts/agent-browser-helper.sh +311 -0
- package/.agent/scripts/agno-setup.sh +712 -0
- package/.agent/scripts/ahrefs-mcp-wrapper.js +168 -0
- package/.agent/scripts/aidevops-update-check.sh +71 -0
- package/.agent/scripts/ampcode-cli.sh +522 -0
- package/.agent/scripts/auto-version-bump.sh +156 -0
- package/.agent/scripts/autogen-helper.sh +512 -0
- package/.agent/scripts/beads-sync-helper.sh +596 -0
- package/.agent/scripts/closte-helper.sh +5 -0
- package/.agent/scripts/cloudron-helper.sh +321 -0
- package/.agent/scripts/codacy-cli-chunked.sh +581 -0
- package/.agent/scripts/codacy-cli.sh +442 -0
- package/.agent/scripts/code-audit-helper.sh +5 -0
- package/.agent/scripts/coderabbit-cli.sh +417 -0
- package/.agent/scripts/coderabbit-pro-analysis.sh +238 -0
- package/.agent/scripts/commands/code-simplifier.md +86 -0
- package/.agent/scripts/commands/full-loop.md +246 -0
- package/.agent/scripts/commands/postflight-loop.md +103 -0
- package/.agent/scripts/commands/recall.md +182 -0
- package/.agent/scripts/commands/remember.md +132 -0
- package/.agent/scripts/commands/save-todo.md +175 -0
- package/.agent/scripts/commands/session-review.md +154 -0
- package/.agent/scripts/comprehensive-quality-fix.sh +106 -0
- package/.agent/scripts/context-builder-helper.sh +522 -0
- package/.agent/scripts/coolify-cli-helper.sh +674 -0
- package/.agent/scripts/coolify-helper.sh +380 -0
- package/.agent/scripts/crawl4ai-examples.sh +401 -0
- package/.agent/scripts/crawl4ai-helper.sh +1078 -0
- package/.agent/scripts/crewai-helper.sh +681 -0
- package/.agent/scripts/dev-browser-helper.sh +513 -0
- package/.agent/scripts/dns-helper.sh +396 -0
- package/.agent/scripts/domain-research-helper.sh +917 -0
- package/.agent/scripts/dspy-helper.sh +285 -0
- package/.agent/scripts/dspyground-helper.sh +291 -0
- package/.agent/scripts/eeat-score-helper.sh +1242 -0
- package/.agent/scripts/efficient-return-fix.sh +92 -0
- package/.agent/scripts/extract-opencode-prompts.sh +128 -0
- package/.agent/scripts/find-missing-returns.sh +113 -0
- package/.agent/scripts/fix-auth-headers.sh +104 -0
- package/.agent/scripts/fix-common-strings.sh +254 -0
- package/.agent/scripts/fix-content-type.sh +100 -0
- package/.agent/scripts/fix-error-messages.sh +130 -0
- package/.agent/scripts/fix-misplaced-returns.sh +74 -0
- package/.agent/scripts/fix-remaining-literals.sh +152 -0
- package/.agent/scripts/fix-return-statements.sh +41 -0
- package/.agent/scripts/fix-s131-default-cases.sh +249 -0
- package/.agent/scripts/fix-sc2155-simple.sh +102 -0
- package/.agent/scripts/fix-shellcheck-critical.sh +187 -0
- package/.agent/scripts/fix-string-literals.sh +273 -0
- package/.agent/scripts/full-loop-helper.sh +773 -0
- package/.agent/scripts/generate-opencode-agents.sh +497 -0
- package/.agent/scripts/generate-opencode-commands.sh +1629 -0
- package/.agent/scripts/generate-skills.sh +366 -0
- package/.agent/scripts/git-platforms-helper.sh +640 -0
- package/.agent/scripts/gitea-cli-helper.sh +743 -0
- package/.agent/scripts/github-cli-helper.sh +702 -0
- package/.agent/scripts/gitlab-cli-helper.sh +682 -0
- package/.agent/scripts/gsc-add-user-helper.sh +325 -0
- package/.agent/scripts/gsc-sitemap-helper.sh +678 -0
- package/.agent/scripts/hetzner-helper.sh +485 -0
- package/.agent/scripts/hostinger-helper.sh +229 -0
- package/.agent/scripts/keyword-research-helper.sh +1815 -0
- package/.agent/scripts/langflow-helper.sh +544 -0
- package/.agent/scripts/linkedin-automation.py +241 -0
- package/.agent/scripts/linter-manager.sh +599 -0
- package/.agent/scripts/linters-local.sh +434 -0
- package/.agent/scripts/list-keys-helper.sh +488 -0
- package/.agent/scripts/local-browser-automation.py +339 -0
- package/.agent/scripts/localhost-helper.sh +744 -0
- package/.agent/scripts/loop-common.sh +806 -0
- package/.agent/scripts/mainwp-helper.sh +728 -0
- package/.agent/scripts/markdown-formatter.sh +338 -0
- package/.agent/scripts/markdown-lint-fix.sh +311 -0
- package/.agent/scripts/mass-fix-returns.sh +58 -0
- package/.agent/scripts/mcp-diagnose.sh +167 -0
- package/.agent/scripts/mcp-inspector-helper.sh +449 -0
- package/.agent/scripts/memory-helper.sh +650 -0
- package/.agent/scripts/monitor-code-review.sh +255 -0
- package/.agent/scripts/onboarding-helper.sh +706 -0
- package/.agent/scripts/opencode-github-setup-helper.sh +797 -0
- package/.agent/scripts/opencode-test-helper.sh +213 -0
- package/.agent/scripts/pagespeed-helper.sh +464 -0
- package/.agent/scripts/pandoc-helper.sh +362 -0
- package/.agent/scripts/postflight-check.sh +555 -0
- package/.agent/scripts/pre-commit-hook.sh +259 -0
- package/.agent/scripts/pre-edit-check.sh +169 -0
- package/.agent/scripts/qlty-cli.sh +356 -0
- package/.agent/scripts/quality-cli-manager.sh +525 -0
- package/.agent/scripts/quality-feedback-helper.sh +462 -0
- package/.agent/scripts/quality-fix.sh +263 -0
- package/.agent/scripts/quality-loop-helper.sh +1108 -0
- package/.agent/scripts/ralph-loop-helper.sh +836 -0
- package/.agent/scripts/ralph-upstream-check.sh +341 -0
- package/.agent/scripts/secretlint-helper.sh +847 -0
- package/.agent/scripts/servers-helper.sh +241 -0
- package/.agent/scripts/ses-helper.sh +619 -0
- package/.agent/scripts/session-review-helper.sh +404 -0
- package/.agent/scripts/setup-linters-wizard.sh +379 -0
- package/.agent/scripts/setup-local-api-keys.sh +330 -0
- package/.agent/scripts/setup-mcp-integrations.sh +472 -0
- package/.agent/scripts/shared-constants.sh +246 -0
- package/.agent/scripts/site-crawler-helper.sh +1487 -0
- package/.agent/scripts/snyk-helper.sh +940 -0
- package/.agent/scripts/sonarcloud-autofix.sh +193 -0
- package/.agent/scripts/sonarcloud-cli.sh +191 -0
- package/.agent/scripts/sonarscanner-cli.sh +455 -0
- package/.agent/scripts/spaceship-helper.sh +747 -0
- package/.agent/scripts/stagehand-helper.sh +321 -0
- package/.agent/scripts/stagehand-python-helper.sh +321 -0
- package/.agent/scripts/stagehand-python-setup.sh +441 -0
- package/.agent/scripts/stagehand-setup.sh +439 -0
- package/.agent/scripts/system-cleanup.sh +340 -0
- package/.agent/scripts/terminal-title-helper.sh +388 -0
- package/.agent/scripts/terminal-title-setup.sh +549 -0
- package/.agent/scripts/test-stagehand-both-integration.sh +317 -0
- package/.agent/scripts/test-stagehand-integration.sh +309 -0
- package/.agent/scripts/test-stagehand-python-integration.sh +341 -0
- package/.agent/scripts/todo-ready.sh +263 -0
- package/.agent/scripts/tool-version-check.sh +362 -0
- package/.agent/scripts/toon-helper.sh +469 -0
- package/.agent/scripts/twilio-helper.sh +917 -0
- package/.agent/scripts/updown-helper.sh +279 -0
- package/.agent/scripts/validate-mcp-integrations.sh +250 -0
- package/.agent/scripts/validate-version-consistency.sh +131 -0
- package/.agent/scripts/vaultwarden-helper.sh +597 -0
- package/.agent/scripts/vercel-cli-helper.sh +816 -0
- package/.agent/scripts/verify-mirrors.sh +169 -0
- package/.agent/scripts/version-manager.sh +831 -0
- package/.agent/scripts/webhosting-helper.sh +471 -0
- package/.agent/scripts/webhosting-verify.sh +238 -0
- package/.agent/scripts/wordpress-mcp-helper.sh +508 -0
- package/.agent/scripts/worktree-helper.sh +595 -0
- package/.agent/scripts/worktree-sessions.sh +577 -0
- package/.agent/seo/dataforseo.md +215 -0
- package/.agent/seo/domain-research.md +532 -0
- package/.agent/seo/eeat-score.md +659 -0
- package/.agent/seo/google-search-console.md +366 -0
- package/.agent/seo/gsc-sitemaps.md +282 -0
- package/.agent/seo/keyword-research.md +521 -0
- package/.agent/seo/serper.md +278 -0
- package/.agent/seo/site-crawler.md +387 -0
- package/.agent/seo.md +236 -0
- package/.agent/services/accounting/quickfile.md +159 -0
- package/.agent/services/communications/telfon.md +470 -0
- package/.agent/services/communications/twilio.md +569 -0
- package/.agent/services/crm/fluentcrm.md +449 -0
- package/.agent/services/email/ses.md +399 -0
- package/.agent/services/hosting/101domains.md +378 -0
- package/.agent/services/hosting/closte.md +177 -0
- package/.agent/services/hosting/cloudflare.md +251 -0
- package/.agent/services/hosting/cloudron.md +478 -0
- package/.agent/services/hosting/dns-providers.md +335 -0
- package/.agent/services/hosting/domain-purchasing.md +344 -0
- package/.agent/services/hosting/hetzner.md +327 -0
- package/.agent/services/hosting/hostinger.md +287 -0
- package/.agent/services/hosting/localhost.md +419 -0
- package/.agent/services/hosting/spaceship.md +353 -0
- package/.agent/services/hosting/webhosting.md +330 -0
- package/.agent/social-media.md +69 -0
- package/.agent/templates/plans-template.md +114 -0
- package/.agent/templates/prd-template.md +129 -0
- package/.agent/templates/tasks-template.md +108 -0
- package/.agent/templates/todo-template.md +89 -0
- package/.agent/tools/ai-assistants/agno.md +471 -0
- package/.agent/tools/ai-assistants/capsolver.md +326 -0
- package/.agent/tools/ai-assistants/configuration.md +221 -0
- package/.agent/tools/ai-assistants/overview.md +209 -0
- package/.agent/tools/ai-assistants/status.md +171 -0
- package/.agent/tools/ai-assistants/windsurf.md +193 -0
- package/.agent/tools/ai-orchestration/autogen.md +406 -0
- package/.agent/tools/ai-orchestration/crewai.md +445 -0
- package/.agent/tools/ai-orchestration/langflow.md +405 -0
- package/.agent/tools/ai-orchestration/openprose.md +487 -0
- package/.agent/tools/ai-orchestration/overview.md +362 -0
- package/.agent/tools/ai-orchestration/packaging.md +647 -0
- package/.agent/tools/browser/agent-browser.md +464 -0
- package/.agent/tools/browser/browser-automation.md +400 -0
- package/.agent/tools/browser/chrome-devtools.md +282 -0
- package/.agent/tools/browser/crawl4ai-integration.md +422 -0
- package/.agent/tools/browser/crawl4ai-resources.md +277 -0
- package/.agent/tools/browser/crawl4ai-usage.md +416 -0
- package/.agent/tools/browser/crawl4ai.md +585 -0
- package/.agent/tools/browser/dev-browser.md +341 -0
- package/.agent/tools/browser/pagespeed.md +260 -0
- package/.agent/tools/browser/playwright.md +266 -0
- package/.agent/tools/browser/playwriter.md +310 -0
- package/.agent/tools/browser/stagehand-examples.md +456 -0
- package/.agent/tools/browser/stagehand-python.md +483 -0
- package/.agent/tools/browser/stagehand.md +421 -0
- package/.agent/tools/build-agent/agent-review.md +224 -0
- package/.agent/tools/build-agent/build-agent.md +784 -0
- package/.agent/tools/build-mcp/aidevops-plugin.md +476 -0
- package/.agent/tools/build-mcp/api-wrapper.md +445 -0
- package/.agent/tools/build-mcp/build-mcp.md +240 -0
- package/.agent/tools/build-mcp/deployment.md +401 -0
- package/.agent/tools/build-mcp/server-patterns.md +632 -0
- package/.agent/tools/build-mcp/transports.md +366 -0
- package/.agent/tools/code-review/auditing.md +383 -0
- package/.agent/tools/code-review/automation.md +219 -0
- package/.agent/tools/code-review/best-practices.md +203 -0
- package/.agent/tools/code-review/codacy.md +151 -0
- package/.agent/tools/code-review/code-simplifier.md +174 -0
- package/.agent/tools/code-review/code-standards.md +309 -0
- package/.agent/tools/code-review/coderabbit.md +101 -0
- package/.agent/tools/code-review/management.md +155 -0
- package/.agent/tools/code-review/qlty.md +248 -0
- package/.agent/tools/code-review/secretlint.md +565 -0
- package/.agent/tools/code-review/setup.md +250 -0
- package/.agent/tools/code-review/snyk.md +563 -0
- package/.agent/tools/code-review/tools.md +230 -0
- package/.agent/tools/content/summarize.md +353 -0
- package/.agent/tools/context/augment-context-engine.md +468 -0
- package/.agent/tools/context/context-builder-agent.md +76 -0
- package/.agent/tools/context/context-builder.md +375 -0
- package/.agent/tools/context/context7.md +371 -0
- package/.agent/tools/context/dspy.md +302 -0
- package/.agent/tools/context/dspyground.md +374 -0
- package/.agent/tools/context/llm-tldr.md +219 -0
- package/.agent/tools/context/osgrep.md +488 -0
- package/.agent/tools/context/prompt-optimization.md +338 -0
- package/.agent/tools/context/toon.md +292 -0
- package/.agent/tools/conversion/pandoc.md +304 -0
- package/.agent/tools/credentials/api-key-management.md +154 -0
- package/.agent/tools/credentials/api-key-setup.md +224 -0
- package/.agent/tools/credentials/environment-variables.md +180 -0
- package/.agent/tools/credentials/vaultwarden.md +382 -0
- package/.agent/tools/data-extraction/outscraper.md +974 -0
- package/.agent/tools/deployment/coolify-cli.md +388 -0
- package/.agent/tools/deployment/coolify-setup.md +353 -0
- package/.agent/tools/deployment/coolify.md +345 -0
- package/.agent/tools/deployment/vercel.md +390 -0
- package/.agent/tools/git/authentication.md +132 -0
- package/.agent/tools/git/gitea-cli.md +193 -0
- package/.agent/tools/git/github-actions.md +207 -0
- package/.agent/tools/git/github-cli.md +223 -0
- package/.agent/tools/git/gitlab-cli.md +190 -0
- package/.agent/tools/git/opencode-github-security.md +350 -0
- package/.agent/tools/git/opencode-github.md +328 -0
- package/.agent/tools/git/opencode-gitlab.md +252 -0
- package/.agent/tools/git/security.md +196 -0
- package/.agent/tools/git.md +207 -0
- package/.agent/tools/opencode/oh-my-opencode.md +375 -0
- package/.agent/tools/opencode/opencode-anthropic-auth.md +446 -0
- package/.agent/tools/opencode/opencode.md +651 -0
- package/.agent/tools/social-media/bird.md +437 -0
- package/.agent/tools/task-management/beads.md +336 -0
- package/.agent/tools/terminal/terminal-title.md +251 -0
- package/.agent/tools/ui/shadcn.md +196 -0
- package/.agent/tools/ui/ui-skills.md +115 -0
- package/.agent/tools/wordpress/localwp.md +311 -0
- package/.agent/tools/wordpress/mainwp.md +391 -0
- package/.agent/tools/wordpress/scf.md +527 -0
- package/.agent/tools/wordpress/wp-admin.md +729 -0
- package/.agent/tools/wordpress/wp-dev.md +940 -0
- package/.agent/tools/wordpress/wp-preferred.md +398 -0
- package/.agent/tools/wordpress.md +95 -0
- package/.agent/workflows/branch/bugfix.md +63 -0
- package/.agent/workflows/branch/chore.md +95 -0
- package/.agent/workflows/branch/experiment.md +115 -0
- package/.agent/workflows/branch/feature.md +59 -0
- package/.agent/workflows/branch/hotfix.md +98 -0
- package/.agent/workflows/branch/refactor.md +92 -0
- package/.agent/workflows/branch/release.md +96 -0
- package/.agent/workflows/branch.md +347 -0
- package/.agent/workflows/bug-fixing.md +267 -0
- package/.agent/workflows/changelog.md +129 -0
- package/.agent/workflows/code-audit-remote.md +279 -0
- package/.agent/workflows/conversation-starter.md +69 -0
- package/.agent/workflows/error-feedback.md +578 -0
- package/.agent/workflows/feature-development.md +355 -0
- package/.agent/workflows/git-workflow.md +702 -0
- package/.agent/workflows/multi-repo-workspace.md +268 -0
- package/.agent/workflows/plans.md +709 -0
- package/.agent/workflows/postflight.md +604 -0
- package/.agent/workflows/pr.md +571 -0
- package/.agent/workflows/preflight.md +278 -0
- package/.agent/workflows/ralph-loop.md +773 -0
- package/.agent/workflows/release.md +498 -0
- package/.agent/workflows/session-manager.md +254 -0
- package/.agent/workflows/session-review.md +311 -0
- package/.agent/workflows/sql-migrations.md +631 -0
- package/.agent/workflows/version-bump.md +283 -0
- package/.agent/workflows/wiki-update.md +333 -0
- package/.agent/workflows/worktree.md +477 -0
- package/LICENSE +21 -0
- package/README.md +1446 -0
- package/VERSION +1 -0
- package/aidevops.sh +1746 -0
- package/bin/aidevops +21 -0
- package/package.json +75 -0
- package/scripts/npm-postinstall.js +60 -0
- package/setup.sh +2366 -0
|
@@ -0,0 +1,1487 @@
|
|
|
1
|
+
#!/bin/bash
# shellcheck disable=SC2034,SC2155,SC2317,SC2329,SC2016,SC2181,SC1091,SC2154,SC2015,SC2086,SC2129,SC2030,SC2031,SC2119,SC2120,SC2001,SC2162,SC2088,SC2089,SC2090,SC2029,SC2006,SC2153

# Site Crawler Helper Script
# SEO site auditing with Screaming Frog-like capabilities
# Uses Crawl4AI when available, falls back to lightweight Python crawler
#
# Usage: ./site-crawler-helper.sh [command] [url] [options]
# Commands:
#   crawl            - Full site crawl with SEO data extraction
#   audit-links      - Check for broken links (4XX/5XX)
#   audit-meta       - Audit page titles and meta descriptions
#   audit-redirects  - Analyze redirects and chains
#   generate-sitemap - Generate XML sitemap from crawl
#   compare          - Compare two crawls
#   status           - Check crawler dependencies
#   help             - Show this help message
#
# Author: AI DevOps Framework
# Version: 2.0.0
# License: MIT

set -euo pipefail

# ANSI colors for terminal output (NC resets to the default color)
readonly GREEN='\033[0;32m'
readonly BLUE='\033[0;34m'
readonly YELLOW='\033[1;33m'
readonly RED='\033[0;31m'
readonly PURPLE='\033[0;35m'
readonly NC='\033[0m'

# Constants
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit
readonly SCRIPT_DIR
readonly CONFIG_DIR="${HOME}/.config/aidevops"
readonly CONFIG_FILE="${CONFIG_DIR}/site-crawler.json"
readonly DEFAULT_OUTPUT_DIR="${HOME}/Downloads"
# Port/URL of a locally running Crawl4AI service (preferred crawler backend)
readonly CRAWL4AI_PORT="11235"
readonly CRAWL4AI_URL="http://localhost:${CRAWL4AI_PORT}"

# Default crawl configuration (mutable; presumably overridden by CLI option
# parsing later in the script — not visible in this chunk)
DEFAULT_DEPTH=3
DEFAULT_MAX_URLS=100
DEFAULT_DELAY=100
DEFAULT_FORMAT="xlsx"
RESPECT_ROBOTS=true
USE_CRAWL4AI=false   # flipped to true by check_crawl4ai when the service answers

# Python interpreter selected by find_python / install_python_deps
# (empty until detection runs)
PYTHON_CMD=""
# Print a green [OK] message to stdout.
# Arguments: $1 - message text
print_success() {
    printf '%b\n' "${GREEN}[OK] $1${NC}"
    return 0
}
# Print a blue [INFO] message to stdout.
# Arguments: $1 - message text
print_info() {
    printf '%b\n' "${BLUE}[INFO] $1${NC}"
    return 0
}
# Print a yellow [WARN] message.
# Fix: emit on stderr, not stdout. Callers such as $(create_output_dir ...)
# capture stdout via command substitution, so diagnostics on stdout would
# corrupt captured values.
# Arguments: $1 - message text
print_warning() {
    printf '%b\n' "${YELLOW}[WARN] $1${NC}" >&2
    return 0
}
# Print a red [ERROR] message.
# Fix: emit on stderr, not stdout. Error diagnostics on stdout pollute
# command-substitution capture and pipelines; stderr is the conventional
# channel for them.
# Arguments: $1 - message text
print_error() {
    printf '%b\n' "${RED}[ERROR] $1${NC}" >&2
    return 0
}
# Print a purple "=== title ===" section header to stdout.
# Arguments: $1 - header text
print_header() {
    printf '%b\n' "${PURPLE}=== $1 ===${NC}"
    return 0
}
# Probe the local Crawl4AI service health endpoint.
# Side effects: sets the global USE_CRAWL4AI=true when the service answers.
# Returns: 0 if reachable, 1 otherwise.
check_crawl4ai() {
    if ! curl -s --connect-timeout 2 "${CRAWL4AI_URL}/health" > /dev/null 2>&1; then
        return 1
    fi
    USE_CRAWL4AI=true
    return 0
}
# Locate a Python interpreter that already has the crawler's dependencies
# (aiohttp + beautifulsoup4) importable.
# Fix: the old code hard-coded the Python 3.11 user site-packages path
# (~/Library/Python/3.11/...) even when python3.10/3.12 was selected, so
# user-installed packages were invisible to those interpreters. We now ask
# each candidate for ITS OWN user-site directory via `python -m site`.
# Globals: sets PYTHON_CMD; exports PYTHONPATH extended with the user site.
# Returns: 0 when a usable interpreter is found, 1 otherwise.
find_python() {
    local pythons=("python3.11" "python3.12" "python3.10" "python3")
    local py user_site

    for py in "${pythons[@]}"; do
        if ! command -v "$py" > /dev/null 2>&1; then
            continue
        fi
        # `-m site --user-site` prints the path but may exit non-zero to
        # signal enabled/disabled state — keep the output regardless.
        user_site="$("$py" -m site --user-site 2>/dev/null)" || true
        # Test that the required modules import with the user site on the path
        if PYTHONPATH="${user_site}:${PYTHONPATH:-}" "$py" -c "import aiohttp, bs4" 2>/dev/null; then
            PYTHON_CMD="$py"
            export PYTHONPATH="${user_site}:${PYTHONPATH:-}"
            return 0
        fi
    done
    return 1
}
# Install the crawler's Python dependencies into the user site of the first
# available interpreter.
# Fix: the old code exported a PYTHONPATH hard-coded to the Python 3.11 user
# site directory even when packages were just installed for 3.10/3.12; we now
# derive the path from the interpreter that actually received the install.
# Globals: sets PYTHON_CMD; exports PYTHONPATH on success.
# Returns: 0 on successful install, 1 when no interpreter worked.
install_python_deps() {
    local pythons=("python3.11" "python3.12" "python3.10" "python3")
    local py user_site

    for py in "${pythons[@]}"; do
        if ! command -v "$py" > /dev/null 2>&1; then
            continue
        fi
        print_info "Installing dependencies with $py..."
        if "$py" -m pip install --user aiohttp beautifulsoup4 openpyxl 2>/dev/null; then
            PYTHON_CMD="$py"
            # Point PYTHONPATH at the user site of the interpreter we
            # actually installed into (exit status of `-m site` is ignored;
            # it encodes enabled/disabled state, not failure).
            user_site="$("$py" -m site --user-site 2>/dev/null)" || true
            export PYTHONPATH="${user_site}:${PYTHONPATH:-}"
            return 0
        fi
    done
    return 1
}
# Extract the bare hostname from a URL.
# Generalized: also strips userinfo ("user@host") which the old sed chain
# left in place, and uses pure parameter expansion instead of spawning
# three sed processes per call.
# Arguments: $1 - URL (scheme optional)
# Outputs: hostname on stdout (no scheme, credentials, port, or path)
get_domain() {
    local url="$1"
    url="${url#http://}"    # drop scheme
    url="${url#https://}"
    url="${url%%/*}"        # drop path/query
    url="${url##*@}"        # drop userinfo (user:pass@)
    url="${url%%:*}"        # drop port
    printf '%s\n' "$url"
}
# Create a timestamped crawl-output directory under <base>/<domain>/ and
# refresh the <base>/<domain>/_latest symlink to point at it.
# Arguments: $1 - domain name
#            $2 - output base directory (default: DEFAULT_OUTPUT_DIR)
# Outputs: the new directory path on stdout.
create_output_dir() {
    local domain="$1"
    local base="${2:-$DEFAULT_OUTPUT_DIR}"
    local stamp
    stamp="$(date +%Y-%m-%d_%H%M%S)"

    local target="${base}/${domain}/${stamp}"
    mkdir -p "$target"

    # Recreate the _latest symlink; the target is relative so the whole
    # output tree can be relocated without breaking the link.
    local link="${base}/${domain}/_latest"
    rm -f "$link"
    ln -sf "$stamp" "$link"

    printf '%s\n' "$target"
    return 0
}
save_markdown_with_metadata() {
|
|
150
|
+
local result="$1"
|
|
151
|
+
local full_page_dir="$2"
|
|
152
|
+
local body_only_dir="$3"
|
|
153
|
+
local images_dir="$4"
|
|
154
|
+
local base_domain="$5"
|
|
155
|
+
|
|
156
|
+
# Extract basic info
|
|
157
|
+
local page_url status_code redirected_url success
|
|
158
|
+
page_url=$(printf '%s' "$result" | jq -r '.url // empty')
|
|
159
|
+
status_code=$(printf '%s' "$result" | jq -r '.status_code // 0')
|
|
160
|
+
redirected_url=$(printf '%s' "$result" | jq -r '.redirected_url // empty')
|
|
161
|
+
success=$(printf '%s' "$result" | jq -r '.success // false')
|
|
162
|
+
|
|
163
|
+
# Fix status code: if success=true and we got content, report 200 (final status)
|
|
164
|
+
# Only show redirect status if there was an actual redirect to a different path
|
|
165
|
+
local original_status="$status_code"
|
|
166
|
+
if [[ "$success" == "true" && $status_code -ge 300 && $status_code -lt 400 ]]; then
|
|
167
|
+
# Check if redirect was just trailing slash normalization
|
|
168
|
+
local url_normalized redirect_normalized
|
|
169
|
+
url_normalized=$(echo "$page_url" | sed 's|/$||')
|
|
170
|
+
redirect_normalized=$(echo "$redirected_url" | sed 's|/$||')
|
|
171
|
+
if [[ "$url_normalized" == "$redirect_normalized" ]]; then
|
|
172
|
+
status_code=200 # Trailing slash redirect, content was fetched successfully
|
|
173
|
+
fi
|
|
174
|
+
fi
|
|
175
|
+
|
|
176
|
+
# Extract metadata
|
|
177
|
+
local title meta_desc meta_keywords canonical og_title og_desc og_image
|
|
178
|
+
title=$(printf '%s' "$result" | jq -r '.metadata.title // empty')
|
|
179
|
+
meta_desc=$(printf '%s' "$result" | jq -r '.metadata.description // empty')
|
|
180
|
+
meta_keywords=$(printf '%s' "$result" | jq -r '.metadata.keywords // empty')
|
|
181
|
+
canonical=$(printf '%s' "$result" | jq -r '.metadata."og:url" // empty')
|
|
182
|
+
og_title=$(printf '%s' "$result" | jq -r '.metadata."og:title" // empty')
|
|
183
|
+
og_desc=$(printf '%s' "$result" | jq -r '.metadata."og:description" // empty')
|
|
184
|
+
og_image=$(printf '%s' "$result" | jq -r '.metadata."og:image" // empty')
|
|
185
|
+
|
|
186
|
+
# Extract hreflang (from response headers or HTML - Crawl4AI may have this in metadata)
|
|
187
|
+
local hreflang_json
|
|
188
|
+
hreflang_json=$(printf '%s' "$result" | jq -c '[.metadata | to_entries[] | select(.key | startswith("hreflang")) | {lang: .key, url: .value}]' 2>/dev/null || echo "[]")
|
|
189
|
+
|
|
190
|
+
# Extract JSON-LD schema from HTML
|
|
191
|
+
local schema_json=""
|
|
192
|
+
local html_content
|
|
193
|
+
html_content=$(printf '%s' "$result" | jq -r '.html // empty' 2>/dev/null)
|
|
194
|
+
if [[ -n "$html_content" ]]; then
|
|
195
|
+
# Extract all JSON-LD script blocks
|
|
196
|
+
schema_json=$(echo "$html_content" | grep -o '<script type="application/ld+json"[^>]*>[^<]*</script>' | \
|
|
197
|
+
sed 's/<script type="application\/ld+json"[^>]*>//g' | \
|
|
198
|
+
sed 's/<\/script>//g' | \
|
|
199
|
+
while read -r schema_block; do
|
|
200
|
+
# Pretty print each schema block
|
|
201
|
+
echo "$schema_block" | jq '.' 2>/dev/null
|
|
202
|
+
done)
|
|
203
|
+
fi
|
|
204
|
+
|
|
205
|
+
# Get markdown content
|
|
206
|
+
local markdown_content
|
|
207
|
+
markdown_content=$(printf '%s' "$result" | jq -r '.markdown.raw_markdown // .markdown // empty' 2>/dev/null)
|
|
208
|
+
|
|
209
|
+
[[ -z "$markdown_content" || "$markdown_content" == "null" || "$markdown_content" == "{" ]] && return 0
|
|
210
|
+
|
|
211
|
+
# Generate slug for filename
|
|
212
|
+
local slug
|
|
213
|
+
slug=$(echo "$page_url" | sed -E 's|^https?://[^/]+||' | sed 's|^/||' | sed 's|/$||' | tr '/' '-' | tr '?' '-' | tr '&' '-')
|
|
214
|
+
[[ -z "$slug" ]] && slug="index"
|
|
215
|
+
slug="${slug:0:100}"
|
|
216
|
+
|
|
217
|
+
# Extract and download body images
|
|
218
|
+
local images_json page_images_dir
|
|
219
|
+
images_json=$(printf '%s' "$result" | jq -c '.media.images // []' 2>/dev/null)
|
|
220
|
+
page_images_dir="${images_dir}/${slug}"
|
|
221
|
+
|
|
222
|
+
local downloaded_images
|
|
223
|
+
downloaded_images=()
|
|
224
|
+
local image_count
|
|
225
|
+
image_count=$(echo "$images_json" | jq 'length' 2>/dev/null || echo "0")
|
|
226
|
+
|
|
227
|
+
if [[ $image_count -gt 0 ]]; then
|
|
228
|
+
mkdir -p "$page_images_dir"
|
|
229
|
+
|
|
230
|
+
# Download unique images (skip srcset variants by filtering unique base names)
|
|
231
|
+
local seen_images
|
|
232
|
+
seen_images=()
|
|
233
|
+
for ((j=0; j<image_count && j<20; j++)); do # Limit to 20 images per page
|
|
234
|
+
local img_src img_alt img_filename
|
|
235
|
+
img_src=$(echo "$images_json" | jq -r ".[$j].src // empty")
|
|
236
|
+
img_alt=$(echo "$images_json" | jq -r ".[$j].alt // empty")
|
|
237
|
+
|
|
238
|
+
[[ -z "$img_src" ]] && continue
|
|
239
|
+
|
|
240
|
+
# Skip data URIs and tiny images (likely icons/tracking pixels)
|
|
241
|
+
[[ "$img_src" =~ ^data: ]] && continue
|
|
242
|
+
|
|
243
|
+
# Extract filename from URL
|
|
244
|
+
img_filename=$(basename "$img_src" | sed 's|?.*||' | sed 's|#.*||')
|
|
245
|
+
|
|
246
|
+
# Skip if we've already seen this base image (avoid srcset duplicates)
|
|
247
|
+
local base_img
|
|
248
|
+
base_img=$(echo "$img_filename" | sed -E 's/-[0-9]+x[0-9]+\./\./')
|
|
249
|
+
|
|
250
|
+
local already_seen=false
|
|
251
|
+
if [[ ${#seen_images[@]} -gt 0 ]]; then
|
|
252
|
+
for seen in "${seen_images[@]}"; do
|
|
253
|
+
if [[ "$seen" == "$base_img" ]]; then
|
|
254
|
+
already_seen=true
|
|
255
|
+
break
|
|
256
|
+
fi
|
|
257
|
+
done
|
|
258
|
+
fi
|
|
259
|
+
if [[ "$already_seen" == "true" ]]; then
|
|
260
|
+
continue
|
|
261
|
+
fi
|
|
262
|
+
seen_images+=("$base_img")
|
|
263
|
+
|
|
264
|
+
# Download image (quietly, with timeout)
|
|
265
|
+
if curl -sS -L --max-time 10 -o "${page_images_dir}/${img_filename}" "$img_src" 2>/dev/null; then
|
|
266
|
+
# Only keep if file is > 1KB (skip tracking pixels)
|
|
267
|
+
local file_size
|
|
268
|
+
file_size=$(stat -f%z "${page_images_dir}/${img_filename}" 2>/dev/null || echo "0")
|
|
269
|
+
if [[ $file_size -gt 1024 ]]; then
|
|
270
|
+
downloaded_images+=("${img_filename}|${img_src}|${img_alt}")
|
|
271
|
+
else
|
|
272
|
+
rm -f "${page_images_dir}/${img_filename}"
|
|
273
|
+
fi
|
|
274
|
+
fi
|
|
275
|
+
done
|
|
276
|
+
|
|
277
|
+
# Remove empty directory if no images downloaded
|
|
278
|
+
rmdir "${page_images_dir}" 2>/dev/null || true
|
|
279
|
+
fi
|
|
280
|
+
|
|
281
|
+
# Build YAML frontmatter
|
|
282
|
+
local frontmatter="---
|
|
283
|
+
url: \"${page_url}\"
|
|
284
|
+
status_code: ${status_code}"
|
|
285
|
+
|
|
286
|
+
# Add redirect info only if it was a real redirect (not just trailing slash)
|
|
287
|
+
if [[ $original_status -ge 300 && $original_status -lt 400 && "$status_code" != "$original_status" ]]; then
|
|
288
|
+
frontmatter+="
|
|
289
|
+
redirect_status: ${original_status}
|
|
290
|
+
redirected_to: \"${redirected_url}\""
|
|
291
|
+
elif [[ -n "$redirected_url" && "$redirected_url" != "$page_url" && "$redirected_url" != "null" ]]; then
|
|
292
|
+
frontmatter+="
|
|
293
|
+
redirected_to: \"${redirected_url}\""
|
|
294
|
+
fi
|
|
295
|
+
|
|
296
|
+
# Add SEO metadata (use || true to prevent set -e exit on false conditions)
|
|
297
|
+
if [[ -n "$title" && "$title" != "null" ]]; then
|
|
298
|
+
frontmatter+="
|
|
299
|
+
title: \"$(echo "$title" | sed 's/"/\\"/g')\""
|
|
300
|
+
fi
|
|
301
|
+
|
|
302
|
+
if [[ -n "$meta_desc" && "$meta_desc" != "null" ]]; then
|
|
303
|
+
frontmatter+="
|
|
304
|
+
description: \"$(echo "$meta_desc" | sed 's/"/\\"/g')\""
|
|
305
|
+
fi
|
|
306
|
+
|
|
307
|
+
if [[ -n "$meta_keywords" && "$meta_keywords" != "null" ]]; then
|
|
308
|
+
frontmatter+="
|
|
309
|
+
keywords: \"$(echo "$meta_keywords" | sed 's/"/\\"/g')\""
|
|
310
|
+
fi
|
|
311
|
+
|
|
312
|
+
if [[ -n "$canonical" && "$canonical" != "null" ]]; then
|
|
313
|
+
frontmatter+="
|
|
314
|
+
canonical: \"${canonical}\""
|
|
315
|
+
fi
|
|
316
|
+
|
|
317
|
+
# Add Open Graph data
|
|
318
|
+
if [[ -n "$og_title" && "$og_title" != "null" && "$og_title" != "$title" ]]; then
|
|
319
|
+
frontmatter+="
|
|
320
|
+
og_title: \"$(echo "$og_title" | sed 's/"/\\"/g')\""
|
|
321
|
+
fi
|
|
322
|
+
|
|
323
|
+
if [[ -n "$og_image" && "$og_image" != "null" ]]; then
|
|
324
|
+
frontmatter+="
|
|
325
|
+
og_image: \"${og_image}\""
|
|
326
|
+
fi
|
|
327
|
+
|
|
328
|
+
# Add hreflang if present
|
|
329
|
+
if [[ "$hreflang_json" != "[]" && "$hreflang_json" != "null" ]]; then
|
|
330
|
+
local hreflang_yaml
|
|
331
|
+
hreflang_yaml=$(echo "$hreflang_json" | jq -r '.[] | " - lang: \"\(.lang)\"\n url: \"\(.url)\""' 2>/dev/null)
|
|
332
|
+
if [[ -n "$hreflang_yaml" ]]; then
|
|
333
|
+
frontmatter+="
|
|
334
|
+
hreflang:
|
|
335
|
+
${hreflang_yaml}"
|
|
336
|
+
fi
|
|
337
|
+
fi
|
|
338
|
+
|
|
339
|
+
# Add downloaded images list
|
|
340
|
+
if [[ -n "${downloaded_images[*]:-}" ]]; then
|
|
341
|
+
frontmatter+="
|
|
342
|
+
images:"
|
|
343
|
+
for img_info in "${downloaded_images[@]}"; do
|
|
344
|
+
local img_file img_url img_alt_text
|
|
345
|
+
img_file=$(echo "$img_info" | cut -d'|' -f1)
|
|
346
|
+
img_url=$(echo "$img_info" | cut -d'|' -f2)
|
|
347
|
+
img_alt_text=$(echo "$img_info" | cut -d'|' -f3 | sed 's/"/\\"/g')
|
|
348
|
+
frontmatter+="
|
|
349
|
+
- file: \"${img_file}\"
|
|
350
|
+
original_url: \"${img_url}\""
|
|
351
|
+
if [[ -n "$img_alt_text" ]]; then
|
|
352
|
+
frontmatter+="
|
|
353
|
+
alt: \"${img_alt_text}\""
|
|
354
|
+
fi
|
|
355
|
+
done
|
|
356
|
+
fi
|
|
357
|
+
|
|
358
|
+
# Add crawl timestamp
|
|
359
|
+
frontmatter+="
|
|
360
|
+
crawled_at: \"$(date -Iseconds)\"
|
|
361
|
+
---"
|
|
362
|
+
|
|
363
|
+
# Update markdown image references to point to local files
|
|
364
|
+
local updated_markdown="$markdown_content"
|
|
365
|
+
for img_info in "${downloaded_images[@]+"${downloaded_images[@]}"}"; do
|
|
366
|
+
[[ -z "$img_info" ]] && continue
|
|
367
|
+
local img_file img_url
|
|
368
|
+
img_file=$(echo "$img_info" | cut -d'|' -f1)
|
|
369
|
+
img_url=$(echo "$img_info" | cut -d'|' -f2)
|
|
370
|
+
# Replace remote URL with local path
|
|
371
|
+
updated_markdown=$(echo "$updated_markdown" | sed "s|${img_url}|../images/${slug}/${img_file}|g")
|
|
372
|
+
done
|
|
373
|
+
|
|
374
|
+
# Extract body-only content (remove nav, header, footer, cookie notices)
|
|
375
|
+
local body_markdown
|
|
376
|
+
body_markdown=$(extract_body_content "$updated_markdown")
|
|
377
|
+
|
|
378
|
+
# Write the FULL PAGE markdown file
|
|
379
|
+
{
|
|
380
|
+
echo "$frontmatter"
|
|
381
|
+
echo ""
|
|
382
|
+
echo "$updated_markdown"
|
|
383
|
+
|
|
384
|
+
# Append schema markup if found
|
|
385
|
+
if [[ -n "$schema_json" ]]; then
|
|
386
|
+
echo ""
|
|
387
|
+
echo "---"
|
|
388
|
+
echo ""
|
|
389
|
+
echo "## Structured Data (JSON-LD)"
|
|
390
|
+
echo ""
|
|
391
|
+
echo '```json'
|
|
392
|
+
echo "$schema_json"
|
|
393
|
+
echo '```'
|
|
394
|
+
fi
|
|
395
|
+
} > "${full_page_dir}/${slug}.md"
|
|
396
|
+
|
|
397
|
+
# Write the BODY ONLY markdown file (no schema - just content)
|
|
398
|
+
{
|
|
399
|
+
echo "$frontmatter"
|
|
400
|
+
echo ""
|
|
401
|
+
echo "$body_markdown"
|
|
402
|
+
} > "${body_only_dir}/${slug}.md"
|
|
403
|
+
|
|
404
|
+
return 0
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
# Extract body content from markdown (remove nav, header, footer, cookie notices)
# Site-agnostic approach - optimized for performance
#
# Arguments:
#   $1 - full markdown document as a single string
# Outputs:
#   body-only markdown on stdout: everything from the first heading up to
#   (but not including) the first footer/cookie/"powered by" marker line
extract_body_content() {
    local doc="$1"

    # Single awk pass: flip body_seen at the first heading, flip in_footer at
    # the first footer-ish marker, and print only the lines in between.
    printf '%s\n' "$doc" | awk '
        BEGIN { body_seen = 0; in_footer = 0 }

        # Body starts at the first heading of any level
        !body_seen && /^#+ / { body_seen = 1 }
        !body_seen { next }

        # Footer / legal markers
        /^##* *[Ff]ooter/                                         { in_footer = 1 }
        /©|Copyright|\(c\) *20[0-9][0-9]/                         { in_footer = 1 }
        /All rights reserved|Alle Rechte vorbehalten|Tous droits/ { in_footer = 1 }
        /^##* *(References|Références|Referenzen)$/               { in_footer = 1 }

        # Cookie / privacy-banner markers
        /[Cc]ookie.*(consent|settings|preferences|policy)/ { in_footer = 1 }
        /GDPR|CCPA|LGPD/                                   { in_footer = 1 }
        /[Pp]rivacy [Oo]verview/                           { in_footer = 1 }
        /[Ss]trictly [Nn]ecessary [Cc]ookie/               { in_footer = 1 }

        # Site-builder credit lines
        /[Pp]owered by|[Bb]uilt with|[Mm]ade with/ { in_footer = 1 }

        # Everything after a footer marker (including the marker) is dropped
        in_footer { next }
        { print }
    '
}
|
|
450
|
+
|
|
451
|
+
# Crawl using Crawl4AI API with multi-page discovery
#
# Breadth-first crawl driven by the Crawl4AI HTTP service: URLs are taken
# from a queue file in batches, POSTed to /crawl, each result is saved as
# markdown, and newly discovered internal links are appended to the queue.
# Globals:
#   CRAWL4AI_URL (read) - base URL of the Crawl4AI service
# Arguments:
#   $1 - start URL
#   $2 - output directory
#   $3 - maximum number of pages to crawl
#   $4 - maximum link-follow depth
# Outputs:
#   markdown/images/reports under $2; progress messages via print_* helpers
# Returns:
#   0
crawl_with_crawl4ai() {
    local url="$1"
    local output_dir="$2"
    local max_urls="$3"
    local depth="$4"

    print_info "Using Crawl4AI backend..."

    # Create content directories
    local full_page_dir="${output_dir}/content-full-page-md"
    local body_only_dir="${output_dir}/content-body-md"
    local images_dir="${output_dir}/images"
    mkdir -p "$full_page_dir" "$body_only_dir" "$images_dir"

    # Extract base domain (host only) for internal link filtering
    local base_domain
    base_domain=$(echo "$url" | sed -E 's|^https?://||' | sed -E 's|/.*||')

    # Track crawl state in temp files so it survives subshells/pipelines
    local visited_file="${output_dir}/.visited_urls"
    local queue_file="${output_dir}/.queue_urls"
    local results_file="${output_dir}/.results.jsonl"

    echo "$url" > "$queue_file"
    touch "$visited_file"
    touch "$results_file"

    local crawled_count=0
    local current_depth=0

    print_info "Starting multi-page crawl (max: $max_urls, depth: $depth)"

    while [[ $crawled_count -lt $max_urls ]] && [[ -s "$queue_file" ]]; do
        # Get next batch of URLs (up to 5 at a time for efficiency)
        local batch_size=5
        local remaining=$((max_urls - crawled_count))
        [[ $remaining -lt $batch_size ]] && batch_size=$remaining

        local batch_urls=()
        local batch_count=0

        while IFS= read -r queue_url && [[ $batch_count -lt $batch_size ]]; do
            # Skip if already visited
            if grep -qxF "$queue_url" "$visited_file" 2>/dev/null; then
                continue
            fi
            batch_urls+=("$queue_url")
            echo "$queue_url" >> "$visited_file"
            # $((...)) instead of ((batch_count++)): the post-increment form
            # returns status 1 when the old value is 0 and aborts under set -e
            batch_count=$((batch_count + 1))
        done < "$queue_file"

        # Remove processed (now-visited) URLs from the queue
        if [[ ${#batch_urls[@]} -gt 0 ]]; then
            local new_queue
            new_queue=$(mktemp)
            while IFS= read -r queue_url; do
                if ! grep -qxF "$queue_url" "$visited_file" 2>/dev/null; then
                    echo "$queue_url"
                fi
            done < "$queue_file" > "$new_queue"
            mv "$new_queue" "$queue_file"
        fi

        [[ ${#batch_urls[@]} -eq 0 ]] && break

        # Build JSON array of URLs
        local urls_json="["
        local first=true
        for batch_url in "${batch_urls[@]}"; do
            [[ "$first" != "true" ]] && urls_json+=","
            urls_json+="\"$batch_url\""
            first=false
        done
        urls_json+="]"

        print_info "[${crawled_count}/${max_urls}] Crawling batch of ${#batch_urls[@]} URLs..."

        # Submit crawl job to Crawl4AI
        local response
        response=$(curl -s -X POST "${CRAWL4AI_URL}/crawl" \
            --max-time 120 \
            -H "Content-Type: application/json" \
            -d "{
                \"urls\": $urls_json,
                \"crawler_config\": {
                    \"type\": \"CrawlerRunConfig\",
                    \"params\": {
                        \"cache_mode\": \"bypass\",
                        \"word_count_threshold\": 10,
                        \"page_timeout\": 30000
                    }
                }
            }" 2>/dev/null)

        if [[ -z "$response" ]]; then
            print_warning "No response from Crawl4AI for batch, skipping..."
            continue
        fi

        # Process results (requires jq; without it we still count the batch
        # as attempted via the loop condition, but cannot parse results)
        if command -v jq &> /dev/null; then
            local result_count
            result_count=$(echo "$response" | jq -r '.results | length' 2>/dev/null || echo "0")

            for ((i=0; i<result_count; i++)); do
                local result
                result=$(echo "$response" | jq -c ".results[$i]" 2>/dev/null)
                [[ -z "$result" || "$result" == "null" ]] && continue

                # Append to results file
                echo "$result" >> "$results_file"
                # set -e safe increment (see note on batch_count above)
                crawled_count=$((crawled_count + 1))

                local page_url status_code
                page_url=$(printf '%s' "$result" | jq -r '.url // empty')
                status_code=$(printf '%s' "$result" | jq -r '.status_code // 0')

                print_info "  [${crawled_count}] ${status_code} ${page_url:0:60}"

                # Save markdown content with rich metadata frontmatter (non-fatal if it fails)
                save_markdown_with_metadata "$result" "$full_page_dir" "$body_only_dir" "$images_dir" "$base_domain" || true

                # Extract internal links for queue (if under depth limit)
                if [[ $current_depth -lt $depth ]]; then
                    local links
                    links=$(printf '%s' "$result" | jq -r '.links.internal[]?.href // empty' 2>/dev/null | head -50)

                    while IFS= read -r link; do
                        [[ -z "$link" ]] && continue
                        # Normalize URL (root-relative -> absolute; drop other
                        # non-http schemes)
                        if [[ "$link" =~ ^/ ]]; then
                            link="https://${base_domain}${link}"
                        elif [[ ! "$link" =~ ^https?:// ]]; then
                            continue
                        fi
                        # Only queue links whose host matches exactly. The old
                        # check ([[ "$link" =~ $base_domain ]]) was an
                        # unanchored regex where '.' matched any character, so
                        # substrings/external hosts could slip through.
                        local link_host
                        link_host=$(echo "$link" | sed -E 's|^https?://||' | sed -E 's|[/?#].*||')
                        if [[ "$link_host" == "$base_domain" ]]; then
                            # Remove fragments and trailing slash
                            link=$(echo "$link" | sed 's|#.*||' | sed 's|/$||')
                            # Add to queue if not visited
                            if ! grep -qxF "$link" "$visited_file" 2>/dev/null; then
                                echo "$link" >> "$queue_file"
                            fi
                        fi
                    done <<< "$links"
                fi
            done
        fi

        # set -e safe increment (see note on batch_count above)
        current_depth=$((current_depth + 1))
    done

    print_info "Crawl complete. Processing results..."

    # Generate CSV and XLSX from results
    crawl4ai_generate_reports "$output_dir" "$results_file" "$base_domain"

    # Cleanup temp files
    rm -f "$visited_file" "$queue_file"

    # Count markdown files and images
    local full_page_count body_count img_count
    full_page_count=$(find "$full_page_dir" -name "*.md" 2>/dev/null | wc -l | tr -d ' ')
    body_count=$(find "$body_only_dir" -name "*.md" 2>/dev/null | wc -l | tr -d ' ')
    img_count=$(find "$images_dir" -type f \( -name "*.jpg" -o -name "*.jpeg" -o -name "*.png" -o -name "*.gif" -o -name "*.webp" -o -name "*.svg" \) 2>/dev/null | wc -l | tr -d ' ')

    print_success "Crawl4AI results saved to ${output_dir}"
    print_info "  Pages crawled: $crawled_count"
    print_info "  Full page markdown: $full_page_count (in content-full-page-md/)"
    print_info "  Body-only markdown: $body_count (in content-body-md/)"
    print_info "  Images downloaded: $img_count (in images/)"
    return 0
}
|
|
626
|
+
|
|
627
|
+
# Generate reports from Crawl4AI results
#
# Reads the JSONL results file produced by crawl_with_crawl4ai and writes
# crawl-data.csv, broken-links.csv, meta-issues.csv, summary.json and
# (when Python + openpyxl are available) crawl-data.xlsx into $1.
# Globals:
#   PYTHON_CMD (read; set by find_python) - interpreter for XLSX export
# Arguments:
#   $1 - output directory
#   $2 - JSONL results file (one Crawl4AI result object per line)
#   $3 - base domain of the crawl (for summary.json)
# Returns:
#   0 (also when the results file is empty)
crawl4ai_generate_reports() {
    local output_dir="$1"
    local results_file="$2"
    local base_domain="$3"

    [[ ! -s "$results_file" ]] && return 0

    # Generate CSV
    local csv_file="${output_dir}/crawl-data.csv"
    echo "url,status_code,status,title,title_length,meta_description,description_length,h1,h1_count,canonical,meta_robots,word_count,response_time_ms,crawl_depth,internal_links,external_links,images,images_missing_alt" > "$csv_file"

    local broken_links=()
    local meta_issues=()
    local status_codes=()
    # NOTE: redirects are not reported by this backend (summary hardcodes 0)

    while IFS= read -r result; do
        [[ -z "$result" ]] && continue

        local url status_code title meta_desc h1 canonical word_count
        url=$(printf '%s' "$result" | jq -r '.url // ""')
        status_code=$(printf '%s' "$result" | jq -r '.status_code // 0')
        # Strip commas AND double quotes: these values are embedded verbatim
        # both in naive CSV rows and in the hand-built JSON objects below,
        # where an unescaped '"' would corrupt the row / make the JSON
        # unparsable by jq
        title=$(printf '%s' "$result" | jq -r '.metadata.title // .title // ""' | tr ',' ';' | tr -d '"' | head -c 200)
        meta_desc=$(printf '%s' "$result" | jq -r '.metadata.description // ""' | tr ',' ';' | tr -d '"' | head -c 300)
        h1=$(printf '%s' "$result" | jq -r '.metadata.h1 // ""' | tr ',' ';' | tr -d '"' | head -c 200)
        canonical=$(printf '%s' "$result" | jq -r '.metadata.canonical // ""')
        word_count=$(printf '%s' "$result" | jq -r '.word_count // 0')

        local title_len=${#title}
        local desc_len=${#meta_desc}
        local status="OK"
        [[ $status_code -ge 300 && $status_code -lt 400 ]] && status="Redirect"
        [[ $status_code -ge 400 ]] && status="Error"

        # Count links
        local internal_links external_links
        internal_links=$(printf '%s' "$result" | jq -r '.links.internal | length // 0' 2>/dev/null || echo "0")
        external_links=$(printf '%s' "$result" | jq -r '.links.external | length // 0' 2>/dev/null || echo "0")

        # Write CSV row (fields crawl4ai does not provide are zero-filled)
        echo "\"$url\",$status_code,\"$status\",\"$title\",$title_len,\"$meta_desc\",$desc_len,\"$h1\",1,\"$canonical\",\"\",$word_count,0,0,$internal_links,$external_links,0,0" >> "$csv_file"

        # Track status codes
        status_codes+=("$status_code")

        # Track broken links
        if [[ $status_code -ge 400 ]]; then
            broken_links+=("{\"url\":\"$url\",\"status_code\":$status_code,\"source\":\"direct\"}")
        fi

        # Track meta issues
        local issues=""
        [[ -z "$title" ]] && issues+="Missing title; "
        [[ $title_len -gt 60 ]] && issues+="Title too long; "
        [[ -z "$meta_desc" ]] && issues+="Missing description; "
        [[ $desc_len -gt 160 ]] && issues+="Description too long; "
        [[ -z "$h1" ]] && issues+="Missing H1; "

        if [[ -n "$issues" ]]; then
            meta_issues+=("{\"url\":\"$url\",\"title\":\"${title:0:50}\",\"h1\":\"${h1:0:50}\",\"issues\":\"${issues%%; }\"}")
        fi
    done < "$results_file"

    print_info "Generated: $csv_file"

    # Generate broken-links.csv
    if [[ ${#broken_links[@]} -gt 0 ]]; then
        local broken_file="${output_dir}/broken-links.csv"
        echo "url,status_code,source" > "$broken_file"
        for bl in "${broken_links[@]}"; do
            local bl_url bl_code bl_src
            bl_url=$(echo "$bl" | jq -r '.url')
            bl_code=$(echo "$bl" | jq -r '.status_code')
            bl_src=$(echo "$bl" | jq -r '.source')
            echo "\"$bl_url\",$bl_code,\"$bl_src\"" >> "$broken_file"
        done
        print_info "Generated: $broken_file"
    fi

    # Generate meta-issues.csv
    if [[ ${#meta_issues[@]} -gt 0 ]]; then
        local issues_file="${output_dir}/meta-issues.csv"
        echo "url,title,h1,issues" > "$issues_file"
        for mi in "${meta_issues[@]}"; do
            local mi_url mi_title mi_h1 mi_issues
            mi_url=$(echo "$mi" | jq -r '.url')
            mi_title=$(echo "$mi" | jq -r '.title')
            mi_h1=$(echo "$mi" | jq -r '.h1')
            mi_issues=$(echo "$mi" | jq -r '.issues')
            echo "\"$mi_url\",\"$mi_title\",\"$mi_h1\",\"$mi_issues\"" >> "$issues_file"
        done
        print_info "Generated: $issues_file"
    fi

    # Generate summary.json
    local total_pages=${#status_codes[@]}
    local summary_file="${output_dir}/summary.json"

    # Count status codes. $((...)) assignments instead of ((var++)): the
    # post-increment form returns non-zero on the first increment (old value
    # 0) and would abort the script under set -e.
    local code_200=0 code_301=0 code_302=0 code_404=0 code_500=0 code_other=0
    for code in ${status_codes[@]+"${status_codes[@]}"}; do
        case "$code" in
            200) code_200=$((code_200 + 1)) ;;
            301) code_301=$((code_301 + 1)) ;;
            302) code_302=$((code_302 + 1)) ;;
            404) code_404=$((code_404 + 1)) ;;
            500) code_500=$((code_500 + 1)) ;;
            *) code_other=$((code_other + 1)) ;;
        esac
    done

    cat > "$summary_file" << EOF
{
  "crawl_date": "$(date -Iseconds)",
  "base_url": "https://${base_domain}",
  "backend": "crawl4ai",
  "pages_crawled": $total_pages,
  "broken_links": ${#broken_links[@]},
  "redirects": 0,
  "meta_issues": ${#meta_issues[@]},
  "status_codes": {
    "200": $code_200,
    "301": $code_301,
    "302": $code_302,
    "404": $code_404,
    "500": $code_500,
    "other": $code_other
  }
}
EOF
    print_info "Generated: $summary_file"

    # Generate XLSX if Python available
    if find_python && "$PYTHON_CMD" -c "import openpyxl" 2>/dev/null; then
        local xlsx_script
        xlsx_script=$(mktemp /tmp/xlsx_gen_XXXXXX.py)
        # Quoted delimiter: the Python source is written out verbatim
        cat > "$xlsx_script" << 'PYXLSX'
import sys
import csv
import openpyxl
from openpyxl.styles import Font, PatternFill
from pathlib import Path

csv_file = Path(sys.argv[1])
xlsx_file = csv_file.with_suffix('.xlsx')

wb = openpyxl.Workbook()
ws = wb.active
ws.title = "Crawl Data"

with open(csv_file, 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row_num, row in enumerate(reader, 1):
        for col_num, value in enumerate(row, 1):
            cell = ws.cell(row=row_num, column=col_num, value=value)
            if row_num == 1:
                cell.font = Font(bold=True)
                cell.fill = PatternFill(start_color="DAEEF3", end_color="DAEEF3", fill_type="solid")

wb.save(xlsx_file)
print(f"Generated: {xlsx_file}")
PYXLSX
        "$PYTHON_CMD" "$xlsx_script" "$csv_file" 2>/dev/null || true
        rm -f "$xlsx_script"
    fi

    return 0
}
|
|
796
|
+
|
|
797
|
+
# Lightweight Python crawler (fallback)
|
|
798
|
+
generate_fallback_crawler() {
|
|
799
|
+
cat << 'PYTHON_CRAWLER'
|
|
800
|
+
#!/usr/bin/env python3
|
|
801
|
+
"""
|
|
802
|
+
Lightweight SEO Site Crawler
|
|
803
|
+
Fallback when Crawl4AI is not available
|
|
804
|
+
"""
|
|
805
|
+
|
|
806
|
+
import asyncio
|
|
807
|
+
import aiohttp
|
|
808
|
+
import csv
|
|
809
|
+
import json
|
|
810
|
+
import hashlib
|
|
811
|
+
import sys
|
|
812
|
+
from datetime import datetime
|
|
813
|
+
from pathlib import Path
|
|
814
|
+
from urllib.parse import urljoin, urlparse
|
|
815
|
+
from collections import defaultdict
|
|
816
|
+
from dataclasses import dataclass, asdict
|
|
817
|
+
from bs4 import BeautifulSoup
|
|
818
|
+
|
|
819
|
+
try:
|
|
820
|
+
import openpyxl
|
|
821
|
+
from openpyxl.styles import Font, PatternFill, Alignment
|
|
822
|
+
HAS_XLSX = True
|
|
823
|
+
except ImportError:
|
|
824
|
+
HAS_XLSX = False
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
@dataclass
|
|
828
|
+
class PageData:
|
|
829
|
+
url: str
|
|
830
|
+
status_code: int = 0
|
|
831
|
+
status: str = ""
|
|
832
|
+
title: str = ""
|
|
833
|
+
title_length: int = 0
|
|
834
|
+
meta_description: str = ""
|
|
835
|
+
description_length: int = 0
|
|
836
|
+
h1: str = ""
|
|
837
|
+
h1_count: int = 0
|
|
838
|
+
canonical: str = ""
|
|
839
|
+
meta_robots: str = ""
|
|
840
|
+
word_count: int = 0
|
|
841
|
+
response_time_ms: float = 0.0
|
|
842
|
+
crawl_depth: int = 0
|
|
843
|
+
internal_links: int = 0
|
|
844
|
+
external_links: int = 0
|
|
845
|
+
images: int = 0
|
|
846
|
+
images_missing_alt: int = 0
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
class SiteCrawler:
|
|
850
|
+
def __init__(self, base_url: str, max_urls: int = 100, max_depth: int = 3, delay_ms: int = 100):
|
|
851
|
+
self.base_url = base_url.rstrip('/')
|
|
852
|
+
self.base_domain = urlparse(base_url).netloc
|
|
853
|
+
self.max_urls = max_urls
|
|
854
|
+
self.max_depth = max_depth
|
|
855
|
+
self.delay = delay_ms / 1000.0
|
|
856
|
+
|
|
857
|
+
self.visited = set()
|
|
858
|
+
self.queue = [(self.base_url, 0)]
|
|
859
|
+
self.pages = []
|
|
860
|
+
self.broken_links = []
|
|
861
|
+
self.redirects = []
|
|
862
|
+
|
|
863
|
+
def is_internal(self, url: str) -> bool:
|
|
864
|
+
parsed = urlparse(url)
|
|
865
|
+
return parsed.netloc == self.base_domain or parsed.netloc == ""
|
|
866
|
+
|
|
867
|
+
def normalize_url(self, url: str, base: str) -> str:
|
|
868
|
+
url = urljoin(base, url)
|
|
869
|
+
parsed = urlparse(url)
|
|
870
|
+
normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
|
|
871
|
+
if parsed.query:
|
|
872
|
+
normalized += f"?{parsed.query}"
|
|
873
|
+
return normalized.rstrip('/')
|
|
874
|
+
|
|
875
|
+
async def fetch_page(self, session: aiohttp.ClientSession, url: str, depth: int) -> PageData:
|
|
876
|
+
page = PageData(url=url, crawl_depth=depth)
|
|
877
|
+
|
|
878
|
+
try:
|
|
879
|
+
start = datetime.now()
|
|
880
|
+
async with session.get(url, allow_redirects=True, timeout=aiohttp.ClientTimeout(total=15)) as response:
|
|
881
|
+
page.status_code = response.status
|
|
882
|
+
page.response_time_ms = (datetime.now() - start).total_seconds() * 1000
|
|
883
|
+
|
|
884
|
+
# Track redirects
|
|
885
|
+
if response.history:
|
|
886
|
+
for r in response.history:
|
|
887
|
+
self.redirects.append({
|
|
888
|
+
'original_url': str(r.url),
|
|
889
|
+
'status_code': r.status,
|
|
890
|
+
'redirect_url': str(response.url)
|
|
891
|
+
})
|
|
892
|
+
|
|
893
|
+
page.status = "OK" if response.status < 300 else ("Redirect" if response.status < 400 else "Error")
|
|
894
|
+
|
|
895
|
+
if response.status >= 400:
|
|
896
|
+
self.broken_links.append({'url': url, 'status_code': response.status, 'source': 'direct'})
|
|
897
|
+
return page
|
|
898
|
+
|
|
899
|
+
content_type = response.headers.get('Content-Type', '')
|
|
900
|
+
if 'text/html' not in content_type:
|
|
901
|
+
return page
|
|
902
|
+
|
|
903
|
+
html = await response.text()
|
|
904
|
+
soup = BeautifulSoup(html, 'html.parser')
|
|
905
|
+
|
|
906
|
+
# Title
|
|
907
|
+
if soup.title:
|
|
908
|
+
page.title = soup.title.get_text(strip=True)[:200]
|
|
909
|
+
page.title_length = len(page.title)
|
|
910
|
+
|
|
911
|
+
# Meta description
|
|
912
|
+
meta_desc = soup.find('meta', attrs={'name': 'description'})
|
|
913
|
+
if meta_desc:
|
|
914
|
+
page.meta_description = meta_desc.get('content', '')[:300]
|
|
915
|
+
page.description_length = len(page.meta_description)
|
|
916
|
+
|
|
917
|
+
# Meta robots
|
|
918
|
+
meta_robots = soup.find('meta', attrs={'name': 'robots'})
|
|
919
|
+
if meta_robots:
|
|
920
|
+
page.meta_robots = meta_robots.get('content', '')
|
|
921
|
+
|
|
922
|
+
# Canonical
|
|
923
|
+
canonical = soup.find('link', attrs={'rel': 'canonical'})
|
|
924
|
+
if canonical:
|
|
925
|
+
page.canonical = canonical.get('href', '')
|
|
926
|
+
|
|
927
|
+
# H1
|
|
928
|
+
h1_tags = soup.find_all('h1')
|
|
929
|
+
page.h1_count = len(h1_tags)
|
|
930
|
+
if h1_tags:
|
|
931
|
+
page.h1 = h1_tags[0].get_text(strip=True)[:200]
|
|
932
|
+
|
|
933
|
+
# Word count
|
|
934
|
+
text = soup.get_text(separator=' ', strip=True)
|
|
935
|
+
page.word_count = len(text.split())
|
|
936
|
+
|
|
937
|
+
# Images
|
|
938
|
+
images = soup.find_all('img')
|
|
939
|
+
page.images = len(images)
|
|
940
|
+
page.images_missing_alt = sum(1 for img in images if not img.get('alt'))
|
|
941
|
+
|
|
942
|
+
# Links
|
|
943
|
+
internal_count = 0
|
|
944
|
+
external_count = 0
|
|
945
|
+
|
|
946
|
+
for link in soup.find_all('a', href=True):
|
|
947
|
+
href = link.get('href', '')
|
|
948
|
+
if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
|
|
949
|
+
continue
|
|
950
|
+
|
|
951
|
+
target_url = self.normalize_url(href, url)
|
|
952
|
+
|
|
953
|
+
if self.is_internal(target_url):
|
|
954
|
+
internal_count += 1
|
|
955
|
+
if target_url not in self.visited and depth < self.max_depth:
|
|
956
|
+
self.queue.append((target_url, depth + 1))
|
|
957
|
+
else:
|
|
958
|
+
external_count += 1
|
|
959
|
+
|
|
960
|
+
page.internal_links = internal_count
|
|
961
|
+
page.external_links = external_count
|
|
962
|
+
|
|
963
|
+
except asyncio.TimeoutError:
|
|
964
|
+
page.status = "Timeout"
|
|
965
|
+
except Exception as e:
|
|
966
|
+
page.status = f"Error: {str(e)[:50]}"
|
|
967
|
+
|
|
968
|
+
return page
|
|
969
|
+
|
|
970
|
+
async def crawl(self):
|
|
971
|
+
connector = aiohttp.TCPConnector(limit=5)
|
|
972
|
+
headers = {'User-Agent': 'AIDevOps-SiteCrawler/2.0'}
|
|
973
|
+
|
|
974
|
+
async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
|
|
975
|
+
while self.queue and len(self.visited) < self.max_urls:
|
|
976
|
+
url, depth = self.queue.pop(0)
|
|
977
|
+
|
|
978
|
+
if url in self.visited:
|
|
979
|
+
continue
|
|
980
|
+
|
|
981
|
+
self.visited.add(url)
|
|
982
|
+
page = await self.fetch_page(session, url, depth)
|
|
983
|
+
self.pages.append(page)
|
|
984
|
+
|
|
985
|
+
print(f"[{len(self.pages)}/{self.max_urls}] {page.status_code or 'ERR'} {url[:70]}")
|
|
986
|
+
|
|
987
|
+
await asyncio.sleep(self.delay)
|
|
988
|
+
|
|
989
|
+
return self.pages
|
|
990
|
+
|
|
991
|
+
def export(self, output_dir: Path, domain: str, fmt: str = "xlsx"):
|
|
992
|
+
output_dir = Path(output_dir)
|
|
993
|
+
|
|
994
|
+
# CSV export
|
|
995
|
+
csv_file = output_dir / "crawl-data.csv"
|
|
996
|
+
fieldnames = list(PageData.__dataclass_fields__.keys())
|
|
997
|
+
|
|
998
|
+
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
|
|
999
|
+
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
|
1000
|
+
writer.writeheader()
|
|
1001
|
+
for page in self.pages:
|
|
1002
|
+
writer.writerow(asdict(page))
|
|
1003
|
+
print(f"Exported: {csv_file}")
|
|
1004
|
+
|
|
1005
|
+
# XLSX export
|
|
1006
|
+
if fmt in ("xlsx", "all") and HAS_XLSX:
|
|
1007
|
+
xlsx_file = output_dir / "crawl-data.xlsx"
|
|
1008
|
+
wb = openpyxl.Workbook()
|
|
1009
|
+
ws = wb.active
|
|
1010
|
+
ws.title = "Crawl Data"
|
|
1011
|
+
|
|
1012
|
+
# Headers
|
|
1013
|
+
for col, field in enumerate(fieldnames, 1):
|
|
1014
|
+
cell = ws.cell(row=1, column=col, value=field.replace('_', ' ').title())
|
|
1015
|
+
cell.font = Font(bold=True)
|
|
1016
|
+
|
|
1017
|
+
# Data
|
|
1018
|
+
for row, page in enumerate(self.pages, 2):
|
|
1019
|
+
for col, field in enumerate(fieldnames, 1):
|
|
1020
|
+
ws.cell(row=row, column=col, value=getattr(page, field))
|
|
1021
|
+
|
|
1022
|
+
wb.save(xlsx_file)
|
|
1023
|
+
print(f"Exported: {xlsx_file}")
|
|
1024
|
+
|
|
1025
|
+
# Broken links
|
|
1026
|
+
if self.broken_links:
|
|
1027
|
+
broken_file = output_dir / "broken-links.csv"
|
|
1028
|
+
with open(broken_file, 'w', newline='', encoding='utf-8') as f:
|
|
1029
|
+
writer = csv.DictWriter(f, fieldnames=['url', 'status_code', 'source'])
|
|
1030
|
+
writer.writeheader()
|
|
1031
|
+
writer.writerows(self.broken_links)
|
|
1032
|
+
print(f"Exported: {broken_file}")
|
|
1033
|
+
|
|
1034
|
+
# Redirects
|
|
1035
|
+
if self.redirects:
|
|
1036
|
+
redirects_file = output_dir / "redirects.csv"
|
|
1037
|
+
with open(redirects_file, 'w', newline='', encoding='utf-8') as f:
|
|
1038
|
+
writer = csv.DictWriter(f, fieldnames=['original_url', 'status_code', 'redirect_url'])
|
|
1039
|
+
writer.writeheader()
|
|
1040
|
+
writer.writerows(self.redirects)
|
|
1041
|
+
print(f"Exported: {redirects_file}")
|
|
1042
|
+
|
|
1043
|
+
# Meta issues
|
|
1044
|
+
meta_issues = []
|
|
1045
|
+
for page in self.pages:
|
|
1046
|
+
issues = []
|
|
1047
|
+
if not page.title:
|
|
1048
|
+
issues.append("Missing title")
|
|
1049
|
+
elif page.title_length > 60:
|
|
1050
|
+
issues.append("Title too long")
|
|
1051
|
+
if not page.meta_description:
|
|
1052
|
+
issues.append("Missing description")
|
|
1053
|
+
elif page.description_length > 160:
|
|
1054
|
+
issues.append("Description too long")
|
|
1055
|
+
if page.h1_count == 0:
|
|
1056
|
+
issues.append("Missing H1")
|
|
1057
|
+
elif page.h1_count > 1:
|
|
1058
|
+
issues.append("Multiple H1s")
|
|
1059
|
+
|
|
1060
|
+
if issues:
|
|
1061
|
+
meta_issues.append({
|
|
1062
|
+
'url': page.url,
|
|
1063
|
+
'title': page.title[:50],
|
|
1064
|
+
'h1': page.h1[:50],
|
|
1065
|
+
'issues': '; '.join(issues)
|
|
1066
|
+
})
|
|
1067
|
+
|
|
1068
|
+
if meta_issues:
|
|
1069
|
+
issues_file = output_dir / "meta-issues.csv"
|
|
1070
|
+
with open(issues_file, 'w', newline='', encoding='utf-8') as f:
|
|
1071
|
+
writer = csv.DictWriter(f, fieldnames=['url', 'title', 'h1', 'issues'])
|
|
1072
|
+
writer.writeheader()
|
|
1073
|
+
writer.writerows(meta_issues)
|
|
1074
|
+
print(f"Exported: {issues_file}")
|
|
1075
|
+
|
|
1076
|
+
# Summary
|
|
1077
|
+
summary = {
|
|
1078
|
+
'crawl_date': datetime.now().isoformat(),
|
|
1079
|
+
'base_url': self.base_url,
|
|
1080
|
+
'pages_crawled': len(self.pages),
|
|
1081
|
+
'broken_links': len(self.broken_links),
|
|
1082
|
+
'redirects': len(self.redirects),
|
|
1083
|
+
'meta_issues': len(meta_issues),
|
|
1084
|
+
'status_codes': {}
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
for page in self.pages:
|
|
1088
|
+
code = str(page.status_code)
|
|
1089
|
+
summary['status_codes'][code] = summary['status_codes'].get(code, 0) + 1
|
|
1090
|
+
|
|
1091
|
+
with open(output_dir / "summary.json", 'w') as f:
|
|
1092
|
+
json.dump(summary, f, indent=2)
|
|
1093
|
+
print(f"Exported: {output_dir / 'summary.json'}")
|
|
1094
|
+
|
|
1095
|
+
return summary
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
async def main():
    # CLI entry point: crawl a site and export SEO audit artifacts.
    #
    # Usage: crawler.py <url> <output_dir> <max_urls> [depth] [format]
    argv = sys.argv
    if len(argv) < 4:
        print("Usage: crawler.py <url> <output_dir> <max_urls> [depth] [format]")
        sys.exit(1)

    start_url = argv[1]
    out_dir = argv[2]
    url_limit = int(argv[3])
    max_depth = int(argv[4]) if len(argv) > 4 else 3
    export_format = argv[5] if len(argv) > 5 else "xlsx"

    site_domain = urlparse(start_url).netloc

    print(f"Starting crawl: {start_url}")
    print(f"Max URLs: {url_limit}, Max depth: {max_depth}")
    print()

    spider = SiteCrawler(start_url, max_urls=url_limit, max_depth=max_depth)
    await spider.crawl()

    stats = spider.export(Path(out_dir), site_domain, export_format)

    print()
    print("=== Crawl Summary ===")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Broken links: {stats['broken_links']}")
    print(f"Redirects: {stats['redirects']}")
    print(f"Meta issues: {stats['meta_issues']}")


if __name__ == "__main__":
    asyncio.run(main())
|
|
1130
|
+
PYTHON_CRAWLER
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1133
|
+
# Run crawl
|
|
1134
|
+
# Run a full site crawl, preferring Crawl4AI with a Python-based fallback.
#
# Arguments:
#   $1 - URL to crawl (required)
#   Remaining: --depth <n> | --max-urls <n> | --format <fmt>
#              | --output <dir> | --fallback
# Globals (read): DEFAULT_DEPTH, DEFAULT_MAX_URLS, DEFAULT_FORMAT,
#                 DEFAULT_OUTPUT_DIR, CRAWL4AI_URL, PYTHON_CMD
# Returns: 0 on success, non-zero on failure.
do_crawl() {
  local url="${1:-}"
  shift || true

  if [[ -z "$url" ]]; then
    print_error "URL is required"
    return 1
  fi

  # Parse options; unrecognized arguments are skipped with a warning.
  local depth="$DEFAULT_DEPTH"
  local max_urls="$DEFAULT_MAX_URLS"
  local format="$DEFAULT_FORMAT"
  local output_base="$DEFAULT_OUTPUT_DIR"
  local force_fallback=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --depth)
        depth="$2"
        shift 2
        ;;
      --max-urls)
        max_urls="$2"
        shift 2
        ;;
      --format)
        format="$2"
        shift 2
        ;;
      --output)
        output_base="$2"
        shift 2
        ;;
      --fallback)
        force_fallback=true
        shift
        ;;
      *)
        print_warning "Ignoring unknown option: $1"
        shift
        ;;
    esac
  done

  local domain
  domain=$(get_domain "$url")

  local output_dir
  output_dir=$(create_output_dir "$domain" "$output_base")

  print_header "Site Crawler - SEO Audit"
  print_info "URL: $url"
  print_info "Output: $output_dir"
  print_info "Depth: $depth, Max URLs: $max_urls"

  # Try Crawl4AI first (unless forced fallback). If the Crawl4AI run
  # fails, fall through to the Python crawler instead of reporting a
  # false success (the original ignored its exit status).
  if [[ "$force_fallback" != "true" ]] && check_crawl4ai; then
    print_success "Crawl4AI detected at ${CRAWL4AI_URL}"
    if crawl_with_crawl4ai "$url" "$output_dir" "$max_urls" "$depth"; then
      print_success "Crawl complete!"
      print_info "Results: $output_dir"
      print_info "Latest: ${output_base}/${domain}/_latest"
      return 0
    fi
    print_warning "Crawl4AI crawl failed; falling back to Python crawler"
  fi

  # Fallback to Python crawler
  print_info "Using lightweight Python crawler..."

  # Find or install Python with the required packages.
  if ! find_python; then
    print_warning "Installing Python dependencies..."
    if ! install_python_deps; then
      print_error "Could not find or install Python with required packages"
      print_info "Install manually: pip3 install aiohttp beautifulsoup4 openpyxl"
      return 1
    fi
  fi

  print_info "Using: $PYTHON_CMD"

  # Generate the crawler script into a temp file and run it.
  local crawler_script
  crawler_script=$(mktemp /tmp/site_crawler_XXXXXX.py) || {
    print_error "Failed to create temporary file"
    return 1
  }
  generate_fallback_crawler > "$crawler_script"

  local exit_code=0
  "$PYTHON_CMD" "$crawler_script" "$url" "$output_dir" "$max_urls" "$depth" "$format" || exit_code=$?

  rm -f "$crawler_script"

  if [[ $exit_code -eq 0 ]]; then
    print_success "Crawl complete!"
    print_info "Results: $output_dir"
    print_info "Latest: ${output_base}/${domain}/_latest"
  else
    print_error "Crawl failed with exit code $exit_code"
  fi

  return $exit_code
}
|
|
1229
|
+
|
|
1230
|
+
# Audit broken links
|
|
1231
|
+
# Audit broken links (4XX/5XX errors) via a capped crawl.
# Arguments: $1 - URL; remaining args are forwarded to do_crawl.
# Returns: do_crawl's exit status (the original's trailing `return 0`
# masked crawl failures).
audit_links() {
  local url="${1:-}"
  shift || true
  print_info "Running broken link audit..."
  do_crawl "$url" --max-urls 200 "$@"
}
|
|
1238
|
+
|
|
1239
|
+
# Audit meta data
|
|
1240
|
+
# Audit page titles and meta descriptions via a capped crawl.
# Arguments: $1 - URL; remaining args are forwarded to do_crawl.
# Returns: do_crawl's exit status (the original's trailing `return 0`
# masked crawl failures).
audit_meta() {
  local url="${1:-}"
  shift || true
  print_info "Running meta data audit..."
  do_crawl "$url" --max-urls 200 "$@"
}
|
|
1247
|
+
|
|
1248
|
+
# Audit redirects
|
|
1249
|
+
# Audit redirects and redirect chains via a capped crawl.
# Arguments: $1 - URL; remaining args are forwarded to do_crawl.
# Returns: do_crawl's exit status (the original's trailing `return 0`
# masked crawl failures).
audit_redirects() {
  local url="${1:-}"
  shift || true
  print_info "Running redirect audit..."
  do_crawl "$url" --max-urls 200 "$@"
}
|
|
1256
|
+
|
|
1257
|
+
# Generate XML sitemap
|
|
1258
|
+
# Generate an XML sitemap from the most recent crawl for a URL's domain.
#
# Arguments: $1 - URL previously crawled.
# Globals (read): DEFAULT_OUTPUT_DIR
# Outputs: writes sitemap.xml into the _latest crawl directory.
# Returns: 0 on success, 1 if no crawl data exists.
generate_sitemap() {
  local url="${1:-}"
  local domain
  domain=$(get_domain "$url")
  local output_dir="${DEFAULT_OUTPUT_DIR}/${domain}/_latest"

  if [[ ! -d "$output_dir" ]]; then
    print_error "No crawl data found. Run 'crawl' first."
    return 1
  fi

  local crawl_data="${output_dir}/crawl-data.csv"
  if [[ ! -f "$crawl_data" ]]; then
    print_error "Crawl data not found: $crawl_data"
    return 1
  fi

  print_header "Generating XML Sitemap"

  local sitemap="${output_dir}/sitemap.xml"

  {
    echo '<?xml version="1.0" encoding="UTF-8"?>'
    echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'

    # NOTE(review): naive comma split — assumes the URL field itself
    # contains no commas; verify against the CSV writer's quoting.
    tail -n +2 "$crawl_data" | while IFS=, read -r page_url status_code _rest; do
      # Strip CSV quoting so quoted codes like "200" still match.
      status_code="${status_code//\"/}"
      if [[ "$status_code" == "200" ]]; then
        page_url="${page_url//\"/}"
        # Escape XML special characters ('&' must be replaced first,
        # or it would re-escape the entities below).
        page_url="${page_url//&/&amp;}"
        page_url="${page_url//</&lt;}"
        page_url="${page_url//>/&gt;}"
        echo "  <url>"
        echo "    <loc>$page_url</loc>"
        echo "    <changefreq>weekly</changefreq>"
        echo "    <priority>0.5</priority>"
        echo "  </url>"
      fi
    done

    echo '</urlset>'
  } > "$sitemap"

  print_success "Sitemap generated: $sitemap"
  return 0
}
|
|
1300
|
+
|
|
1301
|
+
# Compare crawls
|
|
1302
|
+
# Compare two crawls of the same domain.
#
# Arguments:
#   $1 - URL (auto-selects the two most recent crawls) or crawl dir 1
#   $2 - crawl dir 2 (optional when $1 is a URL)
# Globals (read): DEFAULT_OUTPUT_DIR
# Returns: 0 on success, 1 when arguments or crawl data are missing.
compare_crawls() {
  local arg1="${1:-}"
  local arg2="${2:-}"

  print_header "Comparing Crawls"

  # The original silently "compared" empty paths and returned 0 when
  # called with no arguments; require at least one.
  if [[ -z "$arg1" ]]; then
    print_error "Usage: compare <url|crawl-dir> [crawl-dir]"
    return 1
  fi

  if [[ -z "$arg2" ]]; then
    # Single argument: treat it as a URL and auto-select the two most
    # recent timestamped crawl directories for that domain.
    local domain
    domain=$(get_domain "$arg1")
    local domain_dir="${DEFAULT_OUTPUT_DIR}/${domain}"

    if [[ ! -d "$domain_dir" ]]; then
      print_error "No crawl data found for domain"
      return 1
    fi

    local crawls
    crawls=$(find "$domain_dir" -maxdepth 1 -type d -name "20*" | sort -r | head -2)
    local count
    count=$(echo "$crawls" | wc -l | tr -d ' ')

    if [[ -z "$crawls" ]] || [[ $count -lt 2 ]]; then
      print_error "Need at least 2 crawls to compare"
      return 1
    fi

    arg1=$(echo "$crawls" | head -1)
    arg2=$(echo "$crawls" | tail -1)
  fi

  print_info "Crawl 1: $arg1"
  print_info "Crawl 2: $arg2"

  if [[ -f "${arg1}/crawl-data.csv" ]] && [[ -f "${arg2}/crawl-data.csv" ]]; then
    local urls1 urls2
    urls1=$(cut -d, -f1 "${arg1}/crawl-data.csv" | tail -n +2 | sort -u | wc -l | tr -d ' ')
    urls2=$(cut -d, -f1 "${arg2}/crawl-data.csv" | tail -n +2 | sort -u | wc -l | tr -d ' ')

    print_info "Crawl 1 URLs: $urls1"
    print_info "Crawl 2 URLs: $urls2"
  fi

  return 0
}
|
|
1346
|
+
|
|
1347
|
+
# Check status
|
|
1348
|
+
# Report the availability of each crawler backend and its dependencies.
# Outputs: status lines to stdout. Returns: always 0.
check_status() {
  print_header "Site Crawler Status"

  # Preferred backend: Crawl4AI service.
  print_info "Checking Crawl4AI..."
  if check_crawl4ai; then
    print_success "Crawl4AI: Running at ${CRAWL4AI_URL}"
  else
    print_warning "Crawl4AI: Not running (will use fallback crawler)"
  fi

  # Fallback backend: local Python with crawler packages.
  print_info "Checking Python..."
  if find_python; then
    print_success "Python: $PYTHON_CMD with required packages"
  else
    print_warning "Python: Dependencies not installed"
    print_info "  Install with: pip3 install aiohttp beautifulsoup4 openpyxl"
  fi

  # Supporting command-line tools.
  if command -v jq >/dev/null 2>&1; then
    print_success "jq: installed"
  else
    print_warning "jq: not installed (optional, for JSON processing)"
  fi

  if command -v curl >/dev/null 2>&1; then
    print_success "curl: installed"
  else
    print_error "curl: not installed (required)"
  fi

  return 0
}
|
|
1383
|
+
|
|
1384
|
+
# Show help
|
|
1385
|
+
# Print the full usage/help text for every subcommand to stdout.
# The heredoc delimiter is quoted so nothing inside is expanded.
# Returns: always 0.
show_help() {
    cat << 'EOF'
Site Crawler Helper - SEO Spider Tool

Usage: site-crawler-helper.sh [command] [url] [options]

Commands:
  crawl <url>                Full site crawl with SEO data extraction
  audit-links <url>          Check for broken links (4XX/5XX errors)
  audit-meta <url>           Audit page titles and meta descriptions
  audit-redirects <url>      Analyze redirects and chains
  generate-sitemap <url>     Generate XML sitemap from crawl
  compare [url|dir1] [dir2]  Compare two crawls
  status                     Check crawler dependencies
  help                       Show this help message

Options:
  --depth <n>                Max crawl depth (default: 3)
  --max-urls <n>             Max URLs to crawl (default: 100)
  --format <fmt>             Output format: csv, xlsx, all (default: xlsx)
  --output <dir>             Output directory (default: ~/Downloads)
  --fallback                 Force use of fallback crawler (skip Crawl4AI)

Examples:
  # Full site crawl
  site-crawler-helper.sh crawl https://example.com

  # Limited crawl
  site-crawler-helper.sh crawl https://example.com --depth 2 --max-urls 50

  # Quick broken link check
  site-crawler-helper.sh audit-links https://example.com

  # Generate sitemap from existing crawl
  site-crawler-helper.sh generate-sitemap https://example.com

  # Check status
  site-crawler-helper.sh status

Output Structure:
  ~/Downloads/{domain}/{timestamp}/
    - crawl-data.xlsx      Full crawl data
    - crawl-data.csv       Full crawl data (CSV)
    - broken-links.csv     4XX/5XX errors
    - redirects.csv        Redirect chains
    - meta-issues.csv      Title/description issues
    - summary.json         Crawl statistics

  ~/Downloads/{domain}/_latest -> symlink to latest crawl

Backends:
  - Crawl4AI (preferred): Uses Docker-based Crawl4AI when available
  - Fallback: Lightweight async Python crawler

Related:
  - E-E-A-T scoring: eeat-score-helper.sh
  - Crawl4AI setup: crawl4ai-helper.sh
  - PageSpeed: pagespeed-helper.sh
EOF
    return 0
}
|
|
1446
|
+
|
|
1447
|
+
# Main function
|
|
1448
|
+
# Dispatch the requested subcommand.
#
# Arguments: $1 - command name (defaults to "help"); the rest are
#            forwarded to the subcommand handler.
# Returns: the subcommand's exit status; 1 for unknown commands.
#          (The original's trailing `return 0` made the script exit 0
#          even when a crawl or audit failed.)
main() {
  local command="${1:-help}"
  shift || true

  case "$command" in
    crawl)
      do_crawl "$@"
      ;;
    audit-links)
      audit_links "$@"
      ;;
    audit-meta)
      audit_meta "$@"
      ;;
    audit-redirects)
      audit_redirects "$@"
      ;;
    generate-sitemap)
      generate_sitemap "$@"
      ;;
    compare)
      compare_crawls "$@"
      ;;
    status)
      check_status
      ;;
    help|-h|--help)
      # The empty-string pattern in the original was unreachable:
      # $command defaults to "help" when no argument is given.
      show_help
      ;;
    *)
      print_error "Unknown command: $command"
      show_help
      return 1
      ;;
  esac
}
|
|
1486
|
+
|
|
1487
|
+
main "$@"
|