agentcrawl-ai 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. agentcrawl_ai-0.1.0/.dockerignore +15 -0
  2. agentcrawl_ai-0.1.0/.env.example +28 -0
  3. agentcrawl_ai-0.1.0/.github/workflows/ci.yml +42 -0
  4. agentcrawl_ai-0.1.0/.github/workflows/docker.yml +68 -0
  5. agentcrawl_ai-0.1.0/.gitignore +13 -0
  6. agentcrawl_ai-0.1.0/.hermes/plans/2026-06-11_000405-agentcrawl-docs-cleanup.md +239 -0
  7. agentcrawl_ai-0.1.0/AGENTS.md +3 -0
  8. agentcrawl_ai-0.1.0/CHANGELOG.md +19 -0
  9. agentcrawl_ai-0.1.0/CLAUDE.md +3 -0
  10. agentcrawl_ai-0.1.0/CONTRIBUTING.md +10 -0
  11. agentcrawl_ai-0.1.0/Dockerfile +35 -0
  12. agentcrawl_ai-0.1.0/INSTALL_FOR_AGENTS.md +170 -0
  13. agentcrawl_ai-0.1.0/LICENSE +202 -0
  14. agentcrawl_ai-0.1.0/PKG-INFO +356 -0
  15. agentcrawl_ai-0.1.0/README.md +299 -0
  16. agentcrawl_ai-0.1.0/ROADMAP.md +52 -0
  17. agentcrawl_ai-0.1.0/SECURITY.md +20 -0
  18. agentcrawl_ai-0.1.0/agentcrawl/__init__.py +30 -0
  19. agentcrawl_ai-0.1.0/agentcrawl/__main__.py +3 -0
  20. agentcrawl_ai-0.1.0/agentcrawl/cli.py +401 -0
  21. agentcrawl_ai-0.1.0/agentcrawl/client.py +77 -0
  22. agentcrawl_ai-0.1.0/agentcrawl/config.py +86 -0
  23. agentcrawl_ai-0.1.0/agentcrawl/config_adapter.py +52 -0
  24. agentcrawl_ai-0.1.0/agentcrawl/crawler.py +538 -0
  25. agentcrawl_ai-0.1.0/agentcrawl/documents.py +113 -0
  26. agentcrawl_ai-0.1.0/agentcrawl/errors.py +22 -0
  27. agentcrawl_ai-0.1.0/agentcrawl/exceptions.py +10 -0
  28. agentcrawl_ai-0.1.0/agentcrawl/extraction.py +87 -0
  29. agentcrawl_ai-0.1.0/agentcrawl/fetchers.py +262 -0
  30. agentcrawl_ai-0.1.0/agentcrawl/graph.py +110 -0
  31. agentcrawl_ai-0.1.0/agentcrawl/graphs/__init__.py +5 -0
  32. agentcrawl_ai-0.1.0/agentcrawl/graphs/extraction.py +28 -0
  33. agentcrawl_ai-0.1.0/agentcrawl/graphs/multi.py +29 -0
  34. agentcrawl_ai-0.1.0/agentcrawl/graphs/search_graph.py +25 -0
  35. agentcrawl_ai-0.1.0/agentcrawl/html_tools.py +236 -0
  36. agentcrawl_ai-0.1.0/agentcrawl/llm.py +44 -0
  37. agentcrawl_ai-0.1.0/agentcrawl/mcp_server.py +299 -0
  38. agentcrawl_ai-0.1.0/agentcrawl/models.py +68 -0
  39. agentcrawl_ai-0.1.0/agentcrawl/parsing.py +407 -0
  40. agentcrawl_ai-0.1.0/agentcrawl/remote_client.py +222 -0
  41. agentcrawl_ai-0.1.0/agentcrawl/search.py +73 -0
  42. agentcrawl_ai-0.1.0/agentcrawl/security.py +39 -0
  43. agentcrawl_ai-0.1.0/agentcrawl/serializers.py +16 -0
  44. agentcrawl_ai-0.1.0/agentcrawl/server.py +746 -0
  45. agentcrawl_ai-0.1.0/agentcrawl/state.py +20 -0
  46. agentcrawl_ai-0.1.0/agentcrawl/storage.py +991 -0
  47. agentcrawl_ai-0.1.0/agentcrawl/utils.py +98 -0
  48. agentcrawl_ai-0.1.0/assets/favicon-512.png +0 -0
  49. agentcrawl_ai-0.1.0/assets/github-social-preview.png +0 -0
  50. agentcrawl_ai-0.1.0/assets/logo-mark.png +0 -0
  51. agentcrawl_ai-0.1.0/assets/readme-hero.png +0 -0
  52. agentcrawl_ai-0.1.0/benchmarks/quality_report.py +211 -0
  53. agentcrawl_ai-0.1.0/docker-compose.yml +19 -0
  54. agentcrawl_ai-0.1.0/docs/COMPARISON.md +62 -0
  55. agentcrawl_ai-0.1.0/docs/EXAMPLES.md +131 -0
  56. agentcrawl_ai-0.1.0/docs/OPERATIONS.md +172 -0
  57. agentcrawl_ai-0.1.0/docs/QUALITY_BENCHMARKS.md +104 -0
  58. agentcrawl_ai-0.1.0/docs/RELEASE.md +111 -0
  59. agentcrawl_ai-0.1.0/examples/api_scrape.md +30 -0
  60. agentcrawl_ai-0.1.0/examples/basic.py +9 -0
  61. agentcrawl_ai-0.1.0/examples/browser_rendered.md +29 -0
  62. agentcrawl_ai-0.1.0/examples/cli_scrape.md +38 -0
  63. agentcrawl_ai-0.1.0/examples/docker.md +34 -0
  64. agentcrawl_ai-0.1.0/examples/graph_extraction.py +18 -0
  65. agentcrawl_ai-0.1.0/examples/mcp.md +47 -0
  66. agentcrawl_ai-0.1.0/examples/python_scrape.py +9 -0
  67. agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/README.md +29 -0
  68. agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/__init__.py +10 -0
  69. agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/plugin.yaml +7 -0
  70. agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/provider.py +116 -0
  71. agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/tests/test_provider.py +45 -0
  72. agentcrawl_ai-0.1.0/pyproject.toml +89 -0
  73. agentcrawl_ai-0.1.0/tests/conftest.py +33 -0
  74. agentcrawl_ai-0.1.0/tests/fixtures/quality/api_reference.html +24 -0
  75. agentcrawl_ai-0.1.0/tests/fixtures/quality/article.html +23 -0
  76. agentcrawl_ai-0.1.0/tests/fixtures/quality/blog.html +21 -0
  77. agentcrawl_ai-0.1.0/tests/fixtures/quality/canonical.html +24 -0
  78. agentcrawl_ai-0.1.0/tests/fixtures/quality/complex_product.html +40 -0
  79. agentcrawl_ai-0.1.0/tests/fixtures/quality/documentation.html +24 -0
  80. agentcrawl_ai-0.1.0/tests/fixtures/quality/ecommerce.html +25 -0
  81. agentcrawl_ai-0.1.0/tests/fixtures/quality/forum.html +21 -0
  82. agentcrawl_ai-0.1.0/tests/fixtures/quality/media_article.html +18 -0
  83. agentcrawl_ai-0.1.0/tests/fixtures/quality/messy_docs.html +39 -0
  84. agentcrawl_ai-0.1.0/tests/fixtures/quality/nested_sidebar_docs.html +40 -0
  85. agentcrawl_ai-0.1.0/tests/fixtures/quality/noisy_article.html +26 -0
  86. agentcrawl_ai-0.1.0/tests/fixtures/quality/spa_rendered.html +19 -0
  87. agentcrawl_ai-0.1.0/tests/fixtures/quality/spa_shell.html +9 -0
  88. agentcrawl_ai-0.1.0/tests/fixtures/quality/table.html +24 -0
  89. agentcrawl_ai-0.1.0/tests/test_agentcrawl.py +569 -0
  90. agentcrawl_ai-0.1.0/tests/test_cli.py +153 -0
  91. agentcrawl_ai-0.1.0/tests/test_core.py +226 -0
  92. agentcrawl_ai-0.1.0/tests/test_deployment.py +61 -0
  93. agentcrawl_ai-0.1.0/tests/test_documents.py +81 -0
  94. agentcrawl_ai-0.1.0/tests/test_fetchers.py +228 -0
  95. agentcrawl_ai-0.1.0/tests/test_mcp_server.py +94 -0
  96. agentcrawl_ai-0.1.0/tests/test_quality_fixtures.py +296 -0
  97. agentcrawl_ai-0.1.0/tests/test_quality_report.py +44 -0
  98. agentcrawl_ai-0.1.0/tests/test_redirect_security.py +60 -0
  99. agentcrawl_ai-0.1.0/tests/test_remote_client.py +47 -0
  100. agentcrawl_ai-0.1.0/tests/test_security.py +46 -0
  101. agentcrawl_ai-0.1.0/tests/test_server.py +624 -0
  102. agentcrawl_ai-0.1.0/tests/test_storage.py +85 -0
@@ -0,0 +1,15 @@
1
+ .venv
2
+ __pycache__
3
+ *.pyc
4
+ .pytest_cache
5
+ .ruff_cache
6
+ agentcrawl.db
7
+ .git
8
+ .env
9
+ tests
10
+ examples
11
+ .github
12
+ PROJECT_STATE.md
13
+ PROGRESS_LOG.md
14
+ VPS_DEPLOYMENT.md
15
+ README_AGENT_CONNECTION.md
@@ -0,0 +1,28 @@
1
+ # AgentCrawl server
2
+ AGENTCRAWL_AUTH_ENABLED=true
3
+ AGENTCRAWL_ALLOW_LOCAL_FILES=false
4
+ AGENTCRAWL_ALLOW_PRIVATE_NETWORK=false
5
+ AGENTCRAWL_API_KEYS=change-me-dev-key
6
+ AGENTCRAWL_OWNER_API_KEYS=change-me-dev-key
7
+ AGENTCRAWL_DB=/data/agentcrawl.db
8
+ AGENTCRAWL_FETCHER=http
9
+ AGENTCRAWL_HEADLESS=true
10
+ AGENTCRAWL_TIMEOUT_MS=30000
11
+ AGENTCRAWL_HTTP_RETRIES=2
12
+ AGENTCRAWL_HTTP_RETRY_DELAY=1.0
13
+ AGENTCRAWL_BROWSER_FALLBACK=false
14
+ AGENTCRAWL_DOMAIN_MIN_DELAY=0.35
15
+ AGENTCRAWL_DOMAIN_MAX_CONCURRENCY=2
16
+ AGENTCRAWL_CACHE_ENABLED=true
17
+ AGENTCRAWL_CACHE_TTL_SECONDS=86400
18
+ AGENTCRAWL_CRAWL_DEPTH=1
19
+ AGENTCRAWL_CRAWL_MAX_PAGES=25
20
+ AGENTCRAWL_RESPECT_ROBOTS_TXT=true
21
+ AGENTCRAWL_WORKERS=4
22
+ AGENTCRAWL_CRAWL_JOB_PAGE_QUANTUM=5
23
+ AGENTCRAWL_RATE_LIMIT_PER_MINUTE=60
24
+ AGENTCRAWL_USER_AGENT=Mozilla/5.0 (compatible; AgentCrawl/0.1; +https://agentcrawl.local)
25
+
26
+ # Remote client / MCP server
27
+ AGENTCRAWL_BASE_URL=http://127.0.0.1:8000
28
+ AGENTCRAWL_API_KEY=change-me-dev-key
@@ -0,0 +1,42 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ jobs:
12
+ test:
13
+ runs-on: ubuntu-latest
14
+ strategy:
15
+ fail-fast: false
16
+ matrix:
17
+ python-version: ["3.10", "3.12"]
18
+
19
+ steps:
20
+ - uses: actions/checkout@v6
21
+ - uses: actions/setup-python@v6
22
+ with:
23
+ python-version: ${{ matrix.python-version }}
24
+ cache: pip
25
+ - name: Install
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ python -m pip install -e ".[server,mcp,dev]"
29
+ - name: Ruff
30
+ run: python -m ruff check agentcrawl tests examples benchmarks
31
+ - name: Format
32
+ run: python -m ruff format --check agentcrawl tests examples benchmarks
33
+ - name: Tests
34
+ run: python -m pytest -q
35
+ - name: Compile
36
+ run: python -m compileall -q agentcrawl benchmarks
37
+
38
+ docker:
39
+ runs-on: ubuntu-latest
40
+ steps:
41
+ - uses: actions/checkout@v6
42
+ - run: docker build -t agentcrawl-community:ci .
@@ -0,0 +1,68 @@
1
+ name: Build and Push Docker Image
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ tags: ['v*']
7
+ pull_request:
8
+ branches: [main]
9
+ workflow_dispatch:
10
+
11
+ env:
12
+ REGISTRY: ghcr.io
13
+ IMAGE_NAME: ${{ github.repository }}
14
+
15
+ jobs:
16
+ build-and-push:
17
+ runs-on: ubuntu-latest
18
+ permissions:
19
+ contents: read
20
+ packages: write
21
+
22
+ steps:
23
+ - name: Checkout repository
24
+ uses: actions/checkout@v6
25
+
26
+ - name: Set up Docker Buildx
27
+ uses: docker/setup-buildx-action@v3
28
+
29
+ - name: Log in to Container Registry
30
+ if: github.event_name != 'pull_request'
31
+ uses: docker/login-action@v3
32
+ with:
33
+ registry: ${{ env.REGISTRY }}
34
+ username: ${{ github.actor }}
35
+ password: ${{ secrets.GITHUB_TOKEN }}
36
+
37
+ - name: Extract metadata
38
+ id: meta
39
+ uses: docker/metadata-action@v5
40
+ with:
41
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
42
+ tags: |
43
+ type=ref,event=branch
44
+ type=ref,event=pr
45
+ type=semver,pattern={{version}}
46
+ type=semver,pattern={{major}}.{{minor}}
47
+ type=sha,prefix=,suffix=,format=short
48
+ type=raw,value=latest,enable={{is_default_branch}}
49
+
50
+ - name: Build local image for smoke test
51
+ uses: docker/build-push-action@v6
52
+ with:
53
+ context: .
54
+ load: true
55
+ tags: agentcrawl:smoke
56
+
57
+ - name: Smoke test Docker image
58
+ run: |
59
+ docker run --rm agentcrawl:smoke agentcrawl --version
60
+ docker run --rm agentcrawl:smoke agentcrawl doctor
61
+
62
+ - name: Build and push Docker image
63
+ uses: docker/build-push-action@v6
64
+ with:
65
+ context: .
66
+ push: ${{ github.event_name != 'pull_request' }}
67
+ tags: ${{ steps.meta.outputs.tags }}
68
+ labels: ${{ steps.meta.outputs.labels }}
@@ -0,0 +1,13 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.py[cod]
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ *.db
7
+ .env
8
+ .DS_Store
9
+
10
+ # Build artifacts
11
+ build/
12
+ dist/
13
+ *.egg-info/
@@ -0,0 +1,239 @@
1
+ # Plan — limpieza y refinado de documentación AgentCrawl Community
2
+
3
+ Fecha: 2026-06-11
4
+ Repo público: `/home/orgej/Proyectos/agentcrawl`
5
+ Docs privados: `/home/orgej/Proyectos/agentcrawl-private-docs`
6
+ Estado base verificado:
7
+
8
+ ```text
9
+ Branch: main
10
+ Estado: limpio y sincronizado con origin/main
11
+ HEAD: ea897d3 Improve README conversion quickstart
12
+ CI: success
13
+ Docker/GHCR workflow: success
14
+ GHCR latest/main: ea897d3
15
+ VPS: no tocar; despliegue sigue aplazado
16
+ ```
17
+
18
+ ## Objetivo
19
+
20
+ Antes del Bloque 6 de ejemplos/cookbook, limpiar y alinear todos los Markdown para que el repo público parezca un producto Community serio y no un acumulado de fases internas.
21
+
22
+ Resultado buscado:
23
+
24
+ - Public docs coherentes entre sí.
25
+ - Nada privado/comercial sensible dentro del repo público.
26
+ - README fuerte pero no demasiado largo ni contradictorio.
27
+ - Release/roadmap/examples actualizados al estado real: PyPI preparado, GHCR ya publicado, Docker ligero, Phase 2 quality verde.
28
+ - Private handoff actualizado con el estado real posterior a los commits `9060705` y `ea897d3`.
29
+
30
+ ## Hallazgos iniciales
31
+
32
+ ### 1. Repo/GHCR están actualizados
33
+
34
+ - `origin/main` está en `ea897d3`.
35
+ - GitHub CI y Docker workflow verdes.
36
+ - GHCR contiene tags `latest`, `main`, `ea897d3`.
37
+
38
+ ### 2. Drift privado importante
39
+
40
+ Los docs privados siguen diciendo que el estado canónico era:
41
+
42
+ ```text
43
+ 1b0d460 Add browser rendered SPA quality coverage
44
+ ```
45
+
46
+ Pero el estado real ahora es:
47
+
48
+ ```text
49
+ ea897d3 Improve README conversion quickstart
50
+ 9060705 Prepare lightweight GHCR Docker image
51
+ ```
52
+
53
+ Archivos privados afectados:
54
+
55
+ - `/home/orgej/Proyectos/agentcrawl-private-docs/INDEX.md`
56
+ - `/home/orgej/Proyectos/agentcrawl-private-docs/HANDOFF.md`
57
+ - `/home/orgej/Proyectos/agentcrawl-private-docs/SESSION_CONTINUITY_2026-06-10.md`
58
+
59
+ ### 3. Public docs tienen wording viejo o inconsistente
60
+
61
+ Puntos detectados:
62
+
63
+ - `ROADMAP.md` aún lista “public GHCR image” como prioridad pendiente, aunque ya está publicado.
64
+ - `CHANGELOG.md` no menciona claramente Docker ligero, GHCR publicado, README conversion quickstart ni fixture count actual.
65
+ - `docs/RELEASE.md` describe GHCR como “expected public image path after publication”; ahora ya está publicado.
66
+ - `docs/EXAMPLES.md` mezcla ejemplos actuales con una sección “Additional Examples On The Roadmap” que justo coincide con Bloque 6; hay que convertir eso en archivos reales o lista limpia de pendientes.
67
+ - `docs/OPERATIONS.md` usa `docker compose up --build -d`; para usuarios públicos debería preferir la imagen GHCR y dejar build local como opción de desarrollo.
68
+ - `docs/EXAMPLES.md` tiene placeholders tipo `$AGENT...KEY` / `Bearer` que conviene normalizar para evitar filtrados raros y confusión.
69
+ - `COMMUNITY_VS_ENHANCED.md` en el repo público aparece vacío. Hay que decidir: eliminarlo si no está trackeado/relevante, o llenarlo con una versión pública mínima sin estrategia privada.
70
+
71
+ ### 4. README ya mejoró, pero necesita un pase de consistencia
72
+
73
+ El README tiene buena conversión inicial, pero antes de seguir hay que revisar:
74
+
75
+ - longitud y orden;
76
+ - placeholders de API key;
77
+ - que Docker use GHCR como camino principal;
78
+ - que no prometa browser fallback dentro de la imagen default;
79
+ - que “Community includes” no regale Enhanced ni prometa cloud.
80
+
81
+ ## Plan por bloques
82
+
83
+ ### Bloque A — Normalización pública mínima
84
+
85
+ Archivos:
86
+
87
+ - `README.md`
88
+ - `ROADMAP.md`
89
+ - `CHANGELOG.md`
90
+ - `docs/RELEASE.md`
91
+ - `docs/OPERATIONS.md`
92
+ - `docs/QUALITY_BENCHMARKS.md`
93
+ - `docs/COMPARISON.md`
94
+
95
+ Acciones:
96
+
97
+ 1. Actualizar ROADMAP:
98
+ - Mover GHCR publicado a Completed.
99
+ - Dejar pendiente solo PyPI publication, examples/cookbook, live smoke targets y docs finales.
100
+ - No hablar de “Phase 2” como si fuera interno; usar “Extraction quality hardening”.
101
+
102
+ 2. Actualizar CHANGELOG:
103
+ - Añadir GHCR lightweight Docker image.
104
+ - Añadir Docker workflow smoke tests.
105
+ - Añadir README conversion quickstart.
106
+ - Añadir quality report 14 fixtures / threshold 85 / avg green sin claim competitivo.
107
+
108
+ 3. Actualizar RELEASE:
109
+ - Cambiar “expected image path” por “published image path”.
110
+ - Incluir verificación GHCR:
111
+ ```bash
112
+ docker pull ghcr.io/jorg18/agentcrawl:latest
113
+ docker run --rm ghcr.io/jorg18/agentcrawl:latest agentcrawl --version
114
+ docker run --rm ghcr.io/jorg18/agentcrawl:latest agentcrawl doctor
115
+ ```
116
+ - Mantener PyPI como pendiente si no hay token/publicación final.
117
+
118
+ 4. Actualizar OPERATIONS:
119
+ - Primero GHCR image.
120
+ - Luego Compose local/dev.
121
+ - Dejar `docker compose up --build` como desarrollo, no como instalación principal.
122
+
123
+ 5. Revisar COMPARISON:
124
+ - Asegurar que no parece que ya tenemos todo lo de competidores.
125
+ - Mantener tono: “fits when…” sin claims no probados.
126
+
127
+ 6. Revisar QUALITY_BENCHMARKS:
128
+ - Confirmar que refleja 14 fixtures.
129
+ - Explicar que benchmarks competitivos quedan diferidos.
130
+
131
+ ### Bloque B — Examples/cookbook preparación antes de crear ejemplos reales
132
+
133
+ Archivos actuales/potenciales:
134
+
135
+ - `docs/EXAMPLES.md`
136
+ - `examples/basic.py`
137
+ - `examples/graph_extraction.py`
138
+ - nuevos propuestos:
139
+ - `examples/cli_scrape.md`
140
+ - `examples/python_scrape.py`
141
+ - `examples/api_scrape.md`
142
+ - `examples/docker.md`
143
+ - `examples/mcp.md`
144
+ - `examples/browser_rendered.md`
145
+
146
+ Acciones:
147
+
148
+ 1. Convertir `docs/EXAMPLES.md` en índice/cookbook principal.
149
+ 2. Mover los ejemplos largos a archivos concretos.
150
+ 3. Mantener 5-6 ejemplos buenos, no 20.
151
+ 4. Cada ejemplo debe ser:
152
+ - copy-pasteable;
153
+ - probado si es ejecutable;
154
+ - claramente documental si requiere API/MCP externa.
155
+ 5. Evitar `python -m pip install -e` en docs para usuarios finales salvo sección “from repo checkout”.
156
+
157
+ Validación:
158
+
159
+ - Ejecutar `python examples/python_scrape.py` si usa una fixture/local file o `https://example.com`.
160
+ - Smoke CLI contra `https://example.com`.
161
+ - Si API example requiere server, arrancar servidor temporal o documentarlo claramente.
162
+
163
+ ### Bloque C — Private handoff synchronization
164
+
165
+ Archivos privados:
166
+
167
+ - `/home/orgej/Proyectos/agentcrawl-private-docs/INDEX.md`
168
+ - `/home/orgej/Proyectos/agentcrawl-private-docs/HANDOFF.md`
169
+ - `/home/orgej/Proyectos/agentcrawl-private-docs/SESSION_CONTINUITY_2026-06-10.md`
170
+
171
+ Acciones:
172
+
173
+ 1. Actualizar estado canónico:
174
+ - HEAD `ea897d3`.
175
+ - GHCR publicado.
176
+ - README conversion quickstart hecho.
177
+ - Docker local ya no es requisito; GitHub Actions construye y publica.
178
+ 2. Mantener política VPS:
179
+ - No tocar VPS aún.
180
+ - Despliegue dedicado con backup/smoke test más adelante.
181
+ 3. Actualizar “next recommended work”:
182
+ - limpieza docs/refino;
183
+ - Bloque 6 examples/cookbook;
184
+ - docs finales;
185
+ - release tag/PyPI si hay token.
186
+
187
+ ### Bloque D — Verificación final antes de commit
188
+
189
+ Ejecutar:
190
+
191
+ ```bash
192
+ .venv/bin/python -m compileall -q agentcrawl benchmarks
193
+ .venv/bin/python -m pytest -q
194
+ .venv/bin/ruff check agentcrawl tests benchmarks examples
195
+ .venv/bin/ruff format --check agentcrawl tests benchmarks examples
196
+ .venv/bin/python benchmarks/quality_report.py
197
+ ```
198
+
199
+ Además:
200
+
201
+ ```bash
202
+ git diff --stat
203
+ git diff --check
204
+ gh run list --repo JorG18/agentcrawl --limit 5
205
+ ```
206
+
207
+ Si se toca Docker/GHCR docs solamente, no hace falta reconstruir imagen salvo que cambie Dockerfile/workflow.
208
+
209
+ ### Bloque E — Commit/push limpio
210
+
211
+ Commit sugerido tras Bloque A/C:
212
+
213
+ ```bash
214
+ git add README.md ROADMAP.md CHANGELOG.md docs/*.md
215
+ git commit -m "Refine AgentCrawl Community release docs"
216
+ git push origin main
217
+ ```
218
+
219
+ Nota: los docs privados no están en el repo público, así que probablemente se actualizan sin commit público. Reportar claramente:
220
+
221
+ - “Repo público commit X”.
222
+ - “Docs privados actualizados localmente”.
223
+
224
+ ## Riesgos / cuidado
225
+
226
+ - No copiar estrategia privada de monetización al repo público.
227
+ - No prometer hosted/Enhanced como ya disponible.
228
+ - No publicar benchmarks competitivos aún.
229
+ - No tocar VPS.
230
+ - No reinstalar herramientas ni modificar Hermes.
231
+ - No sobreescribir MDs enteros sin re-leerlos completos, salvo README/plan donde se controle el diff.
232
+
233
+ ## Orden recomendado de ejecución
234
+
235
+ 1. Bloque A: limpiar docs públicos de estado/distribución.
236
+ 2. Bloque C: sincronizar docs privados con estado real.
237
+ 3. Validación rápida.
238
+ 4. Commit/push si todo está limpio.
239
+ 5. Después sí: Bloque 6 examples/cookbook.
@@ -0,0 +1,3 @@
1
+ # Agent Instructions
2
+
3
+ When asked to install, configure, connect, test, or use AgentCrawl, read and execute [INSTALL_FOR_AGENTS.md](INSTALL_FOR_AGENTS.md). That file is the canonical procedure. Do not merely describe the commands when the user requested installation.
@@ -0,0 +1,19 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 - Unreleased
4
+
5
+ - Initial AgentCrawl Community release candidate.
6
+ - CLI, Python library, HTTP API, Docker/GHCR image, and MCP integration.
7
+ - Local and HTTP scraping with optional browser/Camofox fallback.
8
+ - Main-content Markdown extraction with semantic container selection, text-rich fallback blocks, and boilerplate reduction.
9
+ - Markdown table preservation and fenced code blocks with language tags from common HTML classes.
10
+ - Local document ingestion for Markdown, text, JSON, XML/RSS/Atom, and PDF-to-Markdown through the optional `docs` extra.
11
+ - Mapping, crawling, persistent jobs, progress, cancellation, event history, crawl failures, and selective failure retries.
12
+ - Cache controls, usage reporting, operational stats, backup, and restore.
13
+ - Authentication, SSRF protections, unsafe redirect blocking, private-network controls, and safe server defaults.
14
+ - Safety baseline fixes for text normalization, sitemap discovery, PDF limits, scrape error behavior, URL validation, and crawl failure filtering.
15
+ - Package version export, wheel/sdist build verification, `twine check`, and clean install smoke tests for base/server/MCP/docs extras.
16
+ - Lightweight default Docker image based on `python:3.12-slim`, published through GHCR as `ghcr.io/jorg18/agentcrawl:latest`.
17
+ - GitHub Actions Docker workflow builds, smoke-tests, and publishes GHCR images for `main`, tags, and commit SHAs.
18
+ - README quickstart refreshed around CLI, Python, MCP, Docker/API, and Community scope.
19
+ - Quality report baseline: 14 checked-in fixtures, minimum score threshold 85, current local average 100.0, richer provenance metadata, JSON-LD/Product schema extraction, Markdown structure metrics, and noisy-layout coverage.
@@ -0,0 +1,3 @@
1
+ # Claude Code Instructions
2
+
3
+ For AgentCrawl installation, MCP registration, verification, and normal tool-selection rules, read and execute [INSTALL_FOR_AGENTS.md](INSTALL_FOR_AGENTS.md). Preserve unrelated client configuration and never expose credentials.
@@ -0,0 +1,10 @@
1
+ # Contributing
2
+
3
+ 1. Create a focused branch.
4
+ 2. Keep changes scoped and avoid unrelated refactors.
5
+ 3. Add tests for behavior changes.
6
+ 4. Run `pytest -q`, `ruff check agentcrawl tests examples benchmarks`, and `ruff format --check agentcrawl tests examples benchmarks`.
7
+ 5. Do not commit credentials, deployment addresses, databases, scraped content, or private commercial modules.
8
+ 6. Explain behavior, security impact, and verification in the pull request.
9
+
10
+ By contributing, you agree that your contribution is licensed under Apache License 2.0.
@@ -0,0 +1,35 @@
1
+ FROM python:3.12-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ AGENTCRAWL_DB=/data/agentcrawl.db \
6
+ AGENTCRAWL_FETCHER=http \
7
+ AGENTCRAWL_AUTH_ENABLED=true \
8
+ AGENTCRAWL_ALLOW_LOCAL_FILES=false \
9
+ AGENTCRAWL_ALLOW_PRIVATE_NETWORK=false
10
+
11
+ WORKDIR /app
12
+
13
+ LABEL org.opencontainers.image.title="AgentCrawl" \
14
+ org.opencontainers.image.description="Self-hosted web extraction and Markdown crawling for AI agents." \
15
+ org.opencontainers.image.source="https://github.com/JorG18/agentcrawl" \
16
+ org.opencontainers.image.licenses="Apache-2.0"
17
+
18
+ COPY pyproject.toml README.md LICENSE ./
19
+ COPY agentcrawl ./agentcrawl
20
+
21
+ RUN python -m pip install --no-cache-dir --upgrade pip \
22
+ && python -m pip install --no-cache-dir -e ".[server,mcp]" \
23
+ && useradd --create-home --uid 10001 agentcrawl \
24
+ && mkdir -p /data \
25
+ && chown -R agentcrawl:agentcrawl /app /data
26
+
27
+ USER agentcrawl
28
+
29
+ VOLUME ["/data"]
30
+ EXPOSE 8000
31
+
32
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
33
+ CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()"
34
+
35
+ CMD ["uvicorn", "agentcrawl.server:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -0,0 +1,170 @@
1
+ # Install AgentCrawl For An Agent Client
2
+
3
+ This file is the canonical installation procedure for coding agents. When a user asks you to inspect this repository, install AgentCrawl, connect it to your tools, or make it the default scraper, execute the procedure. Do not only explain commands.
4
+
5
+ ## Goal
6
+
7
+ Install AgentCrawl, verify direct scraping, register its standards-based stdio MCP server in your own client configuration, reload when required, and verify `scrape_url`.
8
+
9
+ ## Safety Rules
10
+
11
+ - Never print API keys or copy secrets into chat.
12
+ - Do not overwrite unrelated MCP servers or client settings.
13
+ - Back up a configuration file before modifying it.
14
+ - Prefer the client's own supported MCP command or structured configuration format.
15
+ - Do not install Playwright browsers unless the user requests browser fallback or the environment needs it. HTTP extraction works without a browser runtime.
16
+
17
+ ## 1. Install
18
+
19
+ For a published release:
20
+
21
+ ```bash
22
+ python -m pip install "agentcrawl-ai[browser]"
23
+ ```
24
+
25
+ For a repository checkout before a release is published:
26
+
27
+ ```bash
28
+ python -m pip install -e ".[browser]"
29
+ ```
30
+
31
+ The base package uses HTTP and does not install a browser. Install other capabilities only when needed:
32
+
33
+ ```bash
34
+ python -m pip install "agentcrawl-ai[mcp]"      # stdio MCP server
35
+ python -m pip install "agentcrawl-ai[docs]" # local PDF ingestion
36
+ python -m pip install "agentcrawl-ai[browser]"
37
+ playwright install chromium
38
+ ```
39
+
40
+ Inspect the installation:
41
+
42
+ ```bash
43
+ agentcrawl doctor
44
+ ```
45
+
46
+ `doctor` reports installed extras, Python/command discovery, local scrape health,
47
+ and optional remote API health when `AGENTCRAWL_BASE_URL` is set. It only reports
48
+ whether an API key is configured and never prints secret values.
49
+
50
+ ## 2. Verify Direct Scraping
51
+
52
+ ```bash
53
+ agentcrawl scrape https://example.com
54
+ ```
55
+
56
+ Success requires non-empty content containing `Example Domain`. Fix installation or network errors before configuring MCP.
57
+
58
+ ## 3. Register The MCP Server
59
+
60
+ Current stdio launcher:
61
+
62
+ ```text
63
+ command: agentcrawl
64
+ args: ["mcp"]
65
+ ```
66
+
67
+ Equivalent generic MCP configuration:
68
+
69
+ ```json
70
+ {
71
+ "mcpServers": {
72
+ "agentcrawl": {
73
+ "command": "agentcrawl",
74
+ "args": ["mcp"]
75
+ }
76
+ }
77
+ }
78
+ ```
79
+
80
+ Use your client's own supported registration mechanism. Inspect its existing configuration or CLI help instead of guessing a path. Preserve all unrelated settings.
81
+
82
+ Without `AGENTCRAWL_BASE_URL`, the MCP server runs the local HTTP scraper directly and needs no separate API process. If AgentCrawl is a remote HTTP service, set environment variables on the MCP server process:
83
+
84
+ ```json
85
+ {
86
+ "mcpServers": {
87
+ "agentcrawl": {
88
+ "command": "agentcrawl",
89
+ "args": ["mcp"],
90
+ "env": {
91
+ "AGENTCRAWL_BASE_URL": "https://agentcrawl.example.com",
92
+ "AGENTCRAWL_API_KEY": "<secret>"
93
+ }
94
+ }
95
+ }
96
+ }
97
+ ```
98
+
99
+ Store real secrets using the client's credential mechanism or protected environment files. Never commit them.
100
+
101
+ ## 4. Reload And Verify
102
+
103
+ Reload or restart your client if it does not hot-reload MCP configuration. Verify that the `agentcrawl` server exposes at least:
104
+
105
+ ```text
106
+ scrape_url
107
+ map_site
108
+ crawl_site
109
+ get_job
110
+ cancel_job
111
+ job_events
112
+ inspect_failures
113
+ retry_failures
114
+ usage
115
+ cache_stats
116
+ clear_cache
117
+ ```
118
+
119
+ Call `scrape_url` with:
120
+
121
+ ```json
122
+ {
123
+ "url": "https://example.com",
124
+ "formats": ["markdown", "metadata"]
125
+ }
126
+ ```
127
+
128
+ Success requires clean Markdown containing `Example Domain`.
129
+
130
+ ## 5. Normal Tool Selection
131
+
132
+ After registration:
133
+
134
+ - Use `scrape_url` for one known URL.
135
+ - Use `map_site` to discover site URLs without scraping all pages.
136
+ - Use `crawl_site` for bounded multi-page extraction.
137
+ - For asynchronous crawl jobs, provide a stable idempotency key, keep the returned `job_id`, and poll `get_job`; do not start duplicates.
138
+ - A queued job with a future `available_at` is waiting for persisted backoff, not stuck.
139
+ - Read large completed crawls page by page with `offset` and `limit` until `has_more` is false.
140
+ - Use browser automation only for interactive actions, not ordinary page reading.
141
+ - Use another extractor only after AgentCrawl returns a definitive error.
142
+
143
+ ## Operator Backup
144
+
145
+ Before deploying over an existing server database, run:
146
+
147
+ ```bash
148
+ agentcrawl backup --db /path/to/agentcrawl.db --output-dir /path/to/backups
149
+ ```
150
+
151
+ Use `--env-file /path/to/agentcrawl.env` only when an environment file should be
152
+ copied into the backup directory. Do not print or paste its contents.
153
+
154
+ Restore only with the service stopped and only from a verified backup:
155
+
156
+ ```bash
157
+ agentcrawl restore --backup-db /path/to/backup.db --db /path/to/agentcrawl.db --force
158
+ ```
159
+
160
+ ## Completion Report
161
+
162
+ Report only:
163
+
164
+ - installation method and version;
165
+ - whether direct scraping succeeded;
166
+ - whether MCP registration succeeded;
167
+ - number of AgentCrawl tools discovered;
168
+ - whether the functional `scrape_url` test succeeded.
169
+
170
+ Do not include credentials.