agentcrawl-ai 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentcrawl_ai-0.1.0/.dockerignore +15 -0
- agentcrawl_ai-0.1.0/.env.example +28 -0
- agentcrawl_ai-0.1.0/.github/workflows/ci.yml +42 -0
- agentcrawl_ai-0.1.0/.github/workflows/docker.yml +68 -0
- agentcrawl_ai-0.1.0/.gitignore +13 -0
- agentcrawl_ai-0.1.0/.hermes/plans/2026-06-11_000405-agentcrawl-docs-cleanup.md +239 -0
- agentcrawl_ai-0.1.0/AGENTS.md +3 -0
- agentcrawl_ai-0.1.0/CHANGELOG.md +19 -0
- agentcrawl_ai-0.1.0/CLAUDE.md +3 -0
- agentcrawl_ai-0.1.0/CONTRIBUTING.md +10 -0
- agentcrawl_ai-0.1.0/Dockerfile +35 -0
- agentcrawl_ai-0.1.0/INSTALL_FOR_AGENTS.md +170 -0
- agentcrawl_ai-0.1.0/LICENSE +202 -0
- agentcrawl_ai-0.1.0/PKG-INFO +356 -0
- agentcrawl_ai-0.1.0/README.md +299 -0
- agentcrawl_ai-0.1.0/ROADMAP.md +52 -0
- agentcrawl_ai-0.1.0/SECURITY.md +20 -0
- agentcrawl_ai-0.1.0/agentcrawl/__init__.py +30 -0
- agentcrawl_ai-0.1.0/agentcrawl/__main__.py +3 -0
- agentcrawl_ai-0.1.0/agentcrawl/cli.py +401 -0
- agentcrawl_ai-0.1.0/agentcrawl/client.py +77 -0
- agentcrawl_ai-0.1.0/agentcrawl/config.py +86 -0
- agentcrawl_ai-0.1.0/agentcrawl/config_adapter.py +52 -0
- agentcrawl_ai-0.1.0/agentcrawl/crawler.py +538 -0
- agentcrawl_ai-0.1.0/agentcrawl/documents.py +113 -0
- agentcrawl_ai-0.1.0/agentcrawl/errors.py +22 -0
- agentcrawl_ai-0.1.0/agentcrawl/exceptions.py +10 -0
- agentcrawl_ai-0.1.0/agentcrawl/extraction.py +87 -0
- agentcrawl_ai-0.1.0/agentcrawl/fetchers.py +262 -0
- agentcrawl_ai-0.1.0/agentcrawl/graph.py +110 -0
- agentcrawl_ai-0.1.0/agentcrawl/graphs/__init__.py +5 -0
- agentcrawl_ai-0.1.0/agentcrawl/graphs/extraction.py +28 -0
- agentcrawl_ai-0.1.0/agentcrawl/graphs/multi.py +29 -0
- agentcrawl_ai-0.1.0/agentcrawl/graphs/search_graph.py +25 -0
- agentcrawl_ai-0.1.0/agentcrawl/html_tools.py +236 -0
- agentcrawl_ai-0.1.0/agentcrawl/llm.py +44 -0
- agentcrawl_ai-0.1.0/agentcrawl/mcp_server.py +299 -0
- agentcrawl_ai-0.1.0/agentcrawl/models.py +68 -0
- agentcrawl_ai-0.1.0/agentcrawl/parsing.py +407 -0
- agentcrawl_ai-0.1.0/agentcrawl/remote_client.py +222 -0
- agentcrawl_ai-0.1.0/agentcrawl/search.py +73 -0
- agentcrawl_ai-0.1.0/agentcrawl/security.py +39 -0
- agentcrawl_ai-0.1.0/agentcrawl/serializers.py +16 -0
- agentcrawl_ai-0.1.0/agentcrawl/server.py +746 -0
- agentcrawl_ai-0.1.0/agentcrawl/state.py +20 -0
- agentcrawl_ai-0.1.0/agentcrawl/storage.py +991 -0
- agentcrawl_ai-0.1.0/agentcrawl/utils.py +98 -0
- agentcrawl_ai-0.1.0/assets/favicon-512.png +0 -0
- agentcrawl_ai-0.1.0/assets/github-social-preview.png +0 -0
- agentcrawl_ai-0.1.0/assets/logo-mark.png +0 -0
- agentcrawl_ai-0.1.0/assets/readme-hero.png +0 -0
- agentcrawl_ai-0.1.0/benchmarks/quality_report.py +211 -0
- agentcrawl_ai-0.1.0/docker-compose.yml +19 -0
- agentcrawl_ai-0.1.0/docs/COMPARISON.md +62 -0
- agentcrawl_ai-0.1.0/docs/EXAMPLES.md +131 -0
- agentcrawl_ai-0.1.0/docs/OPERATIONS.md +172 -0
- agentcrawl_ai-0.1.0/docs/QUALITY_BENCHMARKS.md +104 -0
- agentcrawl_ai-0.1.0/docs/RELEASE.md +111 -0
- agentcrawl_ai-0.1.0/examples/api_scrape.md +30 -0
- agentcrawl_ai-0.1.0/examples/basic.py +9 -0
- agentcrawl_ai-0.1.0/examples/browser_rendered.md +29 -0
- agentcrawl_ai-0.1.0/examples/cli_scrape.md +38 -0
- agentcrawl_ai-0.1.0/examples/docker.md +34 -0
- agentcrawl_ai-0.1.0/examples/graph_extraction.py +18 -0
- agentcrawl_ai-0.1.0/examples/mcp.md +47 -0
- agentcrawl_ai-0.1.0/examples/python_scrape.py +9 -0
- agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/README.md +29 -0
- agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/__init__.py +10 -0
- agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/plugin.yaml +7 -0
- agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/provider.py +116 -0
- agentcrawl_ai-0.1.0/integrations/hermes/web-agentcrawl/tests/test_provider.py +45 -0
- agentcrawl_ai-0.1.0/pyproject.toml +89 -0
- agentcrawl_ai-0.1.0/tests/conftest.py +33 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/api_reference.html +24 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/article.html +23 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/blog.html +21 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/canonical.html +24 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/complex_product.html +40 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/documentation.html +24 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/ecommerce.html +25 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/forum.html +21 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/media_article.html +18 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/messy_docs.html +39 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/nested_sidebar_docs.html +40 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/noisy_article.html +26 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/spa_rendered.html +19 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/spa_shell.html +9 -0
- agentcrawl_ai-0.1.0/tests/fixtures/quality/table.html +24 -0
- agentcrawl_ai-0.1.0/tests/test_agentcrawl.py +569 -0
- agentcrawl_ai-0.1.0/tests/test_cli.py +153 -0
- agentcrawl_ai-0.1.0/tests/test_core.py +226 -0
- agentcrawl_ai-0.1.0/tests/test_deployment.py +61 -0
- agentcrawl_ai-0.1.0/tests/test_documents.py +81 -0
- agentcrawl_ai-0.1.0/tests/test_fetchers.py +228 -0
- agentcrawl_ai-0.1.0/tests/test_mcp_server.py +94 -0
- agentcrawl_ai-0.1.0/tests/test_quality_fixtures.py +296 -0
- agentcrawl_ai-0.1.0/tests/test_quality_report.py +44 -0
- agentcrawl_ai-0.1.0/tests/test_redirect_security.py +60 -0
- agentcrawl_ai-0.1.0/tests/test_remote_client.py +47 -0
- agentcrawl_ai-0.1.0/tests/test_security.py +46 -0
- agentcrawl_ai-0.1.0/tests/test_server.py +624 -0
- agentcrawl_ai-0.1.0/tests/test_storage.py +85 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# AgentCrawl server
|
|
2
|
+
AGENTCRAWL_AUTH_ENABLED=true
|
|
3
|
+
AGENTCRAWL_ALLOW_LOCAL_FILES=false
|
|
4
|
+
AGENTCRAWL_ALLOW_PRIVATE_NETWORK=false
|
|
5
|
+
AGENTCRAWL_API_KEYS=change-me-dev-key
|
|
6
|
+
AGENTCRAWL_OWNER_API_KEYS=change-me-dev-key
|
|
7
|
+
AGENTCRAWL_DB=/data/agentcrawl.db
|
|
8
|
+
AGENTCRAWL_FETCHER=http
|
|
9
|
+
AGENTCRAWL_HEADLESS=true
|
|
10
|
+
AGENTCRAWL_TIMEOUT_MS=30000
|
|
11
|
+
AGENTCRAWL_HTTP_RETRIES=2
|
|
12
|
+
AGENTCRAWL_HTTP_RETRY_DELAY=1.0
|
|
13
|
+
AGENTCRAWL_BROWSER_FALLBACK=false
|
|
14
|
+
AGENTCRAWL_DOMAIN_MIN_DELAY=0.35
|
|
15
|
+
AGENTCRAWL_DOMAIN_MAX_CONCURRENCY=2
|
|
16
|
+
AGENTCRAWL_CACHE_ENABLED=true
|
|
17
|
+
AGENTCRAWL_CACHE_TTL_SECONDS=86400
|
|
18
|
+
AGENTCRAWL_CRAWL_DEPTH=1
|
|
19
|
+
AGENTCRAWL_CRAWL_MAX_PAGES=25
|
|
20
|
+
AGENTCRAWL_RESPECT_ROBOTS_TXT=true
|
|
21
|
+
AGENTCRAWL_WORKERS=4
|
|
22
|
+
AGENTCRAWL_CRAWL_JOB_PAGE_QUANTUM=5
|
|
23
|
+
AGENTCRAWL_RATE_LIMIT_PER_MINUTE=60
|
|
24
|
+
AGENTCRAWL_USER_AGENT=Mozilla/5.0 (compatible; AgentCrawl/0.1; +https://agentcrawl.local)
|
|
25
|
+
|
|
26
|
+
# Remote client / MCP server
|
|
27
|
+
AGENTCRAWL_BASE_URL=http://127.0.0.1:8000
|
|
28
|
+
AGENTCRAWL_API_KEY=change-me-dev-key
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
test:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
strategy:
|
|
15
|
+
fail-fast: false
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.10", "3.12"]
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v6
|
|
21
|
+
- uses: actions/setup-python@v6
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
cache: pip
|
|
25
|
+
- name: Install
|
|
26
|
+
run: |
|
|
27
|
+
python -m pip install --upgrade pip
|
|
28
|
+
python -m pip install -e ".[server,mcp,dev]"
|
|
29
|
+
- name: Ruff
|
|
30
|
+
run: python -m ruff check agentcrawl tests examples benchmarks
|
|
31
|
+
- name: Format
|
|
32
|
+
run: python -m ruff format --check agentcrawl tests examples benchmarks
|
|
33
|
+
- name: Tests
|
|
34
|
+
run: python -m pytest -q
|
|
35
|
+
- name: Compile
|
|
36
|
+
run: python -m compileall -q agentcrawl benchmarks
|
|
37
|
+
|
|
38
|
+
docker:
|
|
39
|
+
runs-on: ubuntu-latest
|
|
40
|
+
steps:
|
|
41
|
+
- uses: actions/checkout@v6
|
|
42
|
+
- run: docker build -t agentcrawl-community:ci .
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
name: Build and Push Docker Image
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
tags: ['v*']
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main]
|
|
9
|
+
workflow_dispatch:
|
|
10
|
+
|
|
11
|
+
env:
|
|
12
|
+
REGISTRY: ghcr.io
|
|
13
|
+
IMAGE_NAME: ${{ github.repository }}
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
build-and-push:
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
permissions:
|
|
19
|
+
contents: read
|
|
20
|
+
packages: write
|
|
21
|
+
|
|
22
|
+
steps:
|
|
23
|
+
- name: Checkout repository
|
|
24
|
+
uses: actions/checkout@v6
|
|
25
|
+
|
|
26
|
+
- name: Set up Docker Buildx
|
|
27
|
+
uses: docker/setup-buildx-action@v3
|
|
28
|
+
|
|
29
|
+
- name: Log in to Container Registry
|
|
30
|
+
if: github.event_name != 'pull_request'
|
|
31
|
+
uses: docker/login-action@v3
|
|
32
|
+
with:
|
|
33
|
+
registry: ${{ env.REGISTRY }}
|
|
34
|
+
username: ${{ github.actor }}
|
|
35
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
36
|
+
|
|
37
|
+
- name: Extract metadata
|
|
38
|
+
id: meta
|
|
39
|
+
uses: docker/metadata-action@v5
|
|
40
|
+
with:
|
|
41
|
+
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
|
|
42
|
+
tags: |
|
|
43
|
+
type=ref,event=branch
|
|
44
|
+
type=ref,event=pr
|
|
45
|
+
type=semver,pattern={{version}}
|
|
46
|
+
type=semver,pattern={{major}}.{{minor}}
|
|
47
|
+
type=sha,prefix=,suffix=,format=short
|
|
48
|
+
type=raw,value=latest,enable={{is_default_branch}}
|
|
49
|
+
|
|
50
|
+
- name: Build local image for smoke test
|
|
51
|
+
uses: docker/build-push-action@v6
|
|
52
|
+
with:
|
|
53
|
+
context: .
|
|
54
|
+
load: true
|
|
55
|
+
tags: agentcrawl:smoke
|
|
56
|
+
|
|
57
|
+
- name: Smoke test Docker image
|
|
58
|
+
run: |
|
|
59
|
+
docker run --rm agentcrawl:smoke agentcrawl --version
|
|
60
|
+
docker run --rm agentcrawl:smoke agentcrawl doctor
|
|
61
|
+
|
|
62
|
+
- name: Build and push Docker image
|
|
63
|
+
uses: docker/build-push-action@v6
|
|
64
|
+
with:
|
|
65
|
+
context: .
|
|
66
|
+
push: ${{ github.event_name != 'pull_request' }}
|
|
67
|
+
tags: ${{ steps.meta.outputs.tags }}
|
|
68
|
+
labels: ${{ steps.meta.outputs.labels }}
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# Plan — limpieza y refinado de documentación AgentCrawl Community
|
|
2
|
+
|
|
3
|
+
Fecha: 2026-06-11
|
|
4
|
+
Repo público: `/home/orgej/Proyectos/agentcrawl`
|
|
5
|
+
Docs privados: `/home/orgej/Proyectos/agentcrawl-private-docs`
|
|
6
|
+
Estado base verificado:
|
|
7
|
+
|
|
8
|
+
```text
|
|
9
|
+
Branch: main
|
|
10
|
+
Estado: limpio y sincronizado con origin/main
|
|
11
|
+
HEAD: ea897d3 Improve README conversion quickstart
|
|
12
|
+
CI: success
|
|
13
|
+
Docker/GHCR workflow: success
|
|
14
|
+
GHCR latest/main: ea897d3
|
|
15
|
+
VPS: no tocar; despliegue sigue aplazado
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Objetivo
|
|
19
|
+
|
|
20
|
+
Antes del Bloque 6 de ejemplos/cookbook, limpiar y alinear todos los Markdown para que el repo público parezca un producto Community serio y no un acumulado de fases internas.
|
|
21
|
+
|
|
22
|
+
Resultado buscado:
|
|
23
|
+
|
|
24
|
+
- Public docs coherentes entre sí.
|
|
25
|
+
- Nada privado/comercial sensible dentro del repo público.
|
|
26
|
+
- README fuerte pero no demasiado largo ni contradictorio.
|
|
27
|
+
- Release/roadmap/examples actualizados al estado real: PyPI preparado, GHCR ya publicado, Docker ligero, Phase 2 quality verde.
|
|
28
|
+
- Private handoff actualizado con el estado real posterior a los commits `9060705` y `ea897d3`.
|
|
29
|
+
|
|
30
|
+
## Hallazgos iniciales
|
|
31
|
+
|
|
32
|
+
### 1. Repo/GHCR están actualizados
|
|
33
|
+
|
|
34
|
+
- `origin/main` está en `ea897d3`.
|
|
35
|
+
- GitHub CI y Docker workflow verdes.
|
|
36
|
+
- GHCR contiene tags `latest`, `main`, `ea897d3`.
|
|
37
|
+
|
|
38
|
+
### 2. Drift privado importante
|
|
39
|
+
|
|
40
|
+
Los docs privados siguen diciendo que el estado canónico era:
|
|
41
|
+
|
|
42
|
+
```text
|
|
43
|
+
1b0d460 Add browser rendered SPA quality coverage
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Pero el estado real ahora es:
|
|
47
|
+
|
|
48
|
+
```text
|
|
49
|
+
ea897d3 Improve README conversion quickstart
|
|
50
|
+
9060705 Prepare lightweight GHCR Docker image
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Archivos privados afectados:
|
|
54
|
+
|
|
55
|
+
- `/home/orgej/Proyectos/agentcrawl-private-docs/INDEX.md`
|
|
56
|
+
- `/home/orgej/Proyectos/agentcrawl-private-docs/HANDOFF.md`
|
|
57
|
+
- `/home/orgej/Proyectos/agentcrawl-private-docs/SESSION_CONTINUITY_2026-06-10.md`
|
|
58
|
+
|
|
59
|
+
### 3. Public docs tienen wording viejo o inconsistente
|
|
60
|
+
|
|
61
|
+
Puntos detectados:
|
|
62
|
+
|
|
63
|
+
- `ROADMAP.md` aún lista “public GHCR image” como prioridad pendiente, aunque ya está publicado.
|
|
64
|
+
- `CHANGELOG.md` no menciona claramente Docker ligero, GHCR publicado, README conversion quickstart ni fixture count actual.
|
|
65
|
+
- `docs/RELEASE.md` describe GHCR como “expected public image path after publication”; ahora ya está publicado.
|
|
66
|
+
- `docs/EXAMPLES.md` mezcla ejemplos actuales con una sección “Additional Examples On The Roadmap” que justo coincide con Bloque 6; hay que convertir eso en archivos reales o lista limpia de pendientes.
|
|
67
|
+
- `docs/OPERATIONS.md` usa `docker compose up --build -d`; para usuarios públicos debería preferir la imagen GHCR y dejar build local como opción de desarrollo.
|
|
68
|
+
- `docs/EXAMPLES.md` tiene placeholders tipo `$AGENT...KEY` / `Bearer` que conviene normalizar para evitar filtrados raros y confusión.
|
|
69
|
+
- `COMMUNITY_VS_ENHANCED.md` en el repo público aparece vacío. Hay que decidir: eliminarlo si no está trackeado/relevante, o llenarlo con una versión pública mínima sin estrategia privada.
|
|
70
|
+
|
|
71
|
+
### 4. README ya mejoró, pero necesita un pase de consistencia
|
|
72
|
+
|
|
73
|
+
El README tiene buena conversión inicial, pero antes de seguir hay que revisar:
|
|
74
|
+
|
|
75
|
+
- longitud y orden;
|
|
76
|
+
- placeholders de API key;
|
|
77
|
+
- que Docker use GHCR como camino principal;
|
|
78
|
+
- que no prometa browser fallback dentro de la imagen default;
|
|
79
|
+
- que “Community includes” no regale Enhanced ni prometa cloud.
|
|
80
|
+
|
|
81
|
+
## Plan por bloques
|
|
82
|
+
|
|
83
|
+
### Bloque A — Normalización pública mínima
|
|
84
|
+
|
|
85
|
+
Archivos:
|
|
86
|
+
|
|
87
|
+
- `README.md`
|
|
88
|
+
- `ROADMAP.md`
|
|
89
|
+
- `CHANGELOG.md`
|
|
90
|
+
- `docs/RELEASE.md`
|
|
91
|
+
- `docs/OPERATIONS.md`
|
|
92
|
+
- `docs/QUALITY_BENCHMARKS.md`
|
|
93
|
+
- `docs/COMPARISON.md`
|
|
94
|
+
|
|
95
|
+
Acciones:
|
|
96
|
+
|
|
97
|
+
1. Actualizar ROADMAP:
|
|
98
|
+
- Mover GHCR publicado a Completed.
|
|
99
|
+
- Dejar pendiente solo PyPI publication, examples/cookbook, live smoke targets y docs finales.
|
|
100
|
+
- No hablar de “Phase 2” como si fuera interno; usar “Extraction quality hardening”.
|
|
101
|
+
|
|
102
|
+
2. Actualizar CHANGELOG:
|
|
103
|
+
- Añadir GHCR lightweight Docker image.
|
|
104
|
+
- Añadir Docker workflow smoke tests.
|
|
105
|
+
- Añadir README conversion quickstart.
|
|
106
|
+
- Añadir quality report 14 fixtures / threshold 85 / avg green sin claim competitivo.
|
|
107
|
+
|
|
108
|
+
3. Actualizar RELEASE:
|
|
109
|
+
- Cambiar “expected image path” por “published image path”.
|
|
110
|
+
- Incluir verificación GHCR:
|
|
111
|
+
```bash
|
|
112
|
+
docker pull ghcr.io/jorg18/agentcrawl:latest
|
|
113
|
+
docker run --rm ghcr.io/jorg18/agentcrawl:latest agentcrawl --version
|
|
114
|
+
docker run --rm ghcr.io/jorg18/agentcrawl:latest agentcrawl doctor
|
|
115
|
+
```
|
|
116
|
+
- Mantener PyPI como pendiente si no hay token/publicación final.
|
|
117
|
+
|
|
118
|
+
4. Actualizar OPERATIONS:
|
|
119
|
+
- Primero GHCR image.
|
|
120
|
+
- Luego Compose local/dev.
|
|
121
|
+
- Dejar `docker compose up --build` como desarrollo, no como instalación principal.
|
|
122
|
+
|
|
123
|
+
5. Revisar COMPARISON:
|
|
124
|
+
- Asegurar que no parece que ya tenemos todo lo de competidores.
|
|
125
|
+
- Mantener tono: “fits when…” sin claims no probados.
|
|
126
|
+
|
|
127
|
+
6. Revisar QUALITY_BENCHMARKS:
|
|
128
|
+
- Confirmar que refleja 14 fixtures.
|
|
129
|
+
- Explicar que benchmarks competitivos quedan diferidos.
|
|
130
|
+
|
|
131
|
+
### Bloque B — Examples/cookbook preparación antes de crear ejemplos reales
|
|
132
|
+
|
|
133
|
+
Archivos actuales/potenciales:
|
|
134
|
+
|
|
135
|
+
- `docs/EXAMPLES.md`
|
|
136
|
+
- `examples/basic.py`
|
|
137
|
+
- `examples/graph_extraction.py`
|
|
138
|
+
- nuevos propuestos:
|
|
139
|
+
- `examples/cli_scrape.md`
|
|
140
|
+
- `examples/python_scrape.py`
|
|
141
|
+
- `examples/api_scrape.md`
|
|
142
|
+
- `examples/docker.md`
|
|
143
|
+
- `examples/mcp.md`
|
|
144
|
+
- `examples/browser_rendered.md`
|
|
145
|
+
|
|
146
|
+
Acciones:
|
|
147
|
+
|
|
148
|
+
1. Convertir `docs/EXAMPLES.md` en índice/cookbook principal.
|
|
149
|
+
2. Mover los ejemplos largos a archivos concretos.
|
|
150
|
+
3. Mantener 5-6 ejemplos buenos, no 20.
|
|
151
|
+
4. Cada ejemplo debe ser:
|
|
152
|
+
- copy-pasteable;
|
|
153
|
+
- probado si es ejecutable;
|
|
154
|
+
- claramente documental si requiere API/MCP externa.
|
|
155
|
+
5. Evitar `python -m pip install -e` en docs para usuarios finales salvo sección “from repo checkout”.
|
|
156
|
+
|
|
157
|
+
Validación:
|
|
158
|
+
|
|
159
|
+
- Ejecutar `python examples/python_scrape.py` si usa una fixture/local file o `https://example.com`.
|
|
160
|
+
- Smoke CLI contra `https://example.com`.
|
|
161
|
+
- Si API example requiere server, arrancar servidor temporal o documentarlo claramente.
|
|
162
|
+
|
|
163
|
+
### Bloque C — Private handoff synchronization
|
|
164
|
+
|
|
165
|
+
Archivos privados:
|
|
166
|
+
|
|
167
|
+
- `/home/orgej/Proyectos/agentcrawl-private-docs/INDEX.md`
|
|
168
|
+
- `/home/orgej/Proyectos/agentcrawl-private-docs/HANDOFF.md`
|
|
169
|
+
- `/home/orgej/Proyectos/agentcrawl-private-docs/SESSION_CONTINUITY_2026-06-10.md`
|
|
170
|
+
|
|
171
|
+
Acciones:
|
|
172
|
+
|
|
173
|
+
1. Actualizar estado canónico:
|
|
174
|
+
- HEAD `ea897d3`.
|
|
175
|
+
- GHCR publicado.
|
|
176
|
+
- README conversion quickstart hecho.
|
|
177
|
+
- Docker local ya no es requisito; GitHub Actions construye y publica.
|
|
178
|
+
2. Mantener política VPS:
|
|
179
|
+
- No tocar VPS aún.
|
|
180
|
+
- Despliegue dedicado con backup/smoke test más adelante.
|
|
181
|
+
3. Actualizar “next recommended work”:
|
|
182
|
+
- limpieza docs/refino;
|
|
183
|
+
- Bloque 6 examples/cookbook;
|
|
184
|
+
- docs finales;
|
|
185
|
+
- release tag/PyPI si hay token.
|
|
186
|
+
|
|
187
|
+
### Bloque D — Verificación final antes de commit
|
|
188
|
+
|
|
189
|
+
Ejecutar:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
.venv/bin/python -m compileall -q agentcrawl benchmarks
|
|
193
|
+
.venv/bin/python -m pytest -q
|
|
194
|
+
.venv/bin/ruff check agentcrawl tests benchmarks examples
|
|
195
|
+
.venv/bin/ruff format --check agentcrawl tests benchmarks examples
|
|
196
|
+
.venv/bin/python benchmarks/quality_report.py
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Además:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
git diff --stat
|
|
203
|
+
git diff --check
|
|
204
|
+
gh run list --repo JorG18/agentcrawl --limit 5
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
Si se toca Docker/GHCR docs solamente, no hace falta reconstruir imagen salvo que cambie Dockerfile/workflow.
|
|
208
|
+
|
|
209
|
+
### Bloque E — Commit/push limpio
|
|
210
|
+
|
|
211
|
+
Commit sugerido tras Bloque A/C:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
git add README.md ROADMAP.md CHANGELOG.md docs/*.md /home/orgej/Proyectos/agentcrawl-private-docs/*.md
|
|
215
|
+
git commit -m "Refine AgentCrawl Community release docs"
|
|
216
|
+
git push origin main
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
Nota: los docs privados no están en el repo público, así que probablemente se actualizan sin commit público. Reportar claramente:
|
|
220
|
+
|
|
221
|
+
- “Repo público commit X”.
|
|
222
|
+
- “Docs privados actualizados localmente”.
|
|
223
|
+
|
|
224
|
+
## Riesgos / cuidado
|
|
225
|
+
|
|
226
|
+
- No copiar estrategia privada de monetización al repo público.
|
|
227
|
+
- No prometer hosted/Enhanced como ya disponible.
|
|
228
|
+
- No publicar benchmarks competitivos aún.
|
|
229
|
+
- No tocar VPS.
|
|
230
|
+
- No reinstalar herramientas ni modificar Hermes.
|
|
231
|
+
- No sobreescribir MDs enteros sin re-leerlos completos, salvo README/plan donde se controle el diff.
|
|
232
|
+
|
|
233
|
+
## Orden recomendado de ejecución
|
|
234
|
+
|
|
235
|
+
1. Bloque A: limpiar docs públicos de estado/distribución.
|
|
236
|
+
2. Bloque C: sincronizar docs privados con estado real.
|
|
237
|
+
3. Validación rápida.
|
|
238
|
+
4. Commit/push si todo está limpio.
|
|
239
|
+
5. Después sí: Bloque 6 examples/cookbook.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 - Unreleased
|
|
4
|
+
|
|
5
|
+
- Initial AgentCrawl Community release candidate.
|
|
6
|
+
- CLI, Python library, HTTP API, Docker/GHCR image, and MCP integration.
|
|
7
|
+
- Local and HTTP scraping with optional browser/Camofox fallback.
|
|
8
|
+
- Main-content Markdown extraction with semantic container selection, text-rich fallback blocks, and boilerplate reduction.
|
|
9
|
+
- Markdown table preservation and fenced code blocks with language tags from common HTML classes.
|
|
10
|
+
- Local document ingestion for Markdown, text, JSON, XML/RSS/Atom, and PDF-to-Markdown through the optional `docs` extra.
|
|
11
|
+
- Mapping, crawling, persistent jobs, progress, cancellation, event history, crawl failures, and selective failure retries.
|
|
12
|
+
- Cache controls, usage reporting, operational stats, backup, and restore.
|
|
13
|
+
- Authentication, SSRF protections, unsafe redirect blocking, private-network controls, and safe server defaults.
|
|
14
|
+
- Safety baseline fixes for text normalization, sitemap discovery, PDF limits, scrape error behavior, URL validation, and crawl failure filtering.
|
|
15
|
+
- Package version export, wheel/sdist build verification, `twine check`, and clean install smoke tests for base/server/MCP/docs extras.
|
|
16
|
+
- Lightweight default Docker image based on `python:3.12-slim`, published through GHCR as `ghcr.io/jorg18/agentcrawl:latest`.
|
|
17
|
+
- GitHub Actions Docker workflow builds, smoke-tests, and publishes GHCR images for `main`, tags, and commit SHAs.
|
|
18
|
+
- README quickstart refreshed around CLI, Python, MCP, Docker/API, and Community scope.
|
|
19
|
+
- Quality report baseline: 14 checked-in fixtures, minimum score threshold 85, current local average 100.0, richer provenance metadata, JSON-LD/Product schema extraction, Markdown structure metrics, and noisy-layout coverage.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
1. Create a focused branch.
|
|
4
|
+
2. Keep changes scoped and avoid unrelated refactors.
|
|
5
|
+
3. Add tests for behavior changes.
|
|
6
|
+
4. Run `pytest -q` and `ruff check agentcrawl tests examples`.
|
|
7
|
+
5. Do not commit credentials, deployment addresses, databases, scraped content, or private commercial modules.
|
|
8
|
+
6. Explain behavior, security impact, and verification in the pull request.
|
|
9
|
+
|
|
10
|
+
By contributing, you agree that your contribution is licensed under Apache License 2.0.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
4
|
+
PYTHONUNBUFFERED=1 \
|
|
5
|
+
AGENTCRAWL_DB=/data/agentcrawl.db \
|
|
6
|
+
AGENTCRAWL_FETCHER=http \
|
|
7
|
+
AGENTCRAWL_AUTH_ENABLED=true \
|
|
8
|
+
AGENTCRAWL_ALLOW_LOCAL_FILES=false \
|
|
9
|
+
AGENTCRAWL_ALLOW_PRIVATE_NETWORK=false
|
|
10
|
+
|
|
11
|
+
WORKDIR /app
|
|
12
|
+
|
|
13
|
+
LABEL org.opencontainers.image.title="AgentCrawl" \
|
|
14
|
+
org.opencontainers.image.description="Self-hosted web extraction and Markdown crawling for AI agents." \
|
|
15
|
+
org.opencontainers.image.source="https://github.com/JorG18/agentcrawl" \
|
|
16
|
+
org.opencontainers.image.licenses="Apache-2.0"
|
|
17
|
+
|
|
18
|
+
COPY pyproject.toml README.md LICENSE ./
|
|
19
|
+
COPY agentcrawl ./agentcrawl
|
|
20
|
+
|
|
21
|
+
RUN python -m pip install --no-cache-dir --upgrade pip \
|
|
22
|
+
&& python -m pip install --no-cache-dir -e ".[server,mcp]" \
|
|
23
|
+
&& useradd --create-home --uid 10001 agentcrawl \
|
|
24
|
+
&& mkdir -p /data \
|
|
25
|
+
&& chown -R agentcrawl:agentcrawl /app /data
|
|
26
|
+
|
|
27
|
+
USER agentcrawl
|
|
28
|
+
|
|
29
|
+
VOLUME ["/data"]
|
|
30
|
+
EXPOSE 8000
|
|
31
|
+
|
|
32
|
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
|
33
|
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).read()"
|
|
34
|
+
|
|
35
|
+
CMD ["uvicorn", "agentcrawl.server:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# Install AgentCrawl For An Agent Client
|
|
2
|
+
|
|
3
|
+
This file is the canonical installation procedure for coding agents. When a user asks you to inspect this repository, install AgentCrawl, connect it to your tools, or make it the default scraper, execute the procedure. Do not only explain commands.
|
|
4
|
+
|
|
5
|
+
## Goal
|
|
6
|
+
|
|
7
|
+
Install AgentCrawl, verify direct scraping, register its standards-based stdio MCP server in your own client configuration, reload when required, and verify `scrape_url`.
|
|
8
|
+
|
|
9
|
+
## Safety Rules
|
|
10
|
+
|
|
11
|
+
- Never print API keys or copy secrets into chat.
|
|
12
|
+
- Do not overwrite unrelated MCP servers or client settings.
|
|
13
|
+
- Back up a configuration file before modifying it.
|
|
14
|
+
- Prefer the client own supported MCP command or structured configuration format.
|
|
15
|
+
- Do not install Playwright browsers unless the user requests browser fallback or the environment needs it. HTTP extraction works without a browser runtime.
|
|
16
|
+
|
|
17
|
+
## 1. Install
|
|
18
|
+
|
|
19
|
+
For a published release:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python -m pip install "agentcrawl-ai[browser]"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
For a repository checkout before a release is published:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
python -m pip install "agentcrawl-ai[browser]"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
The base package uses HTTP and does not install a browser. Install other capabilities only when needed:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
python -m pip install "agentcrawl-ai[browser]"
|
|
35
|
+
python -m pip install "agentcrawl-ai[docs]" # local PDF ingestion
|
|
36
|
+
python -m pip install "agentcrawl-ai[browser]"
|
|
37
|
+
playwright install chromium
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Inspect the installation:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
agentcrawl doctor
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
`doctor` reports installed extras, Python/command discovery, local scrape health,
|
|
47
|
+
and optional remote API health when `AGENTCRAWL_BASE_URL` is set. It only reports
|
|
48
|
+
whether an API key is configured and never prints secret values.
|
|
49
|
+
|
|
50
|
+
## 2. Verify Direct Scraping
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
agentcrawl scrape https://example.com
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Success requires non-empty content containing `Example Domain`. Fix installation or network errors before configuring MCP.
|
|
57
|
+
|
|
58
|
+
## 3. Register The MCP Server
|
|
59
|
+
|
|
60
|
+
Current stdio launcher:
|
|
61
|
+
|
|
62
|
+
```text
|
|
63
|
+
command: agentcrawl
|
|
64
|
+
args: ["mcp"]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Equivalent generic MCP configuration:
|
|
68
|
+
|
|
69
|
+
```json
|
|
70
|
+
{
|
|
71
|
+
"mcpServers": {
|
|
72
|
+
"agentcrawl": {
|
|
73
|
+
"command": "agentcrawl",
|
|
74
|
+
"args": ["mcp"]
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Use your own client supported registration mechanism. Inspect its existing configuration or CLI help instead of guessing a path. Preserve all unrelated settings.
|
|
81
|
+
|
|
82
|
+
Without `AGENTCRAWL_BASE_URL`, the MCP server runs the local HTTP scraper directly and needs no separate API process. If AgentCrawl is a remote HTTP service, set environment variables on the MCP server process:
|
|
83
|
+
|
|
84
|
+
```json
|
|
85
|
+
{
|
|
86
|
+
"mcpServers": {
|
|
87
|
+
"agentcrawl": {
|
|
88
|
+
"command": "agentcrawl",
|
|
89
|
+
"args": ["mcp"],
|
|
90
|
+
"env": {
|
|
91
|
+
"AGENTCRAWL_BASE_URL": "https://agentcrawl.example.com",
|
|
92
|
+
"AGENTCRAWL_API_KEY": "<secret>"
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Store real secrets using the client credential mechanism or protected environment files. Never commit them.
|
|
100
|
+
|
|
101
|
+
## 4. Reload And Verify
|
|
102
|
+
|
|
103
|
+
Reload or restart your client if it does not hot-reload MCP configuration. Verify that the `agentcrawl` server exposes at least:
|
|
104
|
+
|
|
105
|
+
```text
|
|
106
|
+
scrape_url
|
|
107
|
+
map_site
|
|
108
|
+
crawl_site
|
|
109
|
+
get_job
|
|
110
|
+
cancel_job
|
|
111
|
+
job_events
|
|
112
|
+
inspect_failures
|
|
113
|
+
retry_failures
|
|
114
|
+
usage
|
|
115
|
+
cache_stats
|
|
116
|
+
clear_cache
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Call `scrape_url` with:
|
|
120
|
+
|
|
121
|
+
```json
|
|
122
|
+
{
|
|
123
|
+
"url": "https://example.com",
|
|
124
|
+
"formats": ["markdown", "metadata"]
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Success requires clean Markdown containing `Example Domain`.
|
|
129
|
+
|
|
130
|
+
## 5. Normal Tool Selection
|
|
131
|
+
|
|
132
|
+
After registration:
|
|
133
|
+
|
|
134
|
+
- Use `scrape_url` for one known URL.
|
|
135
|
+
- Use `map_site` to discover site URLs without scraping all pages.
|
|
136
|
+
- Use `crawl_site` for bounded multi-page extraction.
|
|
137
|
+
- For asynchronous crawl jobs, provide a stable idempotency key, keep the returned `job_id`, and poll `get_job`; do not start duplicates.
|
|
138
|
+
- A queued job with a future `available_at` is waiting for persisted backoff, not stuck.
|
|
139
|
+
- Read large completed crawls page by page with `offset` and `limit` until `has_more` is false.
|
|
140
|
+
- Use browser automation only for interactive actions, not ordinary page reading.
|
|
141
|
+
- Use another extractor only after AgentCrawl returns a definitive error.
|
|
142
|
+
|
|
143
|
+
## Operator Backup
|
|
144
|
+
|
|
145
|
+
Before deploying over an existing server database, run:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
agentcrawl backup --db /path/to/agentcrawl.db --output-dir /path/to/backups
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Use `--env-file /path/to/agentcrawl.env` only when an environment file should be
|
|
152
|
+
copied into the backup directory. Do not print or paste its contents.
|
|
153
|
+
|
|
154
|
+
Restore only with the service stopped and only from a verified backup:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
agentcrawl restore --backup-db /path/to/backup.db --db /path/to/agentcrawl.db --force
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Completion Report
|
|
161
|
+
|
|
162
|
+
Report only:
|
|
163
|
+
|
|
164
|
+
- installation method and version;
|
|
165
|
+
- whether direct scraping succeeded;
|
|
166
|
+
- whether MCP registration succeeded;
|
|
167
|
+
- number of AgentCrawl tools discovered;
|
|
168
|
+
- whether the functional `scrape_url` test succeeded.
|
|
169
|
+
|
|
170
|
+
Do not include credentials.
|