evalcraft 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evalcraft-0.2.0 → evalcraft-0.3.0}/CHANGELOG.md +10 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/PKG-INFO +4 -2
- {evalcraft-0.2.0 → evalcraft-0.3.0}/README.md +3 -1
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/index.md +3 -1
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/changelog.md +7 -0
- evalcraft-0.3.0/docs/user-guide/check-stale.md +95 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/cli.md +19 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/__init__.py +5 -1
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/main.py +105 -3
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/evalcraft.toml +1 -1
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cloud/client.py +31 -14
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/core/models.py +18 -9
- evalcraft-0.3.0/evalcraft/staleness/__init__.py +25 -0
- evalcraft-0.3.0/evalcraft/staleness/checker.py +232 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/mkdocs.yml +1 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/pyproject.toml +1 -1
- {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/seed_demo.py +3 -3
- {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/smoke_test.py +1 -1
- evalcraft-0.3.0/site/CNAME +1 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/site/index.html +10 -10
- evalcraft-0.3.0/tests/test_check_stale.py +161 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_cloud.py +18 -7
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_e2e_pipeline.py +1 -1
- evalcraft-0.2.0/site/CNAME +0 -1
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.env.example +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/design-partner-feedback.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/workflows/ci.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/workflows/publish.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/.gitignore +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/CONTRIBUTING.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/LICENSE +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/Makefile +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/action.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/Dockerfile +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/env.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/script.py.mako +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/versions/.gitkeep +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/versions/001_initial.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic.ini +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/auth.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/cassettes.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/golden_sets.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/projects.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/regressions.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/webhooks.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/cache.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/config.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/database.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/logging_config.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/main.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/cassette.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/golden_set.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/project.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/regression.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/user.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/schemas/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/schemas/api.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/alert_service.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/analytics_service.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/cassette_service.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/regression_service.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/entrypoint.sh +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/requirements.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/conftest.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_auth.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_cassettes.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_golden_sets.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_projects.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_regressions.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_webhooks.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/.gitignore +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/Dockerfile +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/README.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/eslint.config.js +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/index.html +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/nginx.conf +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/package-lock.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/package.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/public/vite.svg +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/App.css +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/App.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/__tests__/ErrorBoundary.test.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/__tests__/Skeleton.test.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/__tests__/api.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/assets/react.svg +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/CreateGoldenSetModal.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/CreateProjectModal.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/ErrorBoundary.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Layout.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/MetricCard.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Sidebar.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Skeleton.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/StatusBadge.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Toast.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/UploadCassetteModal.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/context/AuthContext.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/hooks/useApi.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/index.css +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/main.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Analytics.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/CassetteDetail.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Cassettes.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Dashboard.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/GoldenSetDetail.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/GoldenSets.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Login.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Regressions.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Settings.tsx +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/services/api.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/test-setup.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/tsconfig.app.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/tsconfig.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/tsconfig.node.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/vite.config.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/vitest.config.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docker-compose.production.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docker-compose.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/logo.png +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/anthropic.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/crewai.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/langgraph.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/openai.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/capture.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/ci-cd.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/concepts.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/five-minute-case-study.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/index.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/live-eval.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/mock.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/pytest-plugin.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/quickstart.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/replay.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/scorers.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/anthropic_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/autogen_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/crewai_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/gemini_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/langgraph_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/llamaindex_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/openai_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/pydantic_ai_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/email.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/slack.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/webhook.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/capture/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/capture/recorder.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/doctor_cmd.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/generate_cmd.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/init_cmd.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/conftest.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_anthropic.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_crewai.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_generic.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_langgraph.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_openai.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cloud/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/core/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/_utils.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/hallucination.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/judge_cache.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/jury.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/live.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/llm_judge.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/pairwise.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/rag_scorers.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/scorers/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/statistical.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/golden/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/golden/manager.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/mock/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/mock/llm.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/mock/tool.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/pytest_plugin/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/pytest_plugin/plugin.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/regression/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/regression/detector.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/regression/trend.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/replay/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/replay/engine.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/replay/network_guard.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/sanitize/__init__.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/sanitize/redactor.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/README.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/agent.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/record_cassettes.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/requirements.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/tests/cassettes/auth_middleware_review.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/tests/cassettes/db_pool_refactor_review.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/tests/test_code_review_agent.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/basic_capture.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/.github/workflows/eval.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/README.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/evalcraft-ci.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/evalcraft_gate.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/requirements.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/tests/test_ci_gate.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/example-ci-gate.yml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/README.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/record_cassettes.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/requirements.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/tests/cassettes/equipment_stipend.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/tests/cassettes/remote_work_policy.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/tests/test_rag_workflow.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/workflow.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/README.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/agent.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/build_golden.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/record_cassettes.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/requirements.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/cassettes/damaged_item.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/cassettes/order_tracking.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/cassettes/return_request.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/test_golden.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/test_support_agent.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/replay_and_diff.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/test_with_mocks.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/dm-templates.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/outreach-targets.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/outreach-tracker.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/reddit-langchain.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/reddit-python.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/show-hn.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/tracking-dashboard.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/twitter-thread.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/launch-posts.md +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/package-lock.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/package.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/adapters/gemini.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/adapters/openai.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/adapters/vercel-ai.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/capture/recorder.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/core/models.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/core/types.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/eval/llm-judge.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/eval/rag-scorers.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/eval/scorers.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/index.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/mock/llm.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/mock/tool.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/replay/engine.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/types/externals.d.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/capture.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/mock.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/models.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/replay.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/scorers.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/vercel-ai.test.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tsconfig.json +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/vitest.config.ts +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/requirements.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/validate_with_openai.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/site/logo.png +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/site/robots.txt +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/site/sitemap.xml +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/site/thank-you.html +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/conftest.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/regression/test_trend_detector.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_alerts.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_anthropic_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_doctor.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_gemini_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_generate_tests.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_golden.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_hallucination.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_init.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_judge_cache.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_jury.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_live_eval.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_llm_judge.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_mock_llm.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_mock_tool.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_models.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_network_guard.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_pairwise.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_provenance.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_pydantic_ai_adapter.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_rag_scorers.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_recorder.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_regression.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_replay.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_sanitize.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_scorer_offline.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_scorers.py +0 -0
- {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_statistical.py +0 -0
|
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.3.0] — 2026-06-01
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- **`evalcraft check-stale`** — uses the provenance every cassette already records (model set, prompt hash, timestamp) to flag deterministic tests that have silently gone stale: a recorded model absent from the current `--models` set is **CRITICAL** (non-zero exit, so it works as a CI gate), a drifted `--prompts` hash is a **WARNING**, and recording age is **INFO**. Adds a `StalenessChecker` Python API (`evalcraft.staleness`) and refactors prompt hashing into a shared `compute_prompt_hash` so recorded and recomputed prompt hashes match byte-for-byte. No new dependencies; runs fully offline.
|
|
12
|
+
|
|
13
|
+
## [0.2.1] — 2026-05-30
|
|
14
|
+
|
|
15
|
+
### Fixed
|
|
16
|
+
- **Removed references to the unregistered `evalcraft.dev` domain.** The cloud client and the `evalcraft cloud` CLI no longer default to a non-existent `api.evalcraft.dev` endpoint. There is **no public hosted service** — configure a self-hosted dashboard URL explicitly via `base_url=`, the `EVALCRAFT_BASE_URL` env var, or `~/.evalcraft/config.json`. A cloud call with no URL configured now raises a clear, self-host-pointing error instead of failing against a dead host. Also scrubbed the dead domain from the `evalcraft init` config template and the landing-page contact links.
|
|
17
|
+
|
|
8
18
|
## [0.2.0] — 2026-05-30
|
|
9
19
|
|
|
10
20
|
Ships everything developed since the initial `0.1.0` PyPI upload — a much larger
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: evalcraft
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: VCR for AI agents — record agent runs as cassettes and replay them deterministically in CI for $0.
|
|
5
5
|
Project-URL: Homepage, https://github.com/beyhangl/evalcraft
|
|
6
6
|
Project-URL: Repository, https://github.com/beyhangl/evalcraft
|
|
@@ -69,7 +69,8 @@ Description-Content-Type: text/markdown
|
|
|
69
69
|
<p align="center">
|
|
70
70
|
<img src="site/logo.png" alt="Evalcraft" width="400" />
|
|
71
71
|
</p>
|
|
72
|
-
<p align="center"><strong>
|
|
72
|
+
<p align="center"><strong>Deterministic tests for AI agents — generated from one real run.</strong></p>
|
|
73
|
+
<p align="center">Capture an agent run and evalcraft writes a <strong>pytest</strong> that locks its tool calls, output shape, and cost — then replays it in CI for <strong>$0</strong>. Like VCR for HTTP, but it writes the agent tests for you.</p>
|
|
73
74
|
|
|
74
75
|
[CI](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml)
|
|
75
76
|
[PyPI](https://pypi.org/project/evalcraft/)
|
|
@@ -536,6 +537,7 @@ evalcraft [command] [options]
|
|
|
536
537
|
| `evalcraft sanitize <cassette>` | Redact PII and secrets |
|
|
537
538
|
| `evalcraft doctor` | Diagnose setup issues (deps, API keys, cassettes) |
|
|
538
539
|
| `evalcraft live-eval <current> --baseline <b>` | Gate a live-eval run vs a baseline (catch drift) |
|
|
540
|
+
| `evalcraft check-stale <cassettes> --models <set>` | Fail CI when a cassette's recorded model was retired or swapped |
|
|
539
541
|
|
|
540
542
|
---
|
|
541
543
|
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
<p align="center">
|
|
2
2
|
<img src="site/logo.png" alt="Evalcraft" width="400" />
|
|
3
3
|
</p>
|
|
4
|
-
<p align="center"><strong>
|
|
4
|
+
<p align="center"><strong>Deterministic tests for AI agents — generated from one real run.</strong></p>
|
|
5
|
+
<p align="center">Capture an agent run and evalcraft writes a <strong>pytest</strong> that locks its tool calls, output shape, and cost — then replays it in CI for <strong>$0</strong>. Like VCR for HTTP, but it writes the agent tests for you.</p>
|
|
5
6
|
|
|
6
7
|
[CI](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml)
|
|
7
8
|
[PyPI](https://pypi.org/project/evalcraft/)
|
|
@@ -468,6 +469,7 @@ evalcraft [command] [options]
|
|
|
468
469
|
| `evalcraft sanitize <cassette>` | Redact PII and secrets |
|
|
469
470
|
| `evalcraft doctor` | Diagnose setup issues (deps, API keys, cassettes) |
|
|
470
471
|
| `evalcraft live-eval <current> --baseline <b>` | Gate a live-eval run vs a baseline (catch drift) |
|
|
472
|
+
| `evalcraft check-stale <cassettes> --models <set>` | Fail CI when a cassette's recorded model was retired or swapped |
|
|
471
473
|
|
|
472
474
|
---
|
|
473
475
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# Evalcraft
|
|
2
2
|
|
|
3
|
-
**
|
|
3
|
+
**Deterministic tests for AI agents — generated from one real run.**
|
|
4
|
+
|
|
5
|
+
Capture an agent run and evalcraft writes a **pytest** that locks its tool calls, output shape, and cost — then replays it in CI for **$0**. Like VCR for HTTP, but it writes the agent tests for you.
|
|
4
6
|
|
|
5
7
|
[CI](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml)
|
|
6
8
|
[PyPI](https://pypi.org/project/evalcraft/)
|
|
@@ -7,6 +7,13 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and
|
|
|
7
7
|
|
|
8
8
|
---
|
|
9
9
|
|
|
10
|
+
## [0.3.0] — 2026-06-01
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- `evalcraft check-stale` — detect cassettes recorded against a retired/swapped model (CRITICAL, non-zero exit for CI) or a drifted prompt (WARNING), by activating the provenance each cassette records. See [Check Stale](check-stale.md).
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
10
17
|
## [0.1.0] — 2026-03-05
|
|
11
18
|
|
|
12
19
|
Initial public release of Evalcraft — the pytest for AI agents.
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# Check Stale — catch cassettes recorded against a retired model
|
|
2
|
+
|
|
3
|
+
A replayed cassette is a *deterministic* test: it passes as long as the recording
|
|
4
|
+
is unchanged. But that's exactly the trap — a green replay says nothing about
|
|
5
|
+
whether the recording still mirrors reality. In 2026, models get **hard
|
|
6
|
+
retirement dates** (and providers silently update weights). When the model a
|
|
7
|
+
cassette was recorded against is gone, your test keeps "passing" against a world
|
|
8
|
+
that no longer exists.
|
|
9
|
+
|
|
10
|
+
`evalcraft check-stale` fixes the blind spot by **activating the provenance**
|
|
11
|
+
every cassette already records (model set, prompt hash, timestamp) and turning it
|
|
12
|
+
into a CI gate.
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
evalcraft check-stale tests/cassettes/*.json --models "gpt-5.1,claude-sonnet-4-5"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
staleness check 3 cassette(s)
|
|
20
|
+
|
|
21
|
+
refund_flow
|
|
22
|
+
CRITICAL [model_retired] Recorded model 'gpt-4o' is not in the current model set —
|
|
23
|
+
it may have been retired or swapped. This deterministic test no longer
|
|
24
|
+
mirrors production.
|
|
25
|
+
fresh weather_agent
|
|
26
|
+
fresh search_agent
|
|
27
|
+
|
|
28
|
+
CRITICAL staleness found — re-record the affected cassettes
|
|
29
|
+
# exit code 1
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## What it checks
|
|
33
|
+
|
|
34
|
+
| Finding | Severity | Meaning | Fails CI? |
|
|
35
|
+
|---|---|---|---|
|
|
36
|
+
| `model_retired` | **CRITICAL** | A recorded model is absent from the current `--models` set (retired or swapped) — the cassette may now exercise an API that errors live. | **Yes (exit 1)** |
|
|
37
|
+
| `prompt_drift` | WARNING | The current prompt hash (`--prompts`) differs from the recorded one — still replays, but no longer mirrors the live prompt. | No |
|
|
38
|
+
| `age` | INFO | The recording is older than `--max-age-days`. | No |
|
|
39
|
+
| `no_provenance` | INFO | A legacy / hand-built cassette with no provenance — re-record to enable checks. | No |
|
|
40
|
+
|
|
41
|
+
Only a **retired model** blocks the build — it's the one signal that means "your
|
|
42
|
+
deterministic test is lying." Prompt drift and age are visible but non-blocking.
|
|
43
|
+
|
|
44
|
+
## Flags
|
|
45
|
+
|
|
46
|
+
| Flag | Description |
|
|
47
|
+
|---|---|
|
|
48
|
+
| `--models "a,b,c"` | The model set you ship today. Any recorded model not in this exact set → CRITICAL. Omit to skip the model check. |
|
|
49
|
+
| `--prompts <file>` | A file of your current prompts; its hash is compared to the recorded `prompt_hash`. Omit to skip. |
|
|
50
|
+
| `--max-age-days N` | A recording older than `N` days → INFO. Defaults to `30` when no other check (`--models` or `--prompts`) is given. |
|
|
51
|
+
| `--json` | Emit `{"cassettes": [report, ...]}` (severity strings `CRITICAL`/`WARNING`/`INFO`). Still exits 1 on any CRITICAL. |
|
|
52
|
+
|
|
53
|
+
Matching is **exact and case-sensitive** — a swap from `gpt-5.1` to `gpt-5.1-mini`
|
|
54
|
+
*should* fire. No fuzzy family matching.
|
|
55
|
+
|
|
56
|
+
### `--prompts` file shape
|
|
57
|
+
|
|
58
|
+
The hash basis is identical to what was recorded at capture time, so a file that
|
|
59
|
+
reproduces the prompts matches byte-for-byte. Accepted shapes:
|
|
60
|
+
|
|
61
|
+
```jsonc
|
|
62
|
+
// 1. JSON object with the run's input + per-LLM-call inputs
|
|
63
|
+
{ "input_text": "refund order 123", "llm_inputs": ["system + user prompt...", "..."] }
|
|
64
|
+
|
|
65
|
+
// 2. JSON list → treated as llm_inputs (input_text = "")
|
|
66
|
+
["system + user prompt..."]
|
|
67
|
+
|
|
68
|
+
// 3. anything else → treated as input_text
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Wire it into CI
|
|
72
|
+
|
|
73
|
+
Add it as a fast, deterministic gate next to your other checks — no API key, no
|
|
74
|
+
network:
|
|
75
|
+
|
|
76
|
+
```yaml
|
|
77
|
+
- name: Fail if any cassette was recorded against a retired model
|
|
78
|
+
run: evalcraft check-stale tests/cassettes/*.json --models "${{ vars.CURRENT_MODELS }}"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
When a model is retired, the gate goes red — **re-record the affected cassettes**
|
|
82
|
+
(which refreshes their provenance), review the new behavior, and commit.
|
|
83
|
+
|
|
84
|
+
## Python API
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from evalcraft import StalenessChecker
|
|
88
|
+
from evalcraft.core.models import Cassette
|
|
89
|
+
|
|
90
|
+
report = StalenessChecker(max_age_days=30).check(
|
|
91
|
+
Cassette.load("tests/cassettes/refund_flow.json"),
|
|
92
|
+
current_models=["gpt-5.1", "claude-sonnet-4-5"],
|
|
93
|
+
)
|
|
94
|
+
assert not report.has_critical, report.to_dict()
|
|
95
|
+
```
|
|
@@ -465,6 +465,25 @@ def test_with_generated_fixtures():
|
|
|
465
465
|
|
|
466
466
|
---
|
|
467
467
|
|
|
468
|
+
## `evalcraft check-stale`
|
|
469
|
+
|
|
470
|
+
Flag cassettes that were recorded against a model that has since been retired or swapped, or against a prompt that has since drifted, using the provenance stored in each cassette. Exits non-zero on a retired model so CI can block stale deterministic tests.
|
|
471
|
+
|
|
472
|
+
```bash
|
|
473
|
+
evalcraft check-stale CASSETTES... [OPTIONS]
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
| Option | Description |
|
|
477
|
+
|--------|-------------|
|
|
478
|
+
| `--models "a,b,c"` | Current model set; a recorded model not in it is CRITICAL |
|
|
479
|
+
| `--prompts PATH` | Current prompts file; hash drift vs the recording is a WARNING |
|
|
480
|
+
| `--max-age-days N` | Recorded-at age over N days is INFO (defaults to 30 if no other check is given) |
|
|
481
|
+
| `--json` | Emit JSON; still exits 1 on any CRITICAL |
|
|
482
|
+
|
|
483
|
+
See [Check Stale](check-stale.md) for the full guide.
|
|
484
|
+
|
|
485
|
+
---
|
|
486
|
+
|
|
468
487
|
## Exit codes
|
|
469
488
|
|
|
470
489
|
| Code | Meaning |
|
|
@@ -4,7 +4,7 @@ Record agent runs as cassettes and replay them deterministically in CI for $0;
|
|
|
4
4
|
mock LLMs/tools, score runs, and catch real model drift with live-eval.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.
|
|
7
|
+
__version__ = "0.3.0"
|
|
8
8
|
|
|
9
9
|
from evalcraft.capture.recorder import CaptureContext, capture
|
|
10
10
|
from evalcraft.cloud.client import EvalcraftCloud
|
|
@@ -52,6 +52,7 @@ from evalcraft.mock.llm import MockLLM
|
|
|
52
52
|
from evalcraft.mock.tool import MockTool
|
|
53
53
|
from evalcraft.regression.detector import RegressionDetector, RegressionReport
|
|
54
54
|
from evalcraft.replay.engine import ReplayEngine, replay
|
|
55
|
+
from evalcraft.staleness import StalenessChecker, StalenessFinding, StalenessReport
|
|
55
56
|
|
|
56
57
|
__all__ = [
|
|
57
58
|
"capture",
|
|
@@ -95,5 +96,8 @@ __all__ = [
|
|
|
95
96
|
"GoldenSet",
|
|
96
97
|
"RegressionDetector",
|
|
97
98
|
"RegressionReport",
|
|
99
|
+
"StalenessChecker",
|
|
100
|
+
"StalenessFinding",
|
|
101
|
+
"StalenessReport",
|
|
98
102
|
"EvalcraftCloud",
|
|
99
103
|
]
|
|
@@ -58,7 +58,7 @@ _SPAN_COLORS: dict[SpanKind, str] = {
|
|
|
58
58
|
# ─── CLI root ─────────────────────────────────────────────────────────────────
|
|
59
59
|
|
|
60
60
|
@click.group()
|
|
61
|
-
@click.version_option(version="0.
|
|
61
|
+
@click.version_option(version="0.3.0", prog_name="evalcraft")
|
|
62
62
|
def cli() -> None:
|
|
63
63
|
"""evalcraft — capture, replay, and evaluate AI agent runs."""
|
|
64
64
|
|
|
@@ -885,6 +885,108 @@ def regression_cmd(cassette: str, golden: str, as_json: bool) -> None:
|
|
|
885
885
|
sys.exit(1)
|
|
886
886
|
|
|
887
887
|
|
|
888
|
+
# ─── check-stale ──────────────────────────────────────────────────────────────
|
|
889
|
+
|
|
890
|
+
@cli.command("check-stale")
|
|
891
|
+
@click.argument("cassettes", nargs=-1, required=True,
|
|
892
|
+
type=click.Path(exists=True, dir_okay=False))
|
|
893
|
+
@click.option("--models", "models_csv", default=None,
|
|
894
|
+
help="Comma-separated current model set (e.g. 'gpt-5.1,claude-sonnet-4-5'). "
|
|
895
|
+
"A recorded model absent here is CRITICAL (retired/swapped).")
|
|
896
|
+
@click.option("--prompts", "prompts_path", default=None,
|
|
897
|
+
type=click.Path(exists=True, dir_okay=False),
|
|
898
|
+
help="JSON/text file of current prompts; its hash is compared to the "
|
|
899
|
+
"recorded prompt_hash (WARNING on drift).")
|
|
900
|
+
@click.option("--max-age-days", default=None, type=int,
|
|
901
|
+
help="Recorded-at age over N days is INFO. Defaults to 30 if no other check given.")
|
|
902
|
+
@click.option("--json", "as_json", is_flag=True, help="Output as JSON")
|
|
903
|
+
def check_stale_cmd(
|
|
904
|
+
cassettes: tuple[str, ...],
|
|
905
|
+
models_csv: str | None,
|
|
906
|
+
prompts_path: str | None,
|
|
907
|
+
max_age_days: int | None,
|
|
908
|
+
as_json: bool,
|
|
909
|
+
) -> None:
|
|
910
|
+
"""Flag CASSETTES recorded against a retired model or a drifted prompt.
|
|
911
|
+
|
|
912
|
+
Activates each cassette's recorded provenance (model set, prompt hash,
|
|
913
|
+
timestamp). Exits non-zero if ANY cassette references a model no longer in
|
|
914
|
+
--models, so CI can block deterministic tests that have silently gone stale.
|
|
915
|
+
|
|
916
|
+
Example:
|
|
917
|
+
|
|
918
|
+
evalcraft check-stale tests/cassettes/*.json --models "gpt-5.1,claude-sonnet-4-5"
|
|
919
|
+
"""
|
|
920
|
+
from evalcraft.staleness import StalenessChecker, hash_prompts_file
|
|
921
|
+
|
|
922
|
+
current_models = (
|
|
923
|
+
[m.strip() for m in models_csv.split(",") if m.strip()]
|
|
924
|
+
if models_csv is not None
|
|
925
|
+
else None
|
|
926
|
+
)
|
|
927
|
+
current_prompt_hash = hash_prompts_file(prompts_path) if prompts_path else None
|
|
928
|
+
|
|
929
|
+
# With no explicit check requested, fall back to an age check (default 30d).
|
|
930
|
+
effective_age = max_age_days
|
|
931
|
+
if max_age_days is None and current_models is None and current_prompt_hash is None:
|
|
932
|
+
effective_age = 30
|
|
933
|
+
|
|
934
|
+
checker = StalenessChecker(max_age_days=effective_age)
|
|
935
|
+
|
|
936
|
+
reports = []
|
|
937
|
+
for path in cassettes:
|
|
938
|
+
cassette = _load_cassette(path)
|
|
939
|
+
report = checker.check(
|
|
940
|
+
cassette,
|
|
941
|
+
current_models=current_models,
|
|
942
|
+
current_prompt_hash=current_prompt_hash,
|
|
943
|
+
)
|
|
944
|
+
if not report.cassette_name:
|
|
945
|
+
report.cassette_name = Path(path).stem
|
|
946
|
+
reports.append(report)
|
|
947
|
+
|
|
948
|
+
any_critical = any(r.has_critical for r in reports)
|
|
949
|
+
|
|
950
|
+
if as_json:
|
|
951
|
+
click.echo(json.dumps(
|
|
952
|
+
{"cassettes": [r.to_dict() for r in reports]}, indent=2, default=str
|
|
953
|
+
))
|
|
954
|
+
if any_critical:
|
|
955
|
+
sys.exit(1)
|
|
956
|
+
return
|
|
957
|
+
|
|
958
|
+
_SEV_COLORS = {"CRITICAL": "red", "WARNING": "yellow", "INFO": "blue"}
|
|
959
|
+
|
|
960
|
+
click.echo(
|
|
961
|
+
click.style(" staleness check", fg="cyan", bold=True)
|
|
962
|
+
+ f" {len(reports)} cassette(s)"
|
|
963
|
+
)
|
|
964
|
+
click.echo()
|
|
965
|
+
|
|
966
|
+
total_findings = 0
|
|
967
|
+
for report in reports:
|
|
968
|
+
if not report.has_findings:
|
|
969
|
+
click.echo(click.style(" fresh", fg="green", bold=True) + f" {report.cassette_name}")
|
|
970
|
+
continue
|
|
971
|
+
click.echo(click.style(f" {report.cassette_name}", bold=True))
|
|
972
|
+
for f in report.findings:
|
|
973
|
+
total_findings += 1
|
|
974
|
+
color = _SEV_COLORS.get(f.severity.value, "white")
|
|
975
|
+
icon = click.style(f" {f.severity.value:<8}", fg=color, bold=True)
|
|
976
|
+
click.echo(f"{icon} [{f.category}] {f.message}")
|
|
977
|
+
click.echo()
|
|
978
|
+
|
|
979
|
+
if total_findings == 0:
|
|
980
|
+
click.echo(click.style(" all cassettes fresh", fg="green", bold=True))
|
|
981
|
+
elif any_critical:
|
|
982
|
+
click.echo(click.style(
|
|
983
|
+
" CRITICAL staleness found — re-record the affected cassettes", fg="red", bold=True
|
|
984
|
+
))
|
|
985
|
+
|
|
986
|
+
if any_critical:
|
|
987
|
+
sys.exit(1)
|
|
988
|
+
|
|
989
|
+
|
|
888
990
|
# ─── sanitize ─────────────────────────────────────────────────────────────────
|
|
889
991
|
|
|
890
992
|
@cli.command()
|
|
@@ -1111,8 +1213,8 @@ def cloud() -> None:
|
|
|
1111
1213
|
@cloud.command("login")
|
|
1112
1214
|
@click.option("--api-key", prompt="API key", hide_input=True,
|
|
1113
1215
|
help="Your Evalcraft API key (ec_...)")
|
|
1114
|
-
@click.option("--url", default="
|
|
1115
|
-
help="
|
|
1216
|
+
@click.option("--url", default="",
|
|
1217
|
+
help="Your self-hosted dashboard URL (optional; there is no public hosted service)")
|
|
1116
1218
|
def cloud_login(api_key: str, url: str) -> None:
|
|
1117
1219
|
"""Save your API key to ~/.evalcraft/config.json.
|
|
1118
1220
|
|
|
@@ -34,13 +34,11 @@ from typing import Any
|
|
|
34
34
|
|
|
35
35
|
logger = logging.getLogger(__name__)
|
|
36
36
|
|
|
37
|
-
#
|
|
38
|
-
#
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
# offline and never contacts this endpoint.
|
|
43
|
-
_DEFAULT_BASE_URL = "https://api.evalcraft.dev/v1"
|
|
37
|
+
# There is no public hosted Evalcraft API. Cloud features are optional and target
|
|
38
|
+
# a *self-hosted* dashboard (see the ``dashboard/`` directory); configure the
|
|
39
|
+
# endpoint explicitly via the ``base_url`` argument, the ``EVALCRAFT_BASE_URL``
|
|
40
|
+
# environment variable, or ``~/.evalcraft/config.json``. The core capture /
|
|
41
|
+
# replay / eval workflow runs fully offline and never contacts any endpoint.
|
|
44
42
|
_CONFIG_DIR = Path.home() / ".evalcraft"
|
|
45
43
|
_CONFIG_FILE = _CONFIG_DIR / "config.json"
|
|
46
44
|
_QUEUE_DIR = _CONFIG_DIR / "queue"
|
|
@@ -104,7 +102,9 @@ class EvalcraftCloud:
|
|
|
104
102
|
api_key: Bearer token (``ec_...``). If None, reads from
|
|
105
103
|
``~/.evalcraft/config.json`` or the ``EVALCRAFT_API_KEY``
|
|
106
104
|
environment variable.
|
|
107
|
-
base_url:
|
|
105
|
+
base_url: URL of your self-hosted Evalcraft dashboard. Required for any
|
|
106
|
+
cloud call — there is no public hosted service. Falls back to the
|
|
107
|
+
``EVALCRAFT_BASE_URL`` env var, then ``~/.evalcraft/config.json``.
|
|
108
108
|
timeout: Request timeout in seconds (default 30).
|
|
109
109
|
max_retries: Maximum number of retry attempts for transient errors
|
|
110
110
|
(default 3). Uses exponential backoff with jitter.
|
|
@@ -115,13 +115,13 @@ class EvalcraftCloud:
|
|
|
115
115
|
def __init__(
|
|
116
116
|
self,
|
|
117
117
|
api_key: str | None = None,
|
|
118
|
-
base_url: str =
|
|
118
|
+
base_url: str | None = None,
|
|
119
119
|
timeout: int = 30,
|
|
120
120
|
max_retries: int = 3,
|
|
121
121
|
queue_dir: Path | None = None,
|
|
122
122
|
):
|
|
123
123
|
self.api_key = api_key or self._load_api_key()
|
|
124
|
-
self.base_url = base_url.rstrip("/")
|
|
124
|
+
self.base_url = (base_url or self._load_base_url()).rstrip("/")
|
|
125
125
|
self.timeout = timeout
|
|
126
126
|
self.max_retries = max_retries
|
|
127
127
|
self.queue_dir = queue_dir or _QUEUE_DIR
|
|
@@ -237,8 +237,8 @@ class EvalcraftCloud:
|
|
|
237
237
|
# ──────────────────────────────────────────
|
|
238
238
|
|
|
239
239
|
@staticmethod
|
|
240
|
-
def save_config(api_key: str, base_url: str =
|
|
241
|
-
"""Persist API key and
|
|
240
|
+
def save_config(api_key: str, base_url: str = "") -> None:
|
|
241
|
+
"""Persist the API key (and optional dashboard URL) to ``~/.evalcraft/config.json``."""
|
|
242
242
|
_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
|
|
243
243
|
config: dict = {}
|
|
244
244
|
if _CONFIG_FILE.exists():
|
|
@@ -247,7 +247,8 @@ class EvalcraftCloud:
|
|
|
247
247
|
except Exception:
|
|
248
248
|
pass
|
|
249
249
|
config["api_key"] = api_key
|
|
250
|
-
|
|
250
|
+
if base_url:
|
|
251
|
+
config["base_url"] = base_url
|
|
251
252
|
_CONFIG_FILE.write_text(json.dumps(config, indent=2))
|
|
252
253
|
_CONFIG_FILE.chmod(0o600)
|
|
253
254
|
|
|
@@ -288,6 +289,15 @@ class EvalcraftCloud:
|
|
|
288
289
|
config = self.load_config()
|
|
289
290
|
return str(config.get("api_key", ""))
|
|
290
291
|
|
|
292
|
+
def _load_base_url(self) -> str:
|
|
293
|
+
"""Resolve the dashboard base URL from env or config (empty if unset)."""
|
|
294
|
+
import os
|
|
295
|
+
env_url = os.environ.get("EVALCRAFT_BASE_URL", "")
|
|
296
|
+
if env_url:
|
|
297
|
+
return env_url
|
|
298
|
+
config = self.load_config()
|
|
299
|
+
return str(config.get("base_url", ""))
|
|
300
|
+
|
|
291
301
|
def _request(
|
|
292
302
|
self,
|
|
293
303
|
method: str,
|
|
@@ -307,11 +317,18 @@ class EvalcraftCloud:
|
|
|
307
317
|
Raises:
|
|
308
318
|
CloudUploadError: After max_retries exhausted or on 4xx errors.
|
|
309
319
|
"""
|
|
320
|
+
if not self.base_url:
|
|
321
|
+
raise CloudUploadError(
|
|
322
|
+
"No Evalcraft dashboard URL is configured. There is no public "
|
|
323
|
+
"hosted service — point the client at your own self-hosted "
|
|
324
|
+
"dashboard (see the dashboard/ directory) via base_url=..., the "
|
|
325
|
+
"EVALCRAFT_BASE_URL env var, or ~/.evalcraft/config.json."
|
|
326
|
+
)
|
|
310
327
|
url = f"{self.base_url}{path}"
|
|
311
328
|
body: bytes | None = None
|
|
312
329
|
headers: dict[str, str] = {
|
|
313
330
|
"Accept": "application/json",
|
|
314
|
-
"User-Agent": "evalcraft-sdk/0.
|
|
331
|
+
"User-Agent": "evalcraft-sdk/0.3.0",
|
|
315
332
|
}
|
|
316
333
|
if self.api_key:
|
|
317
334
|
headers["Authorization"] = f"Bearer {self.api_key}"
|
|
@@ -165,6 +165,21 @@ class Provenance:
|
|
|
165
165
|
)
|
|
166
166
|
|
|
167
167
|
|
|
168
|
+
def compute_prompt_hash(input_text: str, llm_inputs: list[Any]) -> str:
|
|
169
|
+
"""Hash the prompt surface of a run — the user input plus each LLM span's input.
|
|
170
|
+
|
|
171
|
+
Used both when recording provenance (:meth:`Cassette.capture_provenance`) and
|
|
172
|
+
when checking staleness, so a recorded hash and a recomputed one match
|
|
173
|
+
byte-for-byte. List order is significant; only dict keys are sorted.
|
|
174
|
+
"""
|
|
175
|
+
basis = json.dumps(
|
|
176
|
+
{"input_text": input_text, "llm_inputs": list(llm_inputs)},
|
|
177
|
+
sort_keys=True,
|
|
178
|
+
default=str,
|
|
179
|
+
)
|
|
180
|
+
return hashlib.sha256(basis.encode()).hexdigest()[:16]
|
|
181
|
+
|
|
182
|
+
|
|
168
183
|
@dataclass
|
|
169
184
|
class Cassette:
|
|
170
185
|
"""A recorded agent run — the fundamental unit of Evalcraft.
|
|
@@ -230,15 +245,9 @@ class Cassette:
|
|
|
230
245
|
|
|
231
246
|
llm_spans = self.get_llm_calls()
|
|
232
247
|
models = sorted({s.model for s in llm_spans if s.model})
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
"input_text": self.input_text,
|
|
236
|
-
"llm_inputs": [s.input for s in llm_spans],
|
|
237
|
-
},
|
|
238
|
-
sort_keys=True,
|
|
239
|
-
default=str,
|
|
248
|
+
prompt_hash = compute_prompt_hash(
|
|
249
|
+
self.input_text, [s.input for s in llm_spans]
|
|
240
250
|
)
|
|
241
|
-
prompt_hash = hashlib.sha256(basis.encode()).hexdigest()[:16]
|
|
242
251
|
|
|
243
252
|
self.provenance = Provenance(
|
|
244
253
|
recorded_at=time.time(),
|
|
@@ -288,7 +297,7 @@ class Cassette:
|
|
|
288
297
|
self.compute_metrics()
|
|
289
298
|
self.compute_fingerprint()
|
|
290
299
|
return {
|
|
291
|
-
"evalcraft_version": "0.
|
|
300
|
+
"evalcraft_version": "0.3.0",
|
|
292
301
|
"cassette": {
|
|
293
302
|
"id": self.id,
|
|
294
303
|
"name": self.name,
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Staleness detection — flag cassettes recorded against retired models / drifted prompts.
|
|
2
|
+
|
|
3
|
+
A cassette's recorded provenance (model set, prompt hash, timestamp) is only
|
|
4
|
+
useful if something acts on it. This module does: it turns that provenance into
|
|
5
|
+
actionable CI signal so a deterministic test can't silently keep passing against
|
|
6
|
+
a model that no longer exists.
|
|
7
|
+
|
|
8
|
+
from evalcraft.staleness import StalenessChecker
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from evalcraft.core.models import compute_prompt_hash
|
|
12
|
+
from evalcraft.staleness.checker import (
|
|
13
|
+
StalenessChecker,
|
|
14
|
+
StalenessFinding,
|
|
15
|
+
StalenessReport,
|
|
16
|
+
hash_prompts_file,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"StalenessChecker",
|
|
21
|
+
"StalenessFinding",
|
|
22
|
+
"StalenessReport",
|
|
23
|
+
"compute_prompt_hash",
|
|
24
|
+
"hash_prompts_file",
|
|
25
|
+
]
|