evalcraft 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (295)
  1. {evalcraft-0.2.0 → evalcraft-0.3.0}/CHANGELOG.md +10 -0
  2. {evalcraft-0.2.0 → evalcraft-0.3.0}/PKG-INFO +4 -2
  3. {evalcraft-0.2.0 → evalcraft-0.3.0}/README.md +3 -1
  4. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/index.md +3 -1
  5. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/changelog.md +7 -0
  6. evalcraft-0.3.0/docs/user-guide/check-stale.md +95 -0
  7. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/cli.md +19 -0
  8. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/__init__.py +5 -1
  9. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/main.py +105 -3
  10. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/evalcraft.toml +1 -1
  11. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cloud/client.py +31 -14
  12. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/core/models.py +18 -9
  13. evalcraft-0.3.0/evalcraft/staleness/__init__.py +25 -0
  14. evalcraft-0.3.0/evalcraft/staleness/checker.py +232 -0
  15. {evalcraft-0.2.0 → evalcraft-0.3.0}/mkdocs.yml +1 -0
  16. {evalcraft-0.2.0 → evalcraft-0.3.0}/pyproject.toml +1 -1
  17. {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/seed_demo.py +3 -3
  18. {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/smoke_test.py +1 -1
  19. evalcraft-0.3.0/site/CNAME +1 -0
  20. {evalcraft-0.2.0 → evalcraft-0.3.0}/site/index.html +10 -10
  21. evalcraft-0.3.0/tests/test_check_stale.py +161 -0
  22. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_cloud.py +18 -7
  23. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_e2e_pipeline.py +1 -1
  24. evalcraft-0.2.0/site/CNAME +0 -1
  25. {evalcraft-0.2.0 → evalcraft-0.3.0}/.env.example +0 -0
  26. {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  27. {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  28. {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/design-partner-feedback.md +0 -0
  29. {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  30. {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/workflows/ci.yml +0 -0
  31. {evalcraft-0.2.0 → evalcraft-0.3.0}/.github/workflows/publish.yml +0 -0
  32. {evalcraft-0.2.0 → evalcraft-0.3.0}/.gitignore +0 -0
  33. {evalcraft-0.2.0 → evalcraft-0.3.0}/CONTRIBUTING.md +0 -0
  34. {evalcraft-0.2.0 → evalcraft-0.3.0}/LICENSE +0 -0
  35. {evalcraft-0.2.0 → evalcraft-0.3.0}/Makefile +0 -0
  36. {evalcraft-0.2.0 → evalcraft-0.3.0}/action.yml +0 -0
  37. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/Dockerfile +0 -0
  38. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/env.py +0 -0
  39. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/script.py.mako +0 -0
  40. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/versions/.gitkeep +0 -0
  41. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic/versions/001_initial.py +0 -0
  42. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/alembic.ini +0 -0
  43. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/__init__.py +0 -0
  44. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/__init__.py +0 -0
  45. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/auth.py +0 -0
  46. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/cassettes.py +0 -0
  47. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/golden_sets.py +0 -0
  48. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/projects.py +0 -0
  49. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/regressions.py +0 -0
  50. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/api/webhooks.py +0 -0
  51. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/cache.py +0 -0
  52. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/config.py +0 -0
  53. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/database.py +0 -0
  54. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/logging_config.py +0 -0
  55. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/main.py +0 -0
  56. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/__init__.py +0 -0
  57. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/cassette.py +0 -0
  58. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/golden_set.py +0 -0
  59. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/project.py +0 -0
  60. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/regression.py +0 -0
  61. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/models/user.py +0 -0
  62. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/schemas/__init__.py +0 -0
  63. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/schemas/api.py +0 -0
  64. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/alert_service.py +0 -0
  65. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/analytics_service.py +0 -0
  66. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/cassette_service.py +0 -0
  67. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/app/services/regression_service.py +0 -0
  68. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/entrypoint.sh +0 -0
  69. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/requirements.txt +0 -0
  70. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/conftest.py +0 -0
  71. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_auth.py +0 -0
  72. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_cassettes.py +0 -0
  73. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_golden_sets.py +0 -0
  74. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_projects.py +0 -0
  75. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_regressions.py +0 -0
  76. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/backend/tests/test_webhooks.py +0 -0
  77. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/.gitignore +0 -0
  78. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/Dockerfile +0 -0
  79. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/README.md +0 -0
  80. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/eslint.config.js +0 -0
  81. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/index.html +0 -0
  82. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/nginx.conf +0 -0
  83. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/package-lock.json +0 -0
  84. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/package.json +0 -0
  85. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/public/vite.svg +0 -0
  86. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/App.css +0 -0
  87. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/App.tsx +0 -0
  88. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/__tests__/ErrorBoundary.test.tsx +0 -0
  89. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/__tests__/Skeleton.test.tsx +0 -0
  90. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/__tests__/api.test.ts +0 -0
  91. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/assets/react.svg +0 -0
  92. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/CreateGoldenSetModal.tsx +0 -0
  93. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/CreateProjectModal.tsx +0 -0
  94. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/ErrorBoundary.tsx +0 -0
  95. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Layout.tsx +0 -0
  96. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/MetricCard.tsx +0 -0
  97. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Sidebar.tsx +0 -0
  98. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Skeleton.tsx +0 -0
  99. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/StatusBadge.tsx +0 -0
  100. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/Toast.tsx +0 -0
  101. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/components/UploadCassetteModal.tsx +0 -0
  102. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/context/AuthContext.tsx +0 -0
  103. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/hooks/useApi.ts +0 -0
  104. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/index.css +0 -0
  105. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/main.tsx +0 -0
  106. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Analytics.tsx +0 -0
  107. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/CassetteDetail.tsx +0 -0
  108. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Cassettes.tsx +0 -0
  109. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Dashboard.tsx +0 -0
  110. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/GoldenSetDetail.tsx +0 -0
  111. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/GoldenSets.tsx +0 -0
  112. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Login.tsx +0 -0
  113. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Regressions.tsx +0 -0
  114. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/pages/Settings.tsx +0 -0
  115. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/services/api.ts +0 -0
  116. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/src/test-setup.ts +0 -0
  117. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/tsconfig.app.json +0 -0
  118. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/tsconfig.json +0 -0
  119. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/tsconfig.node.json +0 -0
  120. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/vite.config.ts +0 -0
  121. {evalcraft-0.2.0 → evalcraft-0.3.0}/dashboard/frontend/vitest.config.ts +0 -0
  122. {evalcraft-0.2.0 → evalcraft-0.3.0}/docker-compose.production.yml +0 -0
  123. {evalcraft-0.2.0 → evalcraft-0.3.0}/docker-compose.yml +0 -0
  124. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/logo.png +0 -0
  125. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/anthropic.md +0 -0
  126. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/crewai.md +0 -0
  127. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/langgraph.md +0 -0
  128. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/adapters/openai.md +0 -0
  129. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/capture.md +0 -0
  130. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/ci-cd.md +0 -0
  131. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/concepts.md +0 -0
  132. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/five-minute-case-study.md +0 -0
  133. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/index.md +0 -0
  134. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/live-eval.md +0 -0
  135. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/mock.md +0 -0
  136. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/pytest-plugin.md +0 -0
  137. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/quickstart.md +0 -0
  138. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/replay.md +0 -0
  139. {evalcraft-0.2.0 → evalcraft-0.3.0}/docs/user-guide/scorers.md +0 -0
  140. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/__init__.py +0 -0
  141. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/anthropic_adapter.py +0 -0
  142. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/autogen_adapter.py +0 -0
  143. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/crewai_adapter.py +0 -0
  144. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/gemini_adapter.py +0 -0
  145. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/langgraph_adapter.py +0 -0
  146. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/llamaindex_adapter.py +0 -0
  147. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/openai_adapter.py +0 -0
  148. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/adapters/pydantic_ai_adapter.py +0 -0
  149. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/__init__.py +0 -0
  150. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/email.py +0 -0
  151. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/slack.py +0 -0
  152. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/alerts/webhook.py +0 -0
  153. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/capture/__init__.py +0 -0
  154. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/capture/recorder.py +0 -0
  155. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/__init__.py +0 -0
  156. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/doctor_cmd.py +0 -0
  157. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/generate_cmd.py +0 -0
  158. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/init_cmd.py +0 -0
  159. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/__init__.py +0 -0
  160. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/conftest.py +0 -0
  161. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_anthropic.py +0 -0
  162. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_crewai.py +0 -0
  163. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_generic.py +0 -0
  164. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_langgraph.py +0 -0
  165. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cli/templates/test_agent_openai.py +0 -0
  166. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/cloud/__init__.py +0 -0
  167. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/core/__init__.py +0 -0
  168. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/__init__.py +0 -0
  169. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/_utils.py +0 -0
  170. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/hallucination.py +0 -0
  171. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/judge_cache.py +0 -0
  172. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/jury.py +0 -0
  173. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/live.py +0 -0
  174. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/llm_judge.py +0 -0
  175. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/pairwise.py +0 -0
  176. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/rag_scorers.py +0 -0
  177. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/scorers/__init__.py +0 -0
  178. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/eval/statistical.py +0 -0
  179. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/golden/__init__.py +0 -0
  180. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/golden/manager.py +0 -0
  181. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/mock/__init__.py +0 -0
  182. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/mock/llm.py +0 -0
  183. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/mock/tool.py +0 -0
  184. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/pytest_plugin/__init__.py +0 -0
  185. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/pytest_plugin/plugin.py +0 -0
  186. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/regression/__init__.py +0 -0
  187. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/regression/detector.py +0 -0
  188. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/regression/trend.py +0 -0
  189. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/replay/__init__.py +0 -0
  190. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/replay/engine.py +0 -0
  191. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/replay/network_guard.py +0 -0
  192. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/sanitize/__init__.py +0 -0
  193. {evalcraft-0.2.0 → evalcraft-0.3.0}/evalcraft/sanitize/redactor.py +0 -0
  194. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/README.md +0 -0
  195. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/agent.py +0 -0
  196. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/record_cassettes.py +0 -0
  197. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/requirements.txt +0 -0
  198. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/tests/cassettes/auth_middleware_review.json +0 -0
  199. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/tests/cassettes/db_pool_refactor_review.json +0 -0
  200. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/anthropic-agent/tests/test_code_review_agent.py +0 -0
  201. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/basic_capture.py +0 -0
  202. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/.github/workflows/eval.yml +0 -0
  203. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/README.md +0 -0
  204. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/evalcraft-ci.yml +0 -0
  205. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/evalcraft_gate.py +0 -0
  206. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/requirements.txt +0 -0
  207. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/ci-pipeline/tests/test_ci_gate.py +0 -0
  208. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/example-ci-gate.yml +0 -0
  209. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/README.md +0 -0
  210. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/record_cassettes.py +0 -0
  211. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/requirements.txt +0 -0
  212. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/tests/cassettes/equipment_stipend.json +0 -0
  213. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/tests/cassettes/remote_work_policy.json +0 -0
  214. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/tests/test_rag_workflow.py +0 -0
  215. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/langgraph-workflow/workflow.py +0 -0
  216. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/README.md +0 -0
  217. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/agent.py +0 -0
  218. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/build_golden.py +0 -0
  219. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/record_cassettes.py +0 -0
  220. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/requirements.txt +0 -0
  221. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/cassettes/damaged_item.json +0 -0
  222. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/cassettes/order_tracking.json +0 -0
  223. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/cassettes/return_request.json +0 -0
  224. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/test_golden.py +0 -0
  225. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/openai-agent/tests/test_support_agent.py +0 -0
  226. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/replay_and_diff.py +0 -0
  227. {evalcraft-0.2.0 → evalcraft-0.3.0}/examples/test_with_mocks.py +0 -0
  228. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/dm-templates.md +0 -0
  229. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/outreach-targets.md +0 -0
  230. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/outreach-tracker.md +0 -0
  231. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/reddit-langchain.md +0 -0
  232. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/reddit-python.md +0 -0
  233. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/show-hn.md +0 -0
  234. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/tracking-dashboard.md +0 -0
  235. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch/twitter-thread.md +0 -0
  236. {evalcraft-0.2.0 → evalcraft-0.3.0}/launch-posts.md +0 -0
  237. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/package-lock.json +0 -0
  238. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/package.json +0 -0
  239. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/adapters/gemini.ts +0 -0
  240. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/adapters/openai.ts +0 -0
  241. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/adapters/vercel-ai.ts +0 -0
  242. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/capture/recorder.ts +0 -0
  243. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/core/models.ts +0 -0
  244. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/core/types.ts +0 -0
  245. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/eval/llm-judge.ts +0 -0
  246. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/eval/rag-scorers.ts +0 -0
  247. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/eval/scorers.ts +0 -0
  248. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/index.ts +0 -0
  249. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/mock/llm.ts +0 -0
  250. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/mock/tool.ts +0 -0
  251. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/replay/engine.ts +0 -0
  252. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/src/types/externals.d.ts +0 -0
  253. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/capture.test.ts +0 -0
  254. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/mock.test.ts +0 -0
  255. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/models.test.ts +0 -0
  256. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/replay.test.ts +0 -0
  257. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/scorers.test.ts +0 -0
  258. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tests/vercel-ai.test.ts +0 -0
  259. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/tsconfig.json +0 -0
  260. {evalcraft-0.2.0 → evalcraft-0.3.0}/packages/evalcraft-js/vitest.config.ts +0 -0
  261. {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/requirements.txt +0 -0
  262. {evalcraft-0.2.0 → evalcraft-0.3.0}/scripts/validate_with_openai.py +0 -0
  263. {evalcraft-0.2.0 → evalcraft-0.3.0}/site/logo.png +0 -0
  264. {evalcraft-0.2.0 → evalcraft-0.3.0}/site/robots.txt +0 -0
  265. {evalcraft-0.2.0 → evalcraft-0.3.0}/site/sitemap.xml +0 -0
  266. {evalcraft-0.2.0 → evalcraft-0.3.0}/site/thank-you.html +0 -0
  267. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/conftest.py +0 -0
  268. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/regression/test_trend_detector.py +0 -0
  269. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_alerts.py +0 -0
  270. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_anthropic_adapter.py +0 -0
  271. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_doctor.py +0 -0
  272. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_gemini_adapter.py +0 -0
  273. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_generate_tests.py +0 -0
  274. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_golden.py +0 -0
  275. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_hallucination.py +0 -0
  276. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_init.py +0 -0
  277. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_judge_cache.py +0 -0
  278. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_jury.py +0 -0
  279. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_live_eval.py +0 -0
  280. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_llm_judge.py +0 -0
  281. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_mock_llm.py +0 -0
  282. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_mock_tool.py +0 -0
  283. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_models.py +0 -0
  284. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_network_guard.py +0 -0
  285. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_pairwise.py +0 -0
  286. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_provenance.py +0 -0
  287. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_pydantic_ai_adapter.py +0 -0
  288. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_rag_scorers.py +0 -0
  289. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_recorder.py +0 -0
  290. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_regression.py +0 -0
  291. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_replay.py +0 -0
  292. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_sanitize.py +0 -0
  293. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_scorer_offline.py +0 -0
  294. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_scorers.py +0 -0
  295. {evalcraft-0.2.0 → evalcraft-0.3.0}/tests/test_statistical.py +0 -0
@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.3.0] — 2026-06-01
9
+
10
+ ### Added
11
+ - **`evalcraft check-stale`** — activates the provenance every cassette already records (model set, prompt hash, timestamp) to flag deterministic tests that have silently gone stale: a recorded model absent from the current `--models` set is **CRITICAL** (non-zero exit — a CI gate), a drifted `--prompts` hash is a **WARNING**, and age is **INFO**. Adds a `StalenessChecker` Python API (`evalcraft.staleness`) and refactors a shared `compute_prompt_hash` so recorded and recomputed prompt hashes match byte-for-byte. No new dependencies; runs fully offline.
12
+
13
+ ## [0.2.1] — 2026-05-30
14
+
15
+ ### Fixed
16
+ - **Removed references to the unregistered `evalcraft.dev` domain.** The cloud client and the `evalcraft cloud` CLI no longer default to a non-existent `api.evalcraft.dev` endpoint. There is **no public hosted service** — configure a self-hosted dashboard URL explicitly via `base_url=`, the `EVALCRAFT_BASE_URL` env var, or `~/.evalcraft/config.json`. A cloud call with no URL configured now raises a clear, self-host-pointing error instead of failing against a dead host. Also scrubbed the dead domain from the `evalcraft init` config template and the landing-page contact links.
17
+
8
18
  ## [0.2.0] — 2026-05-30
9
19
 
10
20
  Ships everything developed since the initial `0.1.0` PyPI upload — a much larger
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evalcraft
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: VCR for AI agents — record agent runs as cassettes and replay them deterministically in CI for $0.
5
5
  Project-URL: Homepage, https://github.com/beyhangl/evalcraft
6
6
  Project-URL: Repository, https://github.com/beyhangl/evalcraft
@@ -69,7 +69,8 @@ Description-Content-Type: text/markdown
69
69
  <p align="center">
70
70
  <img src="site/logo.png" alt="Evalcraft" width="400" />
71
71
  </p>
72
- <p align="center"><strong>VCR for AI agents.</strong> Record an agent run once, replay it deterministically in CI for <strong>$0</strong> fast regression tests for your agent's plumbing (tool calls, control flow, cost &amp; latency budgets), plus live-eval to catch real model drift.</p>
72
+ <p align="center"><strong>Deterministic tests for AI agents — generated from one real run.</strong></p>
73
+ <p align="center">Capture an agent run and evalcraft writes a <strong>pytest</strong> that locks its tool calls, output shape, and cost — then replays it in CI for <strong>$0</strong>. Like VCR for HTTP, but it writes the agent tests for you.</p>
73
74
 
74
75
  [![CI](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml/badge.svg)](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml)
75
76
  [![PyPI](https://img.shields.io/pypi/v/evalcraft)](https://pypi.org/project/evalcraft/)
@@ -536,6 +537,7 @@ evalcraft [command] [options]
536
537
  | `evalcraft sanitize <cassette>` | Redact PII and secrets |
537
538
  | `evalcraft doctor` | Diagnose setup issues (deps, API keys, cassettes) |
538
539
  | `evalcraft live-eval <current> --baseline <b>` | Gate a live-eval run vs a baseline (catch drift) |
540
+ | `evalcraft check-stale <cassettes> --models <set>` | Fail CI when a cassette's recorded model was retired or swapped |
539
541
 
540
542
  ---
541
543
 
@@ -1,7 +1,8 @@
1
1
  <p align="center">
2
2
  <img src="site/logo.png" alt="Evalcraft" width="400" />
3
3
  </p>
4
- <p align="center"><strong>VCR for AI agents.</strong> Record an agent run once, replay it deterministically in CI for <strong>$0</strong> fast regression tests for your agent's plumbing (tool calls, control flow, cost &amp; latency budgets), plus live-eval to catch real model drift.</p>
4
+ <p align="center"><strong>Deterministic tests for AI agents — generated from one real run.</strong></p>
5
+ <p align="center">Capture an agent run and evalcraft writes a <strong>pytest</strong> that locks its tool calls, output shape, and cost — then replays it in CI for <strong>$0</strong>. Like VCR for HTTP, but it writes the agent tests for you.</p>
5
6
 
6
7
  [![CI](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml/badge.svg)](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml)
7
8
  [![PyPI](https://img.shields.io/pypi/v/evalcraft)](https://pypi.org/project/evalcraft/)
@@ -468,6 +469,7 @@ evalcraft [command] [options]
468
469
  | `evalcraft sanitize <cassette>` | Redact PII and secrets |
469
470
  | `evalcraft doctor` | Diagnose setup issues (deps, API keys, cassettes) |
470
471
  | `evalcraft live-eval <current> --baseline <b>` | Gate a live-eval run vs a baseline (catch drift) |
472
+ | `evalcraft check-stale <cassettes> --models <set>` | Fail CI when a cassette's recorded model was retired or swapped |
471
473
 
472
474
  ---
473
475
 
@@ -1,6 +1,8 @@
1
1
  # Evalcraft
2
2
 
3
- **VCR for AI agents.** Record an agent run once, replay it deterministically in CI for **$0** fast regression tests for your agent's plumbing (tool calls, control flow, cost & latency budgets), plus live-eval to catch real model drift.
3
+ **Deterministic tests for AI agents — generated from one real run.**
4
+
5
+ Capture an agent run and evalcraft writes a **pytest** that locks its tool calls, output shape, and cost — then replays it in CI for **$0**. Like VCR for HTTP, but it writes the agent tests for you.
4
6
 
5
7
  [![CI](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml/badge.svg)](https://github.com/beyhangl/evalcraft/actions/workflows/ci.yml)
6
8
  [![PyPI](https://img.shields.io/pypi/v/evalcraft)](https://pypi.org/project/evalcraft/)
@@ -7,6 +7,13 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and
7
7
 
8
8
  ---
9
9
 
10
+ ## [0.3.0] — 2026-06-01
11
+
12
+ ### Added
13
+ - `evalcraft check-stale` — detect cassettes recorded against a retired/swapped model (CRITICAL, non-zero exit for CI) or a drifted prompt (WARNING), by activating the provenance each cassette records. See [Check Stale](check-stale.md).
14
+
15
+ ---
16
+
10
17
  ## [0.1.0] — 2026-03-05
11
18
 
12
19
  Initial public release of Evalcraft — the pytest for AI agents.
@@ -0,0 +1,95 @@
1
+ # Check Stale — catch cassettes recorded against a retired model
2
+
3
+ A replayed cassette is a *deterministic* test: it passes as long as the recording
4
+ is unchanged. But that's exactly the trap — a green replay says nothing about
5
+ whether the recording still mirrors reality. In 2026, models get **hard
6
+ retirement dates** (and providers silently update weights). When the model a
7
+ cassette was recorded against is gone, your test keeps "passing" against a world
8
+ that no longer exists.
9
+
10
+ `evalcraft check-stale` fixes the blind spot by **activating the provenance**
11
+ every cassette already records (model set, prompt hash, timestamp) and turning it
12
+ into a CI gate.
13
+
14
+ ```bash
15
+ evalcraft check-stale tests/cassettes/*.json --models "gpt-5.1,claude-sonnet-4-5"
16
+ ```
17
+
18
+ ```
19
+ staleness check 3 cassette(s)
20
+
21
+ refund_flow
22
+ CRITICAL [model_retired] Recorded model 'gpt-4o' is not in the current model set —
23
+ it may have been retired or swapped. This deterministic test no longer
24
+ mirrors production.
25
+ fresh weather_agent
26
+ fresh search_agent
27
+
28
+ CRITICAL staleness found — re-record the affected cassettes
29
+ # exit code 1
30
+ ```
31
+
32
+ ## What it checks
33
+
34
+ | Finding | Severity | Meaning | Fails CI? |
35
+ |---|---|---|---|
36
+ | `model_retired` | **CRITICAL** | A recorded model is absent from the current `--models` set (retired or swapped) — the cassette may now exercise an API that errors live. | **Yes (exit 1)** |
37
+ | `prompt_drift` | WARNING | The current prompt hash (`--prompts`) differs from the recorded one — still replays, but no longer mirrors the live prompt. | No |
38
+ | `age` | INFO | The recording is older than `--max-age-days`. | No |
39
+ | `no_provenance` | INFO | A legacy / hand-built cassette with no provenance — re-record to enable checks. | No |
40
+
41
+ Only a **retired model** blocks the build — it's the one signal that means "your
42
+ deterministic test is lying." Prompt drift and age are visible but non-blocking.
43
+
44
+ ## Flags
45
+
46
+ | Flag | Description |
47
+ |---|---|
48
+ | `--models "a,b,c"` | The model set you ship today. Any recorded model not in this exact set → CRITICAL. Omit to skip the model check. |
49
+ | `--prompts <file>` | A file of your current prompts; its hash is compared to the recorded `prompt_hash`. Omit to skip. |
50
+ | `--max-age-days N` | Recorded-at age over `N` days → INFO. Defaults to `30` if no other check is given. |
51
+ | `--json` | Emit `{"cassettes": [report, ...]}` (severity strings `CRITICAL`/`WARNING`/`INFO`). Still exits 1 on any CRITICAL. |
52
+
53
+ Matching is **exact and case-sensitive** — a swap from `gpt-5.1` to `gpt-5.1-mini`
54
+ *should* fire. No fuzzy family matching.
55
+
56
+ ### `--prompts` file shape
57
+
58
+ The hash is computed the same way as at capture time, so a file that reproduces
59
+ the recorded prompts verbatim yields the same hash. Accepted shapes:
60
+
61
+ ```jsonc
62
+ // 1. JSON object with the run's input + per-LLM-call inputs
63
+ { "input_text": "refund order 123", "llm_inputs": ["system + user prompt...", "..."] }
64
+
65
+ // 2. JSON list → treated as llm_inputs (input_text = "")
66
+ ["system + user prompt..."]
67
+
68
+ // 3. anything else → treated as input_text
69
+ ```
70
+
71
+ ## Wire it into CI
72
+
73
+ Add it as a fast, deterministic gate next to your other checks — no API key, no
74
+ network:
75
+
76
+ ```yaml
77
+ - name: Fail if any cassette was recorded against a retired model
78
+ run: evalcraft check-stale tests/cassettes/*.json --models "${{ vars.CURRENT_MODELS }}"
79
+ ```
80
+
81
+ When a model is retired, the gate goes red — **re-record the affected cassettes**
82
+ (which refreshes their provenance), review the new behavior, and commit.
83
+
84
+ ## Python API
85
+
86
+ ```python
87
+ from evalcraft import StalenessChecker
88
+ from evalcraft.core.models import Cassette
89
+
90
+ report = StalenessChecker(max_age_days=30).check(
91
+ Cassette.load("tests/cassettes/refund_flow.json"),
92
+ current_models=["gpt-5.1", "claude-sonnet-4-5"],
93
+ )
94
+ assert not report.has_critical, report.to_dict()
95
+ ```
@@ -465,6 +465,25 @@ def test_with_generated_fixtures():
465
465
 
466
466
  ---
467
467
 
468
+ ## `evalcraft check-stale`
469
+
470
+ Flag cassettes recorded against a model that has since been retired or swapped, or whose prompt has drifted, using the provenance each cassette records. Exits non-zero on a retired model so CI can block stale deterministic tests.
471
+
472
+ ```bash
473
+ evalcraft check-stale CASSETTES... [OPTIONS]
474
+ ```
475
+
476
+ | Option | Description |
477
+ |--------|-------------|
478
+ | `--models "a,b,c"` | Current model set; a recorded model not in it is CRITICAL |
479
+ | `--prompts PATH` | Current prompts file; hash drift vs the recording is a WARNING |
480
+ | `--max-age-days N` | Recorded-at age over N days is INFO (defaults to 30 if no other check) |
481
+ | `--json` | Emit JSON; still exits 1 on any CRITICAL |
482
+
483
+ See [Check Stale](check-stale.md) for the full guide.
484
+
485
+ ---
486
+
468
487
  ## Exit codes
469
488
 
470
489
  | Code | Meaning |
@@ -4,7 +4,7 @@ Record agent runs as cassettes and replay them deterministically in CI for $0;
4
4
  mock LLMs/tools, score runs, and catch real model drift with live-eval.
5
5
  """
6
6
 
7
- __version__ = "0.2.0"
7
+ __version__ = "0.3.0"
8
8
 
9
9
  from evalcraft.capture.recorder import CaptureContext, capture
10
10
  from evalcraft.cloud.client import EvalcraftCloud
@@ -52,6 +52,7 @@ from evalcraft.mock.llm import MockLLM
52
52
  from evalcraft.mock.tool import MockTool
53
53
  from evalcraft.regression.detector import RegressionDetector, RegressionReport
54
54
  from evalcraft.replay.engine import ReplayEngine, replay
55
+ from evalcraft.staleness import StalenessChecker, StalenessFinding, StalenessReport
55
56
 
56
57
  __all__ = [
57
58
  "capture",
@@ -95,5 +96,8 @@ __all__ = [
95
96
  "GoldenSet",
96
97
  "RegressionDetector",
97
98
  "RegressionReport",
99
+ "StalenessChecker",
100
+ "StalenessFinding",
101
+ "StalenessReport",
98
102
  "EvalcraftCloud",
99
103
  ]
@@ -58,7 +58,7 @@ _SPAN_COLORS: dict[SpanKind, str] = {
58
58
  # ─── CLI root ─────────────────────────────────────────────────────────────────
59
59
 
60
60
  @click.group()
61
- @click.version_option(version="0.2.0", prog_name="evalcraft")
61
+ @click.version_option(version="0.3.0", prog_name="evalcraft")
62
62
  def cli() -> None:
63
63
  """evalcraft — capture, replay, and evaluate AI agent runs."""
64
64
 
@@ -885,6 +885,108 @@ def regression_cmd(cassette: str, golden: str, as_json: bool) -> None:
885
885
  sys.exit(1)
886
886
 
887
887
 
888
+ # ─── check-stale ──────────────────────────────────────────────────────────────
889
+
890
+ @cli.command("check-stale")
891
+ @click.argument("cassettes", nargs=-1, required=True,
892
+ type=click.Path(exists=True, dir_okay=False))
893
+ @click.option("--models", "models_csv", default=None,
894
+ help="Comma-separated current model set (e.g. 'gpt-5.1,claude-sonnet-4-5'). "
895
+ "A recorded model absent here is CRITICAL (retired/swapped).")
896
+ @click.option("--prompts", "prompts_path", default=None,
897
+ type=click.Path(exists=True, dir_okay=False),
898
+ help="JSON/text file of current prompts; its hash is compared to the "
899
+ "recorded prompt_hash (WARNING on drift).")
900
+ @click.option("--max-age-days", default=None, type=int,
901
+ help="Recorded-at age over N days is INFO. Defaults to 30 if no other check given.")
902
+ @click.option("--json", "as_json", is_flag=True, help="Output as JSON")
903
+ def check_stale_cmd(
904
+ cassettes: tuple[str, ...],
905
+ models_csv: str | None,
906
+ prompts_path: str | None,
907
+ max_age_days: int | None,
908
+ as_json: bool,
909
+ ) -> None:
910
+ """Flag CASSETTES recorded against a retired model or a drifted prompt.
911
+
912
+ Activates each cassette's recorded provenance (model set, prompt hash,
913
+ timestamp). Exits non-zero if ANY cassette references a model no longer in
914
+ --models, so CI can block deterministic tests that have silently gone stale.
915
+
916
+ Example:
917
+
918
+ evalcraft check-stale tests/cassettes/*.json --models "gpt-5.1,claude-sonnet-4-5"
919
+ """
920
+ from evalcraft.staleness import StalenessChecker, hash_prompts_file
921
+
922
+ current_models = (
923
+ [m.strip() for m in models_csv.split(",") if m.strip()]
924
+ if models_csv is not None
925
+ else None
926
+ )
927
+ current_prompt_hash = hash_prompts_file(prompts_path) if prompts_path else None
928
+
929
+ # With no explicit check requested, fall back to an age check (default 30d).
930
+ effective_age = max_age_days
931
+ if max_age_days is None and current_models is None and current_prompt_hash is None:
932
+ effective_age = 30
933
+
934
+ checker = StalenessChecker(max_age_days=effective_age)
935
+
936
+ reports = []
937
+ for path in cassettes:
938
+ cassette = _load_cassette(path)
939
+ report = checker.check(
940
+ cassette,
941
+ current_models=current_models,
942
+ current_prompt_hash=current_prompt_hash,
943
+ )
944
+ if not report.cassette_name:
945
+ report.cassette_name = Path(path).stem
946
+ reports.append(report)
947
+
948
+ any_critical = any(r.has_critical for r in reports)
949
+
950
+ if as_json:
951
+ click.echo(json.dumps(
952
+ {"cassettes": [r.to_dict() for r in reports]}, indent=2, default=str
953
+ ))
954
+ if any_critical:
955
+ sys.exit(1)
956
+ return
957
+
958
+ _SEV_COLORS = {"CRITICAL": "red", "WARNING": "yellow", "INFO": "blue"}
959
+
960
+ click.echo(
961
+ click.style(" staleness check", fg="cyan", bold=True)
962
+ + f" {len(reports)} cassette(s)"
963
+ )
964
+ click.echo()
965
+
966
+ total_findings = 0
967
+ for report in reports:
968
+ if not report.has_findings:
969
+ click.echo(click.style(" fresh", fg="green", bold=True) + f" {report.cassette_name}")
970
+ continue
971
+ click.echo(click.style(f" {report.cassette_name}", bold=True))
972
+ for f in report.findings:
973
+ total_findings += 1
974
+ color = _SEV_COLORS.get(f.severity.value, "white")
975
+ icon = click.style(f" {f.severity.value:<8}", fg=color, bold=True)
976
+ click.echo(f"{icon} [{f.category}] {f.message}")
977
+ click.echo()
978
+
979
+ if total_findings == 0:
980
+ click.echo(click.style(" all cassettes fresh", fg="green", bold=True))
981
+ elif any_critical:
982
+ click.echo(click.style(
983
+ " CRITICAL staleness found — re-record the affected cassettes", fg="red", bold=True
984
+ ))
985
+
986
+ if any_critical:
987
+ sys.exit(1)
988
+
989
+
888
990
  # ─── sanitize ─────────────────────────────────────────────────────────────────
889
991
 
890
992
  @cli.command()
@@ -1111,8 +1213,8 @@ def cloud() -> None:
1111
1213
  @cloud.command("login")
1112
1214
  @click.option("--api-key", prompt="API key", hide_input=True,
1113
1215
  help="Your Evalcraft API key (ec_...)")
1114
- @click.option("--url", default="https://api.evalcraft.dev/v1",
1115
- help="Override API base URL")
1216
+ @click.option("--url", default="",
1217
+ help="Your self-hosted dashboard URL (optional; there is no public hosted service)")
1116
1218
  def cloud_login(api_key: str, url: str) -> None:
1117
1219
  """Save your API key to ~/.evalcraft/config.json.
1118
1220
 
@@ -37,4 +37,4 @@ auto_upload = false
37
37
 
38
38
  [cloud]
39
39
  # Override the Evalcraft cloud API endpoint.
40
- # base_url = "https://api.evalcraft.dev/v1"
40
+ # base_url = "http://localhost:8000/v1" # your self-hosted dashboard (no public service)
@@ -34,13 +34,11 @@ from typing import Any
34
34
 
35
35
  logger = logging.getLogger(__name__)
36
36
 
37
- # NOTE: The hosted Evalcraft dashboard/API is not yet publicly available.
38
- # This default points at the *planned* hosted endpoint; until it ships, set
39
- # ``base_url`` (or the ``base_url`` field in ~/.evalcraft/config.json) to your
40
- # own self-hosted dashboard — see the ``dashboard/`` directory. All cloud
41
- # features are optional: the core capture / replay / eval workflow runs fully
42
- # offline and never contacts this endpoint.
43
- _DEFAULT_BASE_URL = "https://api.evalcraft.dev/v1"
37
+ # There is no public hosted Evalcraft API. Cloud features are optional and target
38
+ # a *self-hosted* dashboard (see the ``dashboard/`` directory); configure the
39
+ # endpoint explicitly via the ``base_url`` argument, the ``EVALCRAFT_BASE_URL``
40
+ # environment variable, or ``~/.evalcraft/config.json``. The core capture /
41
+ # replay / eval workflow runs fully offline and never contacts any endpoint.
44
42
  _CONFIG_DIR = Path.home() / ".evalcraft"
45
43
  _CONFIG_FILE = _CONFIG_DIR / "config.json"
46
44
  _QUEUE_DIR = _CONFIG_DIR / "queue"
@@ -104,7 +102,9 @@ class EvalcraftCloud:
104
102
  api_key: Bearer token (``ec_...``). If None, reads from
105
103
  ``~/.evalcraft/config.json`` or the ``EVALCRAFT_API_KEY``
106
104
  environment variable.
107
- base_url: Override the default API endpoint.
105
+ base_url: URL of your self-hosted Evalcraft dashboard. Required for any
106
+ cloud call — there is no public hosted service. Falls back to the
107
+ ``EVALCRAFT_BASE_URL`` env var, then ``~/.evalcraft/config.json``.
108
108
  timeout: Request timeout in seconds (default 30).
109
109
  max_retries: Maximum number of retry attempts for transient errors
110
110
  (default 3). Uses exponential backoff with jitter.
@@ -115,13 +115,13 @@ class EvalcraftCloud:
115
115
  def __init__(
116
116
  self,
117
117
  api_key: str | None = None,
118
- base_url: str = _DEFAULT_BASE_URL,
118
+ base_url: str | None = None,
119
119
  timeout: int = 30,
120
120
  max_retries: int = 3,
121
121
  queue_dir: Path | None = None,
122
122
  ):
123
123
  self.api_key = api_key or self._load_api_key()
124
- self.base_url = base_url.rstrip("/")
124
+ self.base_url = (base_url or self._load_base_url()).rstrip("/")
125
125
  self.timeout = timeout
126
126
  self.max_retries = max_retries
127
127
  self.queue_dir = queue_dir or _QUEUE_DIR
@@ -237,8 +237,8 @@ class EvalcraftCloud:
237
237
  # ──────────────────────────────────────────
238
238
 
239
239
  @staticmethod
240
- def save_config(api_key: str, base_url: str = _DEFAULT_BASE_URL) -> None:
241
- """Persist API key and base URL to ``~/.evalcraft/config.json``."""
240
+ def save_config(api_key: str, base_url: str = "") -> None:
241
+ """Persist the API key (and optional dashboard URL) to ``~/.evalcraft/config.json``."""
242
242
  _CONFIG_DIR.mkdir(parents=True, exist_ok=True)
243
243
  config: dict = {}
244
244
  if _CONFIG_FILE.exists():
@@ -247,7 +247,8 @@ class EvalcraftCloud:
247
247
  except Exception:
248
248
  pass
249
249
  config["api_key"] = api_key
250
- config["base_url"] = base_url
250
+ if base_url:
251
+ config["base_url"] = base_url
251
252
  _CONFIG_FILE.write_text(json.dumps(config, indent=2))
252
253
  _CONFIG_FILE.chmod(0o600)
253
254
 
@@ -288,6 +289,15 @@ class EvalcraftCloud:
288
289
  config = self.load_config()
289
290
  return str(config.get("api_key", ""))
290
291
 
292
+ def _load_base_url(self) -> str:
293
+ """Resolve the dashboard base URL from env or config (empty if unset)."""
294
+ import os
295
+ env_url = os.environ.get("EVALCRAFT_BASE_URL", "")
296
+ if env_url:
297
+ return env_url
298
+ config = self.load_config()
299
+ return str(config.get("base_url", ""))
300
+
291
301
  def _request(
292
302
  self,
293
303
  method: str,
@@ -307,11 +317,18 @@ class EvalcraftCloud:
307
317
  Raises:
308
318
  CloudUploadError: After max_retries exhausted or on 4xx errors.
309
319
  """
320
+ if not self.base_url:
321
+ raise CloudUploadError(
322
+ "No Evalcraft dashboard URL is configured. There is no public "
323
+ "hosted service — point the client at your own self-hosted "
324
+ "dashboard (see the dashboard/ directory) via base_url=..., the "
325
+ "EVALCRAFT_BASE_URL env var, or ~/.evalcraft/config.json."
326
+ )
310
327
  url = f"{self.base_url}{path}"
311
328
  body: bytes | None = None
312
329
  headers: dict[str, str] = {
313
330
  "Accept": "application/json",
314
- "User-Agent": "evalcraft-sdk/0.2.0",
331
+ "User-Agent": "evalcraft-sdk/0.3.0",
315
332
  }
316
333
  if self.api_key:
317
334
  headers["Authorization"] = f"Bearer {self.api_key}"
@@ -165,6 +165,21 @@ class Provenance:
165
165
  )
166
166
 
167
167
 
168
+ def compute_prompt_hash(input_text: str, llm_inputs: list[Any]) -> str:
169
+ """Hash the prompt surface of a run — the user input plus each LLM span's input.
170
+
171
+ Used both when recording provenance (:meth:`Cassette.capture_provenance`) and
172
+ when checking staleness, so a recorded hash and a recomputed one match
173
+ byte-for-byte. List order is significant; only dict keys are sorted.
174
+ """
175
+ basis = json.dumps(
176
+ {"input_text": input_text, "llm_inputs": list(llm_inputs)},
177
+ sort_keys=True,
178
+ default=str,
179
+ )
180
+ return hashlib.sha256(basis.encode()).hexdigest()[:16]
181
+
182
+
168
183
  @dataclass
169
184
  class Cassette:
170
185
  """A recorded agent run — the fundamental unit of Evalcraft.
@@ -230,15 +245,9 @@ class Cassette:
230
245
 
231
246
  llm_spans = self.get_llm_calls()
232
247
  models = sorted({s.model for s in llm_spans if s.model})
233
- basis = json.dumps(
234
- {
235
- "input_text": self.input_text,
236
- "llm_inputs": [s.input for s in llm_spans],
237
- },
238
- sort_keys=True,
239
- default=str,
248
+ prompt_hash = compute_prompt_hash(
249
+ self.input_text, [s.input for s in llm_spans]
240
250
  )
241
- prompt_hash = hashlib.sha256(basis.encode()).hexdigest()[:16]
242
251
 
243
252
  self.provenance = Provenance(
244
253
  recorded_at=time.time(),
@@ -288,7 +297,7 @@ class Cassette:
288
297
  self.compute_metrics()
289
298
  self.compute_fingerprint()
290
299
  return {
291
- "evalcraft_version": "0.2.0",
300
+ "evalcraft_version": "0.3.0",
292
301
  "cassette": {
293
302
  "id": self.id,
294
303
  "name": self.name,
@@ -0,0 +1,25 @@
1
+ """Staleness detection — flag cassettes recorded against retired models / drifted prompts.
2
+
3
+ A cassette's recorded provenance (model set, prompt hash, timestamp) is only
4
+ useful if something acts on it. This module does: it turns that provenance into
5
+ actionable CI signal so a deterministic test can't silently keep passing against
6
+ a model that no longer exists.
7
+
8
+ from evalcraft.staleness import StalenessChecker
9
+ """
10
+
11
+ from evalcraft.core.models import compute_prompt_hash
12
+ from evalcraft.staleness.checker import (
13
+ StalenessChecker,
14
+ StalenessFinding,
15
+ StalenessReport,
16
+ hash_prompts_file,
17
+ )
18
+
19
+ __all__ = [
20
+ "StalenessChecker",
21
+ "StalenessFinding",
22
+ "StalenessReport",
23
+ "compute_prompt_hash",
24
+ "hash_prompts_file",
25
+ ]