evalcraft 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. evalcraft-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +60 -0
  2. evalcraft-0.1.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
  3. evalcraft-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +40 -0
  4. evalcraft-0.1.0/.github/workflows/ci.yml +106 -0
  5. evalcraft-0.1.0/.github/workflows/example-ci-gate.yml +109 -0
  6. evalcraft-0.1.0/.github/workflows/publish.yml +68 -0
  7. evalcraft-0.1.0/.gitignore +46 -0
  8. evalcraft-0.1.0/CHANGELOG.md +76 -0
  9. evalcraft-0.1.0/CONTRIBUTING.md +86 -0
  10. evalcraft-0.1.0/LICENSE +21 -0
  11. evalcraft-0.1.0/PKG-INFO +431 -0
  12. evalcraft-0.1.0/README.md +369 -0
  13. evalcraft-0.1.0/action.yml +379 -0
  14. evalcraft-0.1.0/dashboard/backend/Dockerfile +17 -0
  15. evalcraft-0.1.0/dashboard/backend/alembic/env.py +64 -0
  16. evalcraft-0.1.0/dashboard/backend/alembic/script.py.mako +25 -0
  17. evalcraft-0.1.0/dashboard/backend/alembic/versions/.gitkeep +0 -0
  18. evalcraft-0.1.0/dashboard/backend/alembic.ini +36 -0
  19. evalcraft-0.1.0/dashboard/backend/app/__init__.py +0 -0
  20. evalcraft-0.1.0/dashboard/backend/app/api/__init__.py +0 -0
  21. evalcraft-0.1.0/dashboard/backend/app/api/auth.py +224 -0
  22. evalcraft-0.1.0/dashboard/backend/app/api/cassettes.py +176 -0
  23. evalcraft-0.1.0/dashboard/backend/app/api/golden_sets.py +161 -0
  24. evalcraft-0.1.0/dashboard/backend/app/api/projects.py +96 -0
  25. evalcraft-0.1.0/dashboard/backend/app/api/regressions.py +72 -0
  26. evalcraft-0.1.0/dashboard/backend/app/api/webhooks.py +80 -0
  27. evalcraft-0.1.0/dashboard/backend/app/config.py +41 -0
  28. evalcraft-0.1.0/dashboard/backend/app/database.py +43 -0
  29. evalcraft-0.1.0/dashboard/backend/app/main.py +66 -0
  30. evalcraft-0.1.0/dashboard/backend/app/models/__init__.py +18 -0
  31. evalcraft-0.1.0/dashboard/backend/app/models/cassette.py +46 -0
  32. evalcraft-0.1.0/dashboard/backend/app/models/golden_set.py +33 -0
  33. evalcraft-0.1.0/dashboard/backend/app/models/project.py +30 -0
  34. evalcraft-0.1.0/dashboard/backend/app/models/regression.py +68 -0
  35. evalcraft-0.1.0/dashboard/backend/app/models/user.py +60 -0
  36. evalcraft-0.1.0/dashboard/backend/app/schemas/__init__.py +0 -0
  37. evalcraft-0.1.0/dashboard/backend/app/schemas/api.py +227 -0
  38. evalcraft-0.1.0/dashboard/backend/app/services/alert_service.py +61 -0
  39. evalcraft-0.1.0/dashboard/backend/app/services/analytics_service.py +62 -0
  40. evalcraft-0.1.0/dashboard/backend/app/services/cassette_service.py +78 -0
  41. evalcraft-0.1.0/dashboard/backend/app/services/regression_service.py +88 -0
  42. evalcraft-0.1.0/dashboard/backend/requirements.txt +12 -0
  43. evalcraft-0.1.0/dashboard/frontend/.gitignore +24 -0
  44. evalcraft-0.1.0/dashboard/frontend/README.md +73 -0
  45. evalcraft-0.1.0/dashboard/frontend/eslint.config.js +23 -0
  46. evalcraft-0.1.0/dashboard/frontend/index.html +13 -0
  47. evalcraft-0.1.0/dashboard/frontend/package-lock.json +4281 -0
  48. evalcraft-0.1.0/dashboard/frontend/package.json +35 -0
  49. evalcraft-0.1.0/dashboard/frontend/public/vite.svg +1 -0
  50. evalcraft-0.1.0/dashboard/frontend/src/App.css +42 -0
  51. evalcraft-0.1.0/dashboard/frontend/src/App.tsx +51 -0
  52. evalcraft-0.1.0/dashboard/frontend/src/assets/react.svg +1 -0
  53. evalcraft-0.1.0/dashboard/frontend/src/components/Layout.tsx +54 -0
  54. evalcraft-0.1.0/dashboard/frontend/src/components/MetricCard.tsx +64 -0
  55. evalcraft-0.1.0/dashboard/frontend/src/components/Sidebar.tsx +252 -0
  56. evalcraft-0.1.0/dashboard/frontend/src/components/StatusBadge.tsx +53 -0
  57. evalcraft-0.1.0/dashboard/frontend/src/components/Toast.tsx +80 -0
  58. evalcraft-0.1.0/dashboard/frontend/src/data/mock.ts +143 -0
  59. evalcraft-0.1.0/dashboard/frontend/src/index.css +55 -0
  60. evalcraft-0.1.0/dashboard/frontend/src/main.tsx +10 -0
  61. evalcraft-0.1.0/dashboard/frontend/src/pages/Analytics.tsx +158 -0
  62. evalcraft-0.1.0/dashboard/frontend/src/pages/CassetteDetail.tsx +199 -0
  63. evalcraft-0.1.0/dashboard/frontend/src/pages/Cassettes.tsx +221 -0
  64. evalcraft-0.1.0/dashboard/frontend/src/pages/Dashboard.tsx +233 -0
  65. evalcraft-0.1.0/dashboard/frontend/src/pages/GoldenSets.tsx +196 -0
  66. evalcraft-0.1.0/dashboard/frontend/src/pages/Login.tsx +272 -0
  67. evalcraft-0.1.0/dashboard/frontend/src/pages/Regressions.tsx +148 -0
  68. evalcraft-0.1.0/dashboard/frontend/src/pages/Settings.tsx +193 -0
  69. evalcraft-0.1.0/dashboard/frontend/tsconfig.app.json +28 -0
  70. evalcraft-0.1.0/dashboard/frontend/tsconfig.json +7 -0
  71. evalcraft-0.1.0/dashboard/frontend/tsconfig.node.json +26 -0
  72. evalcraft-0.1.0/dashboard/frontend/vite.config.ts +7 -0
  73. evalcraft-0.1.0/docs/index.md +110 -0
  74. evalcraft-0.1.0/docs/user-guide/adapters/anthropic.md +149 -0
  75. evalcraft-0.1.0/docs/user-guide/adapters/crewai.md +172 -0
  76. evalcraft-0.1.0/docs/user-guide/adapters/langgraph.md +146 -0
  77. evalcraft-0.1.0/docs/user-guide/adapters/openai.md +141 -0
  78. evalcraft-0.1.0/docs/user-guide/capture.md +330 -0
  79. evalcraft-0.1.0/docs/user-guide/changelog.md +90 -0
  80. evalcraft-0.1.0/docs/user-guide/ci-cd.md +267 -0
  81. evalcraft-0.1.0/docs/user-guide/cli.md +483 -0
  82. evalcraft-0.1.0/docs/user-guide/concepts.md +183 -0
  83. evalcraft-0.1.0/docs/user-guide/index.md +62 -0
  84. evalcraft-0.1.0/docs/user-guide/mock.md +454 -0
  85. evalcraft-0.1.0/docs/user-guide/pytest-plugin.md +295 -0
  86. evalcraft-0.1.0/docs/user-guide/quickstart.md +326 -0
  87. evalcraft-0.1.0/docs/user-guide/replay.md +308 -0
  88. evalcraft-0.1.0/docs/user-guide/scorers.md +302 -0
  89. evalcraft-0.1.0/evalcraft/__init__.py +50 -0
  90. evalcraft-0.1.0/evalcraft/adapters/__init__.py +47 -0
  91. evalcraft-0.1.0/evalcraft/adapters/anthropic_adapter.py +298 -0
  92. evalcraft-0.1.0/evalcraft/adapters/autogen_adapter.py +362 -0
  93. evalcraft-0.1.0/evalcraft/adapters/crewai_adapter.py +361 -0
  94. evalcraft-0.1.0/evalcraft/adapters/langgraph_adapter.py +546 -0
  95. evalcraft-0.1.0/evalcraft/adapters/llamaindex_adapter.py +434 -0
  96. evalcraft-0.1.0/evalcraft/adapters/openai_adapter.py +305 -0
  97. evalcraft-0.1.0/evalcraft/alerts/__init__.py +7 -0
  98. evalcraft-0.1.0/evalcraft/alerts/email.py +195 -0
  99. evalcraft-0.1.0/evalcraft/alerts/slack.py +201 -0
  100. evalcraft-0.1.0/evalcraft/alerts/webhook.py +82 -0
  101. evalcraft-0.1.0/evalcraft/capture/__init__.py +0 -0
  102. evalcraft-0.1.0/evalcraft/capture/recorder.py +318 -0
  103. evalcraft-0.1.0/evalcraft/cli/__init__.py +0 -0
  104. evalcraft-0.1.0/evalcraft/cli/init_cmd.py +245 -0
  105. evalcraft-0.1.0/evalcraft/cli/main.py +1128 -0
  106. evalcraft-0.1.0/evalcraft/cli/templates/__init__.py +1 -0
  107. evalcraft-0.1.0/evalcraft/cli/templates/conftest.py +42 -0
  108. evalcraft-0.1.0/evalcraft/cli/templates/evalcraft.toml +40 -0
  109. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_anthropic.py +148 -0
  110. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_crewai.py +138 -0
  111. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_generic.py +155 -0
  112. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_langgraph.py +141 -0
  113. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_openai.py +140 -0
  114. evalcraft-0.1.0/evalcraft/cloud/__init__.py +5 -0
  115. evalcraft-0.1.0/evalcraft/cloud/client.py +361 -0
  116. evalcraft-0.1.0/evalcraft/core/__init__.py +0 -0
  117. evalcraft-0.1.0/evalcraft/core/models.py +329 -0
  118. evalcraft-0.1.0/evalcraft/eval/__init__.py +0 -0
  119. evalcraft-0.1.0/evalcraft/eval/scorers/__init__.py +368 -0
  120. evalcraft-0.1.0/evalcraft/golden/__init__.py +5 -0
  121. evalcraft-0.1.0/evalcraft/golden/manager.py +402 -0
  122. evalcraft-0.1.0/evalcraft/mock/__init__.py +0 -0
  123. evalcraft-0.1.0/evalcraft/mock/llm.py +243 -0
  124. evalcraft-0.1.0/evalcraft/mock/tool.py +213 -0
  125. evalcraft-0.1.0/evalcraft/pytest_plugin/__init__.py +35 -0
  126. evalcraft-0.1.0/evalcraft/pytest_plugin/plugin.py +397 -0
  127. evalcraft-0.1.0/evalcraft/regression/__init__.py +10 -0
  128. evalcraft-0.1.0/evalcraft/regression/detector.py +447 -0
  129. evalcraft-0.1.0/evalcraft/replay/__init__.py +0 -0
  130. evalcraft-0.1.0/evalcraft/replay/engine.py +337 -0
  131. evalcraft-0.1.0/evalcraft/replay/network_guard.py +181 -0
  132. evalcraft-0.1.0/evalcraft/sanitize/__init__.py +5 -0
  133. evalcraft-0.1.0/evalcraft/sanitize/redactor.py +261 -0
  134. evalcraft-0.1.0/examples/anthropic-agent/README.md +109 -0
  135. evalcraft-0.1.0/examples/anthropic-agent/agent.py +255 -0
  136. evalcraft-0.1.0/examples/anthropic-agent/record_cassettes.py +81 -0
  137. evalcraft-0.1.0/examples/anthropic-agent/requirements.txt +3 -0
  138. evalcraft-0.1.0/examples/anthropic-agent/tests/cassettes/auth_middleware_review.json +176 -0
  139. evalcraft-0.1.0/examples/anthropic-agent/tests/cassettes/db_pool_refactor_review.json +169 -0
  140. evalcraft-0.1.0/examples/anthropic-agent/tests/test_code_review_agent.py +288 -0
  141. evalcraft-0.1.0/examples/basic_capture.py +101 -0
  142. evalcraft-0.1.0/examples/ci-pipeline/.github/workflows/eval.yml +193 -0
  143. evalcraft-0.1.0/examples/ci-pipeline/README.md +140 -0
  144. evalcraft-0.1.0/examples/ci-pipeline/evalcraft_gate.py +373 -0
  145. evalcraft-0.1.0/examples/ci-pipeline/requirements.txt +3 -0
  146. evalcraft-0.1.0/examples/ci-pipeline/tests/test_ci_gate.py +306 -0
  147. evalcraft-0.1.0/examples/langgraph-workflow/README.md +119 -0
  148. evalcraft-0.1.0/examples/langgraph-workflow/record_cassettes.py +93 -0
  149. evalcraft-0.1.0/examples/langgraph-workflow/requirements.txt +6 -0
  150. evalcraft-0.1.0/examples/langgraph-workflow/tests/cassettes/equipment_stipend.json +233 -0
  151. evalcraft-0.1.0/examples/langgraph-workflow/tests/cassettes/remote_work_policy.json +233 -0
  152. evalcraft-0.1.0/examples/langgraph-workflow/tests/test_rag_workflow.py +297 -0
  153. evalcraft-0.1.0/examples/langgraph-workflow/workflow.py +254 -0
  154. evalcraft-0.1.0/examples/openai-agent/README.md +127 -0
  155. evalcraft-0.1.0/examples/openai-agent/agent.py +190 -0
  156. evalcraft-0.1.0/examples/openai-agent/build_golden.py +62 -0
  157. evalcraft-0.1.0/examples/openai-agent/record_cassettes.py +98 -0
  158. evalcraft-0.1.0/examples/openai-agent/requirements.txt +3 -0
  159. evalcraft-0.1.0/examples/openai-agent/tests/cassettes/damaged_item.json +149 -0
  160. evalcraft-0.1.0/examples/openai-agent/tests/cassettes/order_tracking.json +149 -0
  161. evalcraft-0.1.0/examples/openai-agent/tests/cassettes/return_request.json +146 -0
  162. evalcraft-0.1.0/examples/openai-agent/tests/test_golden.py +52 -0
  163. evalcraft-0.1.0/examples/openai-agent/tests/test_support_agent.py +234 -0
  164. evalcraft-0.1.0/examples/replay_and_diff.py +272 -0
  165. evalcraft-0.1.0/examples/test_with_mocks.py +305 -0
  166. evalcraft-0.1.0/mkdocs.yml +84 -0
  167. evalcraft-0.1.0/packages/evalcraft-js/package-lock.json +2908 -0
  168. evalcraft-0.1.0/packages/evalcraft-js/package.json +54 -0
  169. evalcraft-0.1.0/packages/evalcraft-js/src/adapters/openai.ts +68 -0
  170. evalcraft-0.1.0/packages/evalcraft-js/src/adapters/vercel-ai.ts +293 -0
  171. evalcraft-0.1.0/packages/evalcraft-js/src/capture/recorder.ts +211 -0
  172. evalcraft-0.1.0/packages/evalcraft-js/src/core/models.ts +394 -0
  173. evalcraft-0.1.0/packages/evalcraft-js/src/core/types.ts +36 -0
  174. evalcraft-0.1.0/packages/evalcraft-js/src/eval/scorers.ts +294 -0
  175. evalcraft-0.1.0/packages/evalcraft-js/src/index.ts +46 -0
  176. evalcraft-0.1.0/packages/evalcraft-js/src/mock/llm.ts +191 -0
  177. evalcraft-0.1.0/packages/evalcraft-js/src/mock/tool.ts +172 -0
  178. evalcraft-0.1.0/packages/evalcraft-js/src/replay/engine.ts +248 -0
  179. evalcraft-0.1.0/packages/evalcraft-js/tests/capture.test.ts +155 -0
  180. evalcraft-0.1.0/packages/evalcraft-js/tests/mock.test.ts +283 -0
  181. evalcraft-0.1.0/packages/evalcraft-js/tests/models.test.ts +210 -0
  182. evalcraft-0.1.0/packages/evalcraft-js/tests/replay.test.ts +195 -0
  183. evalcraft-0.1.0/packages/evalcraft-js/tests/scorers.test.ts +286 -0
  184. evalcraft-0.1.0/packages/evalcraft-js/tests/vercel-ai.test.ts +441 -0
  185. evalcraft-0.1.0/packages/evalcraft-js/tsconfig.json +19 -0
  186. evalcraft-0.1.0/packages/evalcraft-js/vitest.config.ts +9 -0
  187. evalcraft-0.1.0/pyproject.toml +93 -0
  188. evalcraft-0.1.0/site/index.html +995 -0
  189. evalcraft-0.1.0/tests/conftest.py +75 -0
  190. evalcraft-0.1.0/tests/test_alerts.py +469 -0
  191. evalcraft-0.1.0/tests/test_anthropic_adapter.py +440 -0
  192. evalcraft-0.1.0/tests/test_cloud.py +405 -0
  193. evalcraft-0.1.0/tests/test_golden.py +482 -0
  194. evalcraft-0.1.0/tests/test_init.py +442 -0
  195. evalcraft-0.1.0/tests/test_mock_llm.py +285 -0
  196. evalcraft-0.1.0/tests/test_mock_tool.py +317 -0
  197. evalcraft-0.1.0/tests/test_models.py +334 -0
  198. evalcraft-0.1.0/tests/test_network_guard.py +387 -0
  199. evalcraft-0.1.0/tests/test_recorder.py +231 -0
  200. evalcraft-0.1.0/tests/test_regression.py +536 -0
  201. evalcraft-0.1.0/tests/test_replay.py +287 -0
  202. evalcraft-0.1.0/tests/test_sanitize.py +579 -0
  203. evalcraft-0.1.0/tests/test_scorers.py +364 -0
@@ -0,0 +1,60 @@
1
+ name: Bug Report
2
+ description: Something is broken
3
+ labels: ["bug"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for filing a bug. Please fill in as much detail as you can.
9
+
10
+ - type: textarea
11
+ id: description
12
+ attributes:
13
+ label: What happened?
14
+ description: A clear description of the bug.
15
+ validations:
16
+ required: true
17
+
18
+ - type: textarea
19
+ id: reproduction
20
+ attributes:
21
+ label: Minimal reproduction
22
+ description: Code or steps to reproduce the issue.
23
+ render: python
24
+ validations:
25
+ required: true
26
+
27
+ - type: textarea
28
+ id: expected
29
+ attributes:
30
+ label: Expected behavior
31
+ validations:
32
+ required: true
33
+
34
+ - type: textarea
35
+ id: traceback
36
+ attributes:
37
+ label: Error / traceback
38
+ render: text
39
+
40
+ - type: input
41
+ id: version
42
+ attributes:
43
+ label: Evalcraft version
44
+ placeholder: "e.g. 0.1.0"
45
+ validations:
46
+ required: true
47
+
48
+ - type: input
49
+ id: python
50
+ attributes:
51
+ label: Python version
52
+ placeholder: "e.g. 3.11"
53
+ validations:
54
+ required: true
55
+
56
+ - type: input
57
+ id: framework
58
+ attributes:
59
+ label: Framework / adapter (if relevant)
60
+ placeholder: "e.g. OpenAI SDK 1.30, LangChain 0.2"
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: Question / Discussion
4
+ url: https://github.com/beyhangl/evalcraft/discussions
5
+ about: Ask questions and discuss ideas in GitHub Discussions.
@@ -0,0 +1,40 @@
1
+ name: Feature Request
2
+ description: Propose a new feature or improvement
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: problem
7
+ attributes:
8
+ label: What problem does this solve?
9
+ description: Describe the use case or pain point.
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: solution
15
+ attributes:
16
+ label: Proposed solution
17
+ description: What should the API / behavior look like? Code sketches welcome.
18
+ validations:
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: alternatives
23
+ attributes:
24
+ label: Alternatives considered
25
+ description: Other approaches you considered and why you prefer this one.
26
+
27
+ - type: dropdown
28
+ id: scope
29
+ attributes:
30
+ label: Scope
31
+ options:
32
+ - Core (capture / replay / assertions)
33
+ - Mock LLM / Mock Tools
34
+ - Framework adapter (OpenAI, LangChain, etc.)
35
+ - CLI
36
+ - pytest plugin
37
+ - Docs / examples
38
+ - Other
39
+ validations:
40
+ required: true
@@ -0,0 +1,106 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ # ── Python SDK ────────────────────────────────────────────────────────
11
+ test-python:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12"]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Set up Python ${{ matrix.python-version }}
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: ${{ matrix.python-version }}
24
+
25
+ - name: Install dependencies
26
+ run: pip install -e ".[dev]"
27
+
28
+ - name: Run tests
29
+ run: pytest tests/ -v --tb=short
30
+
31
+ - name: Check imports
32
+ run: python -c "from evalcraft import capture, replay, MockLLM, MockTool, assert_tool_called; print('OK')"
33
+
34
+ build-python:
35
+ runs-on: ubuntu-latest
36
+ needs: test-python
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - name: Set up Python
41
+ uses: actions/setup-python@v5
42
+ with:
43
+ python-version: "3.12"
44
+
45
+ - name: Install build tools
46
+ run: pip install build
47
+
48
+ - name: Build package
49
+ run: python -m build
50
+
51
+ - name: Check dist
52
+ run: |
53
+ pip install twine
54
+ twine check dist/*
55
+
56
+ # ── TypeScript SDK ────────────────────────────────────────────────────
57
+ test-typescript:
58
+ runs-on: ubuntu-latest
59
+ strategy:
60
+ matrix:
61
+ node-version: ["18", "20", "22"]
62
+
63
+ defaults:
64
+ run:
65
+ working-directory: packages/evalcraft-js
66
+
67
+ steps:
68
+ - uses: actions/checkout@v4
69
+
70
+ - name: Set up Node ${{ matrix.node-version }}
71
+ uses: actions/setup-node@v4
72
+ with:
73
+ node-version: ${{ matrix.node-version }}
74
+
75
+ - name: Install dependencies
76
+ run: npm ci
77
+
78
+ - name: Type check
79
+ run: npx tsc --noEmit
80
+
81
+ - name: Run tests
82
+ run: npx vitest run
83
+
84
+ # ── Dashboard Frontend ────────────────────────────────────────────────
85
+ build-frontend:
86
+ runs-on: ubuntu-latest
87
+ defaults:
88
+ run:
89
+ working-directory: dashboard/frontend
90
+
91
+ steps:
92
+ - uses: actions/checkout@v4
93
+
94
+ - name: Set up Node
95
+ uses: actions/setup-node@v4
96
+ with:
97
+ node-version: "20"
98
+
99
+ - name: Install dependencies
100
+ run: npm ci
101
+
102
+ - name: Type check
103
+ run: npx tsc -b --noEmit
104
+
105
+ - name: Build
106
+ run: npx vite build
@@ -0,0 +1,109 @@
1
+ # Example: Evalcraft CI Gate
2
+ #
3
+ # Copy this workflow to your repo's .github/workflows/ directory and adapt it
4
+ # to your project. Teams using evalcraft can gate PRs on agent test results —
5
+ # blocking merges when tests fail or cost/regression thresholds are exceeded.
6
+ #
7
+ # For full documentation see:
8
+ # https://github.com/beyhangl/evalcraft/blob/main/docs/user-guide/ci-cd.md
9
+
10
+ name: Agent Tests (Evalcraft CI Gate)
11
+
12
+ on:
13
+ pull_request:
14
+ branches: [main, master]
15
+ push:
16
+ branches: [main, master]
17
+
18
+ # Only allow one run per PR branch at a time; cancel superseded runs.
19
+ concurrency:
20
+ group: evalcraft-${{ github.ref }}
21
+ cancel-in-progress: true
22
+
23
+ jobs:
24
+ # ── Replay-only gate (fast, no API keys required) ────────────────────────────
25
+ #
26
+ # Runs your agent tests against committed cassettes. Fails if any test fails
27
+ # or if total cost / token regressions exceed your thresholds.
28
+ #
29
+ # This is the recommended default for pull request checks.
30
+ agent-tests:
31
+ name: Agent Tests
32
+ runs-on: ubuntu-latest
33
+ permissions:
34
+ contents: read # read cassette files
35
+ pull-requests: write # post results comment
36
+
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - uses: beyhangl/evalcraft@v0.1.0
41
+ with:
42
+ # Path(s) to your agent test files
43
+ test-path: tests/agent_tests/
44
+
45
+ # Cassette directory (must match your pytest --cassette-dir setting)
46
+ cassette-dir: tests/cassettes/
47
+
48
+ # Fail the PR if total agent cost exceeds this value (USD)
49
+ max-cost: '0.10'
50
+
51
+ # Fail the PR if any cassette metric increases by more than this %
52
+ # relative to the baseline cassettes committed in git.
53
+ # Only meaningful when record-mode is 'new' or 'all'.
54
+ max-regression: '5%'
55
+
56
+ # Replay recorded cassettes without making real LLM calls (default).
57
+ # Change to 'new' or 'all' to re-record during CI (requires API keys).
58
+ record-mode: none
59
+
60
+ # Post a results table as a PR comment (requires pull-requests: write)
61
+ post-comment: 'true'
62
+
63
+ # Pin to a specific evalcraft version for reproducible CI
64
+ # evalcraft-version: '0.1.0'
65
+
66
+ # The action exposes outputs you can use in subsequent steps:
67
+ # - passed (true/false)
68
+ # - total-cost (e.g. "0.0423")
69
+ # - total-tokens (e.g. "1840")
70
+ # - total-tool-calls
71
+ # - cassette-count
72
+
73
+
74
+ # ── Optional: nightly re-record job ─────────────────────────────────────────
75
+ #
76
+ # Runs on a schedule with real LLM calls, re-records cassettes, and opens a
77
+ # PR if any cassette changed. Requires your LLM API key as a secret.
78
+ #
79
+ # Uncomment and adapt if you want automated cassette refresh.
80
+
81
+ # nightly-record:
82
+ # name: Nightly cassette refresh
83
+ # runs-on: ubuntu-latest
84
+ # if: github.event_name == 'schedule'
85
+ # permissions:
86
+ # contents: write
87
+ # pull-requests: write
88
+ #
89
+ # steps:
90
+ # - uses: actions/checkout@v4
91
+ #
92
+ # - uses: beyhangl/evalcraft@v0.1.0
93
+ # env:
94
+ # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
95
+ # # or ANTHROPIC_API_KEY, etc.
96
+ # with:
97
+ # test-path: tests/agent_tests/
98
+ # cassette-dir: tests/cassettes/
99
+ # record-mode: all # re-record every cassette
100
+ # max-regression: '10%' # allow up to 10% drift from last recording
101
+ # post-comment: 'false'
102
+ #
103
+ # - name: Open PR if cassettes changed
104
+ # uses: peter-evans/create-pull-request@v6
105
+ # with:
106
+ # commit-message: 'chore: refresh evalcraft cassettes'
107
+ # title: '🤖 Cassette refresh — nightly agent test update'
108
+ # branch: cassette-refresh/nightly
109
+ # labels: cassettes, automated
@@ -0,0 +1,68 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Install dependencies
20
+ run: pip install -e ".[dev]"
21
+
22
+ - name: Run tests
23
+ run: pytest tests/ -v --tb=short
24
+
25
+ - name: Check imports
26
+ run: python -c "from evalcraft import capture, replay, MockLLM, MockTool, assert_tool_called; print('OK')"
27
+
28
+ build:
29
+ runs-on: ubuntu-latest
30
+ needs: test
31
+ steps:
32
+ - uses: actions/checkout@v4
33
+
34
+ - name: Set up Python
35
+ uses: actions/setup-python@v5
36
+ with:
37
+ python-version: "3.12"
38
+
39
+ - name: Install build tools
40
+ run: pip install build twine
41
+
42
+ - name: Build package
43
+ run: python -m build
44
+
45
+ - name: Validate distributions
46
+ run: twine check dist/*
47
+
48
+ - name: Upload dist artifacts
49
+ uses: actions/upload-artifact@v4
50
+ with:
51
+ name: dist
52
+ path: dist/
53
+
54
+ publish:
55
+ runs-on: ubuntu-latest
56
+ needs: build
57
+ environment: pypi
58
+ permissions:
59
+ id-token: write
60
+ steps:
61
+ - name: Download dist artifacts
62
+ uses: actions/download-artifact@v4
63
+ with:
64
+ name: dist
65
+ path: dist/
66
+
67
+ - name: Publish to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,46 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ *.whl
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+ *.swo
22
+ *~
23
+
24
+ # Testing
25
+ .pytest_cache/
26
+ .coverage
27
+ htmlcov/
28
+ .mypy_cache/
29
+
30
+ # OS
31
+ .DS_Store
32
+ Thumbs.db
33
+
34
+ # Node
35
+ node_modules/
36
+ .npm
37
+ .pnpm-store/
38
+
39
+ # Cassettes (user-generated, not tracked)
40
+ tests/cassettes/*.json
41
+ !tests/cassettes/.gitkeep
42
+
43
+ # Internal docs (strategy, launch content — not public)
44
+ docs/strategy/
45
+ docs/launch/
46
+ docs/blog/
@@ -0,0 +1,76 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] — 2026-03-05
9
+
10
+ Initial public release of Evalcraft — the pytest for AI agents.
11
+
12
+ ### Added
13
+
14
+ #### Core data model
15
+ - `Span` — atomic unit of capture, recording every LLM call, tool invocation, agent step, user input, and output with timing, token usage, and cost metadata
16
+ - `Cassette` — the fundamental recording unit that stores all spans from a single agent execution; supports fingerprinting for change detection, aggregate metrics, and JSON serialization/deserialization
17
+ - `AgentRun` — wrapper for live or replayed agent results
18
+ - `EvalResult` / `AssertionResult` — structured pass/fail results for assertions with score tracking
19
+ - `SpanKind` enum: `llm_request`, `llm_response`, `tool_call`, `tool_result`, `agent_step`, `user_input`, `agent_output`
20
+ - `TokenUsage` dataclass tracking prompt, completion, and total tokens
21
+
22
+ #### Capture
23
+ - `capture()` context manager — instrument any code block to record spans into a cassette
24
+ - `CaptureContext` — configurable capture session with name, agent name, framework tag, and optional auto-save path
25
+
26
+ #### Replay
27
+ - `ReplayEngine` — feeds recorded LLM responses back without making real API calls
28
+ - Tool result overriding for isolated replay testing
29
+ - `ReplayDiff` — compare two cassettes and detect changes in tool sequence, output text, token count, cost, and span count
30
+
31
+ #### Mock
32
+ - `MockLLM` — deterministic LLM fake with pattern-based response matching (`"*"` wildcard), token usage simulation, cost tracking, and automatic span recording
33
+ - `MockTool` — configurable tool fake with `.returns()` / `.raises()` / `.side_effect()` control
34
+
35
+ #### Eval scorers (8 built-in assertions)
36
+ - `assert_tool_called(cassette, tool_name, times=None, with_args=None, before=None, after=None)` — verify a tool was invoked, with optional count, arg, and ordering constraints
37
+ - `assert_tool_order(cassette, expected_order, strict=False)` — verify tool call sequence (strict or subsequence mode)
38
+ - `assert_no_tool_called(cassette, tool_name)` — verify a tool was never invoked
39
+ - `assert_output_contains(cassette, substring, case_sensitive=True)` — verify agent output text
40
+ - `assert_output_matches(cassette, pattern)` — verify agent output against a regex pattern
41
+ - `assert_cost_under(cassette, max_usd)` — budget enforcement
42
+ - `assert_latency_under(cassette, max_ms)` — latency enforcement
43
+ - `assert_token_count_under(cassette, max_tokens)` — token budget enforcement
44
+ - `Evaluator` — compose multiple assertions into a single evaluation with aggregate scoring
45
+
46
+ #### Framework adapters (4 adapters)
47
+ - `OpenAIAdapter` — patches the OpenAI Python SDK (`chat.completions.create`, sync and async) to auto-record LLM spans with token usage and cost
48
+ - `AnthropicAdapter` — patches the Anthropic Python SDK (`messages.create`, sync and async) with a built-in pricing table covering all Claude models
49
+ - `LangGraphAdapter` — injects a LangChain callback handler into compiled LangGraph graphs to record node executions, LLM calls, and tool calls
50
+ - `CrewAIAdapter` — instruments a CrewAI `Crew` to capture `kickoff()` timing, per-agent tool calls, task completions, and inter-agent delegation spans
51
+
52
+ #### pytest plugin (`pytest-evalcraft`)
53
+ - Auto-registered via `entry_points` — zero-config activation when evalcraft is installed
54
+ - Fixtures: `capture_context`, `mock_llm`, `mock_tool`, `cassette`, `replay_engine`, `evalcraft_cassette_dir`
55
+ - Markers: `@pytest.mark.evalcraft_cassette(path)`, `@pytest.mark.evalcraft_capture(name, save)`, `@pytest.mark.evalcraft_agent`
56
+ - CLI options: `--cassette-dir DIR`, `--evalcraft-record {none,new,all}`
57
+ - Terminal summary: per-test agent run metrics table (tokens, cost, tools, latency, fingerprint) appended to pytest output
58
+
59
+ #### CLI (`evalcraft`)
60
+ - `evalcraft capture <script>` — run a Python script under capture and save the cassette
61
+ - `evalcraft replay <cassette>` — replay a cassette and display metrics (`--verbose` shows all spans)
62
+ - `evalcraft diff <old> <new>` — compare two cassettes side-by-side (`--json` for machine-readable output)
63
+ - `evalcraft eval <cassette>` — run assertions with `--max-cost`, `--max-tokens`, `--max-latency`, `--tool` flags; exits 1 on failure (CI-friendly)
64
+ - `evalcraft info <cassette>` — inspect cassette metadata, metrics, tool sequence, and spans (`--json` for raw JSON)
65
+ - `evalcraft mock <cassette>` — generate ready-to-use `MockLLM` and `MockTool` Python fixtures from a recorded cassette
66
+
67
+ #### Project infrastructure
68
+ - MIT license
69
+ - Python 3.9–3.13 support
70
+ - Optional dependency groups: `[pytest]`, `[openai]`, `[anthropic]`, `[langchain]`, `[crewai]`, `[all]`, `[dev]`
71
+ - Hatchling build system
72
+ - Ruff linting, mypy strict type checking, pytest-asyncio for async tests
73
+ - GitHub Actions CI and PyPI publish workflows
74
+ - 260 tests
75
+
76
+ [0.1.0]: https://github.com/beyhangl/evalcraft/releases/tag/v0.1.0
@@ -0,0 +1,86 @@
1
+ # Contributing to Evalcraft
2
+
3
+ Thanks for your interest! Here's everything you need to get started.
4
+
5
+ ## Quick setup
6
+
7
+ ```bash
8
+ git clone https://github.com/beyhangl/evalcraft
9
+ cd evalcraft
10
+ pip install -e ".[dev]"
11
+ pytest
12
+ ```
13
+
14
+ ## Before opening a PR
15
+
16
+ - **For bug fixes** — open a PR directly. Include a failing test that your fix resolves.
17
+ - **For new features** — open an issue first to discuss the approach. Significant changes without prior discussion may be declined.
18
+ - **For adapters** (OpenAI, LangChain, etc.) — new framework adapters are welcome; discuss in an issue first.
19
+
20
+ ## Development workflow
21
+
22
+ ```bash
23
+ # Format
24
+ ruff format .
25
+
26
+ # Lint
27
+ ruff check .
28
+
29
+ # Type check
30
+ mypy evalcraft/
31
+
32
+ # Run tests
33
+ pytest
34
+
35
+ # Run tests with coverage
36
+ pytest --cov=evalcraft --cov-report=term-missing
37
+ ```
38
+
39
+ All three checks (format, lint, type check) must pass before a PR is merged. CI enforces this.
40
+
41
+ ## Code conventions
42
+
43
+ - **Python 3.9+** — no syntax or stdlib features above 3.9 unless gated
44
+ - **Line length** — 100 characters (configured in `pyproject.toml`)
45
+ - **Types** — strict mypy; all public functions need type annotations
46
+ - **Tests** — every new feature or bug fix needs a test in `tests/`
47
+ - **Cassette fixtures** — test cassettes live in `tests/cassettes/`
48
+
49
+ ## Adding a new framework adapter
50
+
51
+ 1. Create `evalcraft/adapters/<framework>_adapter.py` (matching the existing naming, e.g. `openai_adapter.py`)
52
+ 2. Add the optional dependency to `pyproject.toml` under `[project.optional-dependencies]`
53
+ 3. Add integration tests under `tests/` (e.g. `tests/test_<framework>_adapter.py`)
54
+ 4. Update the README framework support table
55
+
56
+ ## Project structure
57
+
58
+ ```
59
+ evalcraft/
60
+ ├── core/ # Cassette, Span, data model
61
+ ├── capture/ # CaptureContext, recorder
62
+ ├── replay/ # ReplayEngine, network guard
63
+ ├── eval/scorers/ # assert_tool_called, assert_cost_under, etc.
64
+ ├── mock/ # MockLLM, MockTool
65
+ ├── adapters/ # OpenAI, LangChain, etc.
66
+ ├── cli/ # evalcraft CLI (click)
67
+ └── pytest_plugin/ # pytest fixtures and markers
68
+ tests/
69
+ ├── cassettes/ # Fixture cassettes
70
+ └── ...
71
+ ```
72
+
73
+ ## Commit messages
74
+
75
+ Use conventional commits — `fix:`, `feat:`, `docs:`, `chore:`, `refactor:`. Keep the subject line under 72 chars.
76
+
77
+ ## Releasing (maintainers only)
78
+
79
+ 1. Bump `version` in `pyproject.toml`
80
+ 2. Update the changelog
81
+ 3. Tag: `git tag v0.x.y && git push --tags`
82
+ 4. CI publishes to PyPI automatically on tag push
83
+
84
+ ## License
85
+
86
+ By contributing you agree your code will be released under the [MIT License](LICENSE).
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Beyhan Gül
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.