evalcraft 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. evalcraft-0.1.0/.github/ISSUE_TEMPLATE/bug_report.yml +60 -0
  2. evalcraft-0.1.0/.github/ISSUE_TEMPLATE/config.yml +5 -0
  3. evalcraft-0.1.0/.github/ISSUE_TEMPLATE/feature_request.yml +40 -0
  4. evalcraft-0.1.0/.github/workflows/ci.yml +106 -0
  5. evalcraft-0.1.0/.github/workflows/example-ci-gate.yml +109 -0
  6. evalcraft-0.1.0/.github/workflows/publish.yml +68 -0
  7. evalcraft-0.1.0/.gitignore +46 -0
  8. evalcraft-0.1.0/CHANGELOG.md +76 -0
  9. evalcraft-0.1.0/CONTRIBUTING.md +86 -0
  10. evalcraft-0.1.0/LICENSE +21 -0
  11. evalcraft-0.1.0/PKG-INFO +431 -0
  12. evalcraft-0.1.0/README.md +369 -0
  13. evalcraft-0.1.0/action.yml +379 -0
  14. evalcraft-0.1.0/dashboard/backend/Dockerfile +17 -0
  15. evalcraft-0.1.0/dashboard/backend/alembic/env.py +64 -0
  16. evalcraft-0.1.0/dashboard/backend/alembic/script.py.mako +25 -0
  17. evalcraft-0.1.0/dashboard/backend/alembic/versions/.gitkeep +0 -0
  18. evalcraft-0.1.0/dashboard/backend/alembic.ini +36 -0
  19. evalcraft-0.1.0/dashboard/backend/app/__init__.py +0 -0
  20. evalcraft-0.1.0/dashboard/backend/app/api/__init__.py +0 -0
  21. evalcraft-0.1.0/dashboard/backend/app/api/auth.py +224 -0
  22. evalcraft-0.1.0/dashboard/backend/app/api/cassettes.py +176 -0
  23. evalcraft-0.1.0/dashboard/backend/app/api/golden_sets.py +161 -0
  24. evalcraft-0.1.0/dashboard/backend/app/api/projects.py +96 -0
  25. evalcraft-0.1.0/dashboard/backend/app/api/regressions.py +72 -0
  26. evalcraft-0.1.0/dashboard/backend/app/api/webhooks.py +80 -0
  27. evalcraft-0.1.0/dashboard/backend/app/config.py +41 -0
  28. evalcraft-0.1.0/dashboard/backend/app/database.py +43 -0
  29. evalcraft-0.1.0/dashboard/backend/app/main.py +66 -0
  30. evalcraft-0.1.0/dashboard/backend/app/models/__init__.py +18 -0
  31. evalcraft-0.1.0/dashboard/backend/app/models/cassette.py +46 -0
  32. evalcraft-0.1.0/dashboard/backend/app/models/golden_set.py +33 -0
  33. evalcraft-0.1.0/dashboard/backend/app/models/project.py +30 -0
  34. evalcraft-0.1.0/dashboard/backend/app/models/regression.py +68 -0
  35. evalcraft-0.1.0/dashboard/backend/app/models/user.py +60 -0
  36. evalcraft-0.1.0/dashboard/backend/app/schemas/__init__.py +0 -0
  37. evalcraft-0.1.0/dashboard/backend/app/schemas/api.py +227 -0
  38. evalcraft-0.1.0/dashboard/backend/app/services/alert_service.py +61 -0
  39. evalcraft-0.1.0/dashboard/backend/app/services/analytics_service.py +62 -0
  40. evalcraft-0.1.0/dashboard/backend/app/services/cassette_service.py +78 -0
  41. evalcraft-0.1.0/dashboard/backend/app/services/regression_service.py +88 -0
  42. evalcraft-0.1.0/dashboard/backend/requirements.txt +12 -0
  43. evalcraft-0.1.0/dashboard/frontend/.gitignore +24 -0
  44. evalcraft-0.1.0/dashboard/frontend/README.md +73 -0
  45. evalcraft-0.1.0/dashboard/frontend/eslint.config.js +23 -0
  46. evalcraft-0.1.0/dashboard/frontend/index.html +13 -0
  47. evalcraft-0.1.0/dashboard/frontend/package-lock.json +4281 -0
  48. evalcraft-0.1.0/dashboard/frontend/package.json +35 -0
  49. evalcraft-0.1.0/dashboard/frontend/public/vite.svg +1 -0
  50. evalcraft-0.1.0/dashboard/frontend/src/App.css +42 -0
  51. evalcraft-0.1.0/dashboard/frontend/src/App.tsx +51 -0
  52. evalcraft-0.1.0/dashboard/frontend/src/assets/react.svg +1 -0
  53. evalcraft-0.1.0/dashboard/frontend/src/components/Layout.tsx +54 -0
  54. evalcraft-0.1.0/dashboard/frontend/src/components/MetricCard.tsx +64 -0
  55. evalcraft-0.1.0/dashboard/frontend/src/components/Sidebar.tsx +252 -0
  56. evalcraft-0.1.0/dashboard/frontend/src/components/StatusBadge.tsx +53 -0
  57. evalcraft-0.1.0/dashboard/frontend/src/components/Toast.tsx +80 -0
  58. evalcraft-0.1.0/dashboard/frontend/src/data/mock.ts +143 -0
  59. evalcraft-0.1.0/dashboard/frontend/src/index.css +55 -0
  60. evalcraft-0.1.0/dashboard/frontend/src/main.tsx +10 -0
  61. evalcraft-0.1.0/dashboard/frontend/src/pages/Analytics.tsx +158 -0
  62. evalcraft-0.1.0/dashboard/frontend/src/pages/CassetteDetail.tsx +199 -0
  63. evalcraft-0.1.0/dashboard/frontend/src/pages/Cassettes.tsx +221 -0
  64. evalcraft-0.1.0/dashboard/frontend/src/pages/Dashboard.tsx +233 -0
  65. evalcraft-0.1.0/dashboard/frontend/src/pages/GoldenSets.tsx +196 -0
  66. evalcraft-0.1.0/dashboard/frontend/src/pages/Login.tsx +272 -0
  67. evalcraft-0.1.0/dashboard/frontend/src/pages/Regressions.tsx +148 -0
  68. evalcraft-0.1.0/dashboard/frontend/src/pages/Settings.tsx +193 -0
  69. evalcraft-0.1.0/dashboard/frontend/tsconfig.app.json +28 -0
  70. evalcraft-0.1.0/dashboard/frontend/tsconfig.json +7 -0
  71. evalcraft-0.1.0/dashboard/frontend/tsconfig.node.json +26 -0
  72. evalcraft-0.1.0/dashboard/frontend/vite.config.ts +7 -0
  73. evalcraft-0.1.0/docs/index.md +110 -0
  74. evalcraft-0.1.0/docs/user-guide/adapters/anthropic.md +149 -0
  75. evalcraft-0.1.0/docs/user-guide/adapters/crewai.md +172 -0
  76. evalcraft-0.1.0/docs/user-guide/adapters/langgraph.md +146 -0
  77. evalcraft-0.1.0/docs/user-guide/adapters/openai.md +141 -0
  78. evalcraft-0.1.0/docs/user-guide/capture.md +330 -0
  79. evalcraft-0.1.0/docs/user-guide/changelog.md +90 -0
  80. evalcraft-0.1.0/docs/user-guide/ci-cd.md +267 -0
  81. evalcraft-0.1.0/docs/user-guide/cli.md +483 -0
  82. evalcraft-0.1.0/docs/user-guide/concepts.md +183 -0
  83. evalcraft-0.1.0/docs/user-guide/index.md +62 -0
  84. evalcraft-0.1.0/docs/user-guide/mock.md +454 -0
  85. evalcraft-0.1.0/docs/user-guide/pytest-plugin.md +295 -0
  86. evalcraft-0.1.0/docs/user-guide/quickstart.md +326 -0
  87. evalcraft-0.1.0/docs/user-guide/replay.md +308 -0
  88. evalcraft-0.1.0/docs/user-guide/scorers.md +302 -0
  89. evalcraft-0.1.0/evalcraft/__init__.py +50 -0
  90. evalcraft-0.1.0/evalcraft/adapters/__init__.py +47 -0
  91. evalcraft-0.1.0/evalcraft/adapters/anthropic_adapter.py +298 -0
  92. evalcraft-0.1.0/evalcraft/adapters/autogen_adapter.py +362 -0
  93. evalcraft-0.1.0/evalcraft/adapters/crewai_adapter.py +361 -0
  94. evalcraft-0.1.0/evalcraft/adapters/langgraph_adapter.py +546 -0
  95. evalcraft-0.1.0/evalcraft/adapters/llamaindex_adapter.py +434 -0
  96. evalcraft-0.1.0/evalcraft/adapters/openai_adapter.py +305 -0
  97. evalcraft-0.1.0/evalcraft/alerts/__init__.py +7 -0
  98. evalcraft-0.1.0/evalcraft/alerts/email.py +195 -0
  99. evalcraft-0.1.0/evalcraft/alerts/slack.py +201 -0
  100. evalcraft-0.1.0/evalcraft/alerts/webhook.py +82 -0
  101. evalcraft-0.1.0/evalcraft/capture/__init__.py +0 -0
  102. evalcraft-0.1.0/evalcraft/capture/recorder.py +318 -0
  103. evalcraft-0.1.0/evalcraft/cli/__init__.py +0 -0
  104. evalcraft-0.1.0/evalcraft/cli/init_cmd.py +245 -0
  105. evalcraft-0.1.0/evalcraft/cli/main.py +1128 -0
  106. evalcraft-0.1.0/evalcraft/cli/templates/__init__.py +1 -0
  107. evalcraft-0.1.0/evalcraft/cli/templates/conftest.py +42 -0
  108. evalcraft-0.1.0/evalcraft/cli/templates/evalcraft.toml +40 -0
  109. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_anthropic.py +148 -0
  110. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_crewai.py +138 -0
  111. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_generic.py +155 -0
  112. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_langgraph.py +141 -0
  113. evalcraft-0.1.0/evalcraft/cli/templates/test_agent_openai.py +140 -0
  114. evalcraft-0.1.0/evalcraft/cloud/__init__.py +5 -0
  115. evalcraft-0.1.0/evalcraft/cloud/client.py +361 -0
  116. evalcraft-0.1.0/evalcraft/core/__init__.py +0 -0
  117. evalcraft-0.1.0/evalcraft/core/models.py +329 -0
  118. evalcraft-0.1.0/evalcraft/eval/__init__.py +0 -0
  119. evalcraft-0.1.0/evalcraft/eval/scorers/__init__.py +368 -0
  120. evalcraft-0.1.0/evalcraft/golden/__init__.py +5 -0
  121. evalcraft-0.1.0/evalcraft/golden/manager.py +402 -0
  122. evalcraft-0.1.0/evalcraft/mock/__init__.py +0 -0
  123. evalcraft-0.1.0/evalcraft/mock/llm.py +243 -0
  124. evalcraft-0.1.0/evalcraft/mock/tool.py +213 -0
  125. evalcraft-0.1.0/evalcraft/pytest_plugin/__init__.py +35 -0
  126. evalcraft-0.1.0/evalcraft/pytest_plugin/plugin.py +397 -0
  127. evalcraft-0.1.0/evalcraft/regression/__init__.py +10 -0
  128. evalcraft-0.1.0/evalcraft/regression/detector.py +447 -0
  129. evalcraft-0.1.0/evalcraft/replay/__init__.py +0 -0
  130. evalcraft-0.1.0/evalcraft/replay/engine.py +337 -0
  131. evalcraft-0.1.0/evalcraft/replay/network_guard.py +181 -0
  132. evalcraft-0.1.0/evalcraft/sanitize/__init__.py +5 -0
  133. evalcraft-0.1.0/evalcraft/sanitize/redactor.py +261 -0
  134. evalcraft-0.1.0/examples/anthropic-agent/README.md +109 -0
  135. evalcraft-0.1.0/examples/anthropic-agent/agent.py +255 -0
  136. evalcraft-0.1.0/examples/anthropic-agent/record_cassettes.py +81 -0
  137. evalcraft-0.1.0/examples/anthropic-agent/requirements.txt +3 -0
  138. evalcraft-0.1.0/examples/anthropic-agent/tests/cassettes/auth_middleware_review.json +176 -0
  139. evalcraft-0.1.0/examples/anthropic-agent/tests/cassettes/db_pool_refactor_review.json +169 -0
  140. evalcraft-0.1.0/examples/anthropic-agent/tests/test_code_review_agent.py +288 -0
  141. evalcraft-0.1.0/examples/basic_capture.py +101 -0
  142. evalcraft-0.1.0/examples/ci-pipeline/.github/workflows/eval.yml +193 -0
  143. evalcraft-0.1.0/examples/ci-pipeline/README.md +140 -0
  144. evalcraft-0.1.0/examples/ci-pipeline/evalcraft_gate.py +373 -0
  145. evalcraft-0.1.0/examples/ci-pipeline/requirements.txt +3 -0
  146. evalcraft-0.1.0/examples/ci-pipeline/tests/test_ci_gate.py +306 -0
  147. evalcraft-0.1.0/examples/langgraph-workflow/README.md +119 -0
  148. evalcraft-0.1.0/examples/langgraph-workflow/record_cassettes.py +93 -0
  149. evalcraft-0.1.0/examples/langgraph-workflow/requirements.txt +6 -0
  150. evalcraft-0.1.0/examples/langgraph-workflow/tests/cassettes/equipment_stipend.json +233 -0
  151. evalcraft-0.1.0/examples/langgraph-workflow/tests/cassettes/remote_work_policy.json +233 -0
  152. evalcraft-0.1.0/examples/langgraph-workflow/tests/test_rag_workflow.py +297 -0
  153. evalcraft-0.1.0/examples/langgraph-workflow/workflow.py +254 -0
  154. evalcraft-0.1.0/examples/openai-agent/README.md +127 -0
  155. evalcraft-0.1.0/examples/openai-agent/agent.py +190 -0
  156. evalcraft-0.1.0/examples/openai-agent/build_golden.py +62 -0
  157. evalcraft-0.1.0/examples/openai-agent/record_cassettes.py +98 -0
  158. evalcraft-0.1.0/examples/openai-agent/requirements.txt +3 -0
  159. evalcraft-0.1.0/examples/openai-agent/tests/cassettes/damaged_item.json +149 -0
  160. evalcraft-0.1.0/examples/openai-agent/tests/cassettes/order_tracking.json +149 -0
  161. evalcraft-0.1.0/examples/openai-agent/tests/cassettes/return_request.json +146 -0
  162. evalcraft-0.1.0/examples/openai-agent/tests/test_golden.py +52 -0
  163. evalcraft-0.1.0/examples/openai-agent/tests/test_support_agent.py +234 -0
  164. evalcraft-0.1.0/examples/replay_and_diff.py +272 -0
  165. evalcraft-0.1.0/examples/test_with_mocks.py +305 -0
  166. evalcraft-0.1.0/mkdocs.yml +84 -0
  167. evalcraft-0.1.0/packages/evalcraft-js/package-lock.json +2908 -0
  168. evalcraft-0.1.0/packages/evalcraft-js/package.json +54 -0
  169. evalcraft-0.1.0/packages/evalcraft-js/src/adapters/openai.ts +68 -0
  170. evalcraft-0.1.0/packages/evalcraft-js/src/adapters/vercel-ai.ts +293 -0
  171. evalcraft-0.1.0/packages/evalcraft-js/src/capture/recorder.ts +211 -0
  172. evalcraft-0.1.0/packages/evalcraft-js/src/core/models.ts +394 -0
  173. evalcraft-0.1.0/packages/evalcraft-js/src/core/types.ts +36 -0
  174. evalcraft-0.1.0/packages/evalcraft-js/src/eval/scorers.ts +294 -0
  175. evalcraft-0.1.0/packages/evalcraft-js/src/index.ts +46 -0
  176. evalcraft-0.1.0/packages/evalcraft-js/src/mock/llm.ts +191 -0
  177. evalcraft-0.1.0/packages/evalcraft-js/src/mock/tool.ts +172 -0
  178. evalcraft-0.1.0/packages/evalcraft-js/src/replay/engine.ts +248 -0
  179. evalcraft-0.1.0/packages/evalcraft-js/tests/capture.test.ts +155 -0
  180. evalcraft-0.1.0/packages/evalcraft-js/tests/mock.test.ts +283 -0
  181. evalcraft-0.1.0/packages/evalcraft-js/tests/models.test.ts +210 -0
  182. evalcraft-0.1.0/packages/evalcraft-js/tests/replay.test.ts +195 -0
  183. evalcraft-0.1.0/packages/evalcraft-js/tests/scorers.test.ts +286 -0
  184. evalcraft-0.1.0/packages/evalcraft-js/tests/vercel-ai.test.ts +441 -0
  185. evalcraft-0.1.0/packages/evalcraft-js/tsconfig.json +19 -0
  186. evalcraft-0.1.0/packages/evalcraft-js/vitest.config.ts +9 -0
  187. evalcraft-0.1.0/pyproject.toml +93 -0
  188. evalcraft-0.1.0/site/index.html +995 -0
  189. evalcraft-0.1.0/tests/conftest.py +75 -0
  190. evalcraft-0.1.0/tests/test_alerts.py +469 -0
  191. evalcraft-0.1.0/tests/test_anthropic_adapter.py +440 -0
  192. evalcraft-0.1.0/tests/test_cloud.py +405 -0
  193. evalcraft-0.1.0/tests/test_golden.py +482 -0
  194. evalcraft-0.1.0/tests/test_init.py +442 -0
  195. evalcraft-0.1.0/tests/test_mock_llm.py +285 -0
  196. evalcraft-0.1.0/tests/test_mock_tool.py +317 -0
  197. evalcraft-0.1.0/tests/test_models.py +334 -0
  198. evalcraft-0.1.0/tests/test_network_guard.py +387 -0
  199. evalcraft-0.1.0/tests/test_recorder.py +231 -0
  200. evalcraft-0.1.0/tests/test_regression.py +536 -0
  201. evalcraft-0.1.0/tests/test_replay.py +287 -0
  202. evalcraft-0.1.0/tests/test_sanitize.py +579 -0
  203. evalcraft-0.1.0/tests/test_scorers.py +364 -0
@@ -0,0 +1,60 @@
1
+ name: Bug Report
2
+ description: Something is broken
3
+ labels: ["bug"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for filing a bug. Please fill in as much detail as you can.
9
+
10
+ - type: textarea
11
+ id: description
12
+ attributes:
13
+ label: What happened?
14
+ description: A clear description of the bug.
15
+ validations:
16
+ required: true
17
+
18
+ - type: textarea
19
+ id: reproduction
20
+ attributes:
21
+ label: Minimal reproduction
22
+ description: Code or steps to reproduce the issue.
23
+ render: python
24
+ validations:
25
+ required: true
26
+
27
+ - type: textarea
28
+ id: expected
29
+ attributes:
30
+ label: Expected behavior
31
+ validations:
32
+ required: true
33
+
34
+ - type: textarea
35
+ id: traceback
36
+ attributes:
37
+ label: Error / traceback
38
+ render: text
39
+
40
+ - type: input
41
+ id: version
42
+ attributes:
43
+ label: Evalcraft version
44
+ placeholder: "e.g. 0.1.0"
45
+ validations:
46
+ required: true
47
+
48
+ - type: input
49
+ id: python
50
+ attributes:
51
+ label: Python version
52
+ placeholder: "e.g. 3.11"
53
+ validations:
54
+ required: true
55
+
56
+ - type: input
57
+ id: framework
58
+ attributes:
59
+ label: Framework / adapter (if relevant)
60
+ placeholder: "e.g. OpenAI SDK 1.30, LangChain 0.2"
@@ -0,0 +1,5 @@
1
+ blank_issues_enabled: false
2
+ contact_links:
3
+ - name: Question / Discussion
4
+ url: https://github.com/beyhangl/evalcraft/discussions
5
+ about: Ask questions and discuss ideas in GitHub Discussions.
@@ -0,0 +1,40 @@
1
+ name: Feature Request
2
+ description: Propose a new feature or improvement
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: textarea
6
+ id: problem
7
+ attributes:
8
+ label: What problem does this solve?
9
+ description: Describe the use case or pain point.
10
+ validations:
11
+ required: true
12
+
13
+ - type: textarea
14
+ id: solution
15
+ attributes:
16
+ label: Proposed solution
17
+ description: What should the API / behavior look like? Code sketches welcome.
18
+ validations:
19
+ required: true
20
+
21
+ - type: textarea
22
+ id: alternatives
23
+ attributes:
24
+ label: Alternatives considered
25
+ description: Other approaches you considered and why you prefer this one.
26
+
27
+ - type: dropdown
28
+ id: scope
29
+ attributes:
30
+ label: Scope
31
+ options:
32
+ - Core (capture / replay / assertions)
33
+ - Mock LLM / Mock Tools
34
+ - Framework adapter (OpenAI, LangChain, etc.)
35
+ - CLI
36
+ - pytest plugin
37
+ - Docs / examples
38
+ - Other
39
+ validations:
40
+ required: true
@@ -0,0 +1,106 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ # ── Python SDK ────────────────────────────────────────────────────────
11
+ test-python:
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12"]
16
+
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+
20
+ - name: Set up Python ${{ matrix.python-version }}
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: ${{ matrix.python-version }}
24
+
25
+ - name: Install dependencies
26
+ run: pip install -e ".[dev]"
27
+
28
+ - name: Run tests
29
+ run: pytest tests/ -v --tb=short
30
+
31
+ - name: Check imports
32
+ run: python -c "from evalcraft import capture, replay, MockLLM, MockTool, assert_tool_called; print('OK')"
33
+
34
+ build-python:
35
+ runs-on: ubuntu-latest
36
+ needs: test-python
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - name: Set up Python
41
+ uses: actions/setup-python@v5
42
+ with:
43
+ python-version: "3.12"
44
+
45
+ - name: Install build tools
46
+ run: pip install build
47
+
48
+ - name: Build package
49
+ run: python -m build
50
+
51
+ - name: Check dist
52
+ run: |
53
+ pip install twine
54
+ twine check dist/*
55
+
56
+ # ── TypeScript SDK ────────────────────────────────────────────────────
57
+ test-typescript:
58
+ runs-on: ubuntu-latest
59
+ strategy:
60
+ matrix:
61
+ node-version: ["18", "20", "22"]
62
+
63
+ defaults:
64
+ run:
65
+ working-directory: packages/evalcraft-js
66
+
67
+ steps:
68
+ - uses: actions/checkout@v4
69
+
70
+ - name: Set up Node ${{ matrix.node-version }}
71
+ uses: actions/setup-node@v4
72
+ with:
73
+ node-version: ${{ matrix.node-version }}
74
+
75
+ - name: Install dependencies
76
+ run: npm ci
77
+
78
+ - name: Type check
79
+ run: npx tsc --noEmit
80
+
81
+ - name: Run tests
82
+ run: npx vitest run
83
+
84
+ # ── Dashboard Frontend ────────────────────────────────────────────────
85
+ build-frontend:
86
+ runs-on: ubuntu-latest
87
+ defaults:
88
+ run:
89
+ working-directory: dashboard/frontend
90
+
91
+ steps:
92
+ - uses: actions/checkout@v4
93
+
94
+ - name: Set up Node
95
+ uses: actions/setup-node@v4
96
+ with:
97
+ node-version: "20"
98
+
99
+ - name: Install dependencies
100
+ run: npm ci
101
+
102
+ - name: Type check
103
+ run: npx tsc -b --noEmit
104
+
105
+ - name: Build
106
+ run: npx vite build
@@ -0,0 +1,109 @@
1
+ # Example: Evalcraft CI Gate
2
+ #
3
+ # Copy this workflow to your repo's .github/workflows/ directory and adapt it
4
+ # to your project. Teams using evalcraft can gate PRs on agent test results —
5
+ # blocking merges when tests fail or cost/regression thresholds are exceeded.
6
+ #
7
+ # For full documentation see:
8
+ # https://github.com/beyhangl/evalcraft/blob/main/docs/user-guide/ci-cd.md
9
+
10
+ name: Agent Tests (Evalcraft CI Gate)
11
+
12
+ on:
13
+ pull_request:
14
+ branches: [main, master]
15
+ push:
16
+ branches: [main, master]
17
+
18
+ # Only allow one run per PR branch at a time; cancel superseded runs.
19
+ concurrency:
20
+ group: evalcraft-${{ github.ref }}
21
+ cancel-in-progress: true
22
+
23
+ jobs:
24
+ # ── Replay-only gate (fast, no API keys required) ────────────────────────────
25
+ #
26
+ # Runs your agent tests against committed cassettes. Fails if any test fails
27
+ # or if total cost / token regressions exceed your thresholds.
28
+ #
29
+ # This is the recommended default for pull request checks.
30
+ agent-tests:
31
+ name: Agent Tests
32
+ runs-on: ubuntu-latest
33
+ permissions:
34
+ contents: read # read cassette files
35
+ pull-requests: write # post results comment
36
+
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - uses: beyhangl/evalcraft@v0.1.0
41
+ with:
42
+ # Path(s) to your agent test files
43
+ test-path: tests/agent_tests/
44
+
45
+ # Cassette directory (must match your pytest --cassette-dir setting)
46
+ cassette-dir: tests/cassettes/
47
+
48
+ # Fail the PR if total agent cost exceeds this value (USD)
49
+ max-cost: '0.10'
50
+
51
+ # Fail the PR if any cassette metric increases by more than this %
52
+ # relative to the baseline cassettes committed in git.
53
+ # Only meaningful when record-mode is 'new' or 'all'.
54
+ max-regression: '5%'
55
+
56
+ # Replay recorded cassettes without making real LLM calls (default).
57
+ # Change to 'new' or 'all' to re-record during CI (requires API keys).
58
+ record-mode: none
59
+
60
+ # Post a results table as a PR comment (requires pull-requests: write)
61
+ post-comment: 'true'
62
+
63
+ # Pin to a specific evalcraft version for reproducible CI
64
+ # evalcraft-version: '0.1.0'
65
+
66
+ # The action exposes outputs you can use in subsequent steps:
67
+ # - passed (true/false)
68
+ # - total-cost (e.g. "0.0423")
69
+ # - total-tokens (e.g. "1840")
70
+ # - total-tool-calls
71
+ # - cassette-count
72
+
73
+
74
+ # ── Optional: nightly re-record job ─────────────────────────────────────────
75
+ #
76
+ # Runs on a schedule with real LLM calls, re-records cassettes, and opens a
77
+ # PR if any cassette changed. Requires your LLM API key as a secret.
78
+ #
79
+ # Uncomment and adapt if you want automated cassette refresh.
80
+
81
+ # nightly-record:
82
+ # name: Nightly cassette refresh
83
+ # runs-on: ubuntu-latest
84
+ # if: github.event_name == 'schedule'
85
+ # permissions:
86
+ # contents: write
87
+ # pull-requests: write
88
+ #
89
+ # steps:
90
+ # - uses: actions/checkout@v4
91
+ #
92
+ # - uses: beyhangl/evalcraft@v0.1.0
93
+ # env:
94
+ # OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
95
+ # # or ANTHROPIC_API_KEY, etc.
96
+ # with:
97
+ # test-path: tests/agent_tests/
98
+ # cassette-dir: tests/cassettes/
99
+ # record-mode: all # re-record every cassette
100
+ # max-regression: '10%' # allow up to 10% drift from last recording
101
+ # post-comment: 'false'
102
+ #
103
+ # - name: Open PR if cassettes changed
104
+ # uses: peter-evans/create-pull-request@v6
105
+ # with:
106
+ # commit-message: 'chore: refresh evalcraft cassettes'
107
+ # title: '🤖 Cassette refresh — nightly agent test update'
108
+ # branch: cassette-refresh/nightly
109
+ # labels: cassettes, automated
@@ -0,0 +1,68 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Install dependencies
20
+ run: pip install -e ".[dev]"
21
+
22
+ - name: Run tests
23
+ run: pytest tests/ -v --tb=short
24
+
25
+ - name: Check imports
26
+ run: python -c "from evalcraft import capture, replay, MockLLM, MockTool, assert_tool_called; print('OK')"
27
+
28
+ build:
29
+ runs-on: ubuntu-latest
30
+ needs: test
31
+ steps:
32
+ - uses: actions/checkout@v4
33
+
34
+ - name: Set up Python
35
+ uses: actions/setup-python@v5
36
+ with:
37
+ python-version: "3.12"
38
+
39
+ - name: Install build tools
40
+ run: pip install build twine
41
+
42
+ - name: Build package
43
+ run: python -m build
44
+
45
+ - name: Validate distributions
46
+ run: twine check dist/*
47
+
48
+ - name: Upload dist artifacts
49
+ uses: actions/upload-artifact@v4
50
+ with:
51
+ name: dist
52
+ path: dist/
53
+
54
+ publish:
55
+ runs-on: ubuntu-latest
56
+ needs: build
57
+ environment: pypi
58
+ permissions:
59
+ id-token: write
60
+ steps:
61
+ - name: Download dist artifacts
62
+ uses: actions/download-artifact@v4
63
+ with:
64
+ name: dist
65
+ path: dist/
66
+
67
+ - name: Publish to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,46 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ *.whl
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+ *.swo
22
+ *~
23
+
24
+ # Testing
25
+ .pytest_cache/
26
+ .coverage
27
+ htmlcov/
28
+ .mypy_cache/
29
+
30
+ # OS
31
+ .DS_Store
32
+ Thumbs.db
33
+
34
+ # Node
35
+ node_modules/
36
+ .npm
37
+ .pnpm-store/
38
+
39
+ # Cassettes (user-generated, not tracked)
40
+ tests/cassettes/*.json
41
+ !tests/cassettes/.gitkeep
42
+
43
+ # Internal docs (strategy, launch content — not public)
44
+ docs/strategy/
45
+ docs/launch/
46
+ docs/blog/
@@ -0,0 +1,76 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] — 2026-03-05
9
+
10
+ Initial public release of Evalcraft — the pytest for AI agents.
11
+
12
+ ### Added
13
+
14
+ #### Core data model
15
+ - `Span` — atomic unit of capture, recording every LLM call, tool invocation, agent step, user input, and output with timing, token usage, and cost metadata
16
+ - `Cassette` — the fundamental recording unit that stores all spans from a single agent execution; supports fingerprinting for change detection, aggregate metrics, and JSON serialization/deserialization
17
+ - `AgentRun` — wrapper for live or replayed agent results
18
+ - `EvalResult` / `AssertionResult` — structured pass/fail results for assertions with score tracking
19
+ - `SpanKind` enum: `llm_request`, `llm_response`, `tool_call`, `tool_result`, `agent_step`, `user_input`, `agent_output`
20
+ - `TokenUsage` dataclass tracking prompt, completion, and total tokens
21
+
22
+ #### Capture
23
+ - `capture()` context manager — instrument any code block to record spans into a cassette
24
+ - `CaptureContext` — configurable capture session with name, agent name, framework tag, and optional auto-save path
25
+
26
+ #### Replay
27
+ - `ReplayEngine` — feeds recorded LLM responses back without making real API calls
28
+ - Tool result overriding for isolated replay testing
29
+ - `ReplayDiff` — compare two cassettes and detect changes in tool sequence, output text, token count, cost, and span count
30
+
31
+ #### Mock
32
+ - `MockLLM` — deterministic LLM fake with pattern-based response matching (`"*"` wildcard), token usage simulation, cost tracking, and automatic span recording
33
+ - `MockTool` — configurable tool fake with `.returns()` / `.raises()` / `.side_effect()` control
34
+
35
+ #### Eval scorers (8 built-in assertions)
36
+ - `assert_tool_called(cassette, tool_name, times=None, with_args=None, before=None, after=None)` — verify a tool was invoked, with optional count, arg, and ordering constraints
37
+ - `assert_tool_order(cassette, expected_order, strict=False)` — verify tool call sequence (strict or subsequence mode)
38
+ - `assert_no_tool_called(cassette, tool_name)` — verify a tool was never invoked
39
+ - `assert_output_contains(cassette, substring, case_sensitive=True)` — verify agent output text
40
+ - `assert_output_matches(cassette, pattern)` — verify agent output against a regex pattern
41
+ - `assert_cost_under(cassette, max_usd)` — budget enforcement
42
+ - `assert_latency_under(cassette, max_ms)` — latency enforcement
43
+ - `assert_token_count_under(cassette, max_tokens)` — token budget enforcement
44
+ - `Evaluator` — compose multiple assertions into a single evaluation with aggregate scoring
45
+
46
+ #### Framework adapters (4 adapters)
47
+ - `OpenAIAdapter` — patches the OpenAI Python SDK (`chat.completions.create`, sync and async) to auto-record LLM spans with token usage and cost
48
+ - `AnthropicAdapter` — patches the Anthropic Python SDK (`messages.create`, sync and async) with a built-in pricing table covering all Claude models
49
+ - `LangGraphAdapter` — injects a LangChain callback handler into compiled LangGraph graphs to record node executions, LLM calls, and tool calls
50
+ - `CrewAIAdapter` — instruments a CrewAI `Crew` to capture `kickoff()` timing, per-agent tool calls, task completions, and inter-agent delegation spans
51
+
52
+ #### pytest plugin (`pytest-evalcraft`)
53
+ - Auto-registered via `entry_points` — zero-config activation when evalcraft is installed
54
+ - Fixtures: `capture_context`, `mock_llm`, `mock_tool`, `cassette`, `replay_engine`, `evalcraft_cassette_dir`
55
+ - Markers: `@pytest.mark.evalcraft_cassette(path)`, `@pytest.mark.evalcraft_capture(name, save)`, `@pytest.mark.evalcraft_agent`
56
+ - CLI options: `--cassette-dir DIR`, `--evalcraft-record {none,new,all}`
57
+ - Terminal summary: per-test agent run metrics table (tokens, cost, tools, latency, fingerprint) appended to pytest output
58
+
59
+ #### CLI (`evalcraft`)
60
+ - `evalcraft capture <script>` — run a Python script under capture and save the cassette
61
+ - `evalcraft replay <cassette>` — replay a cassette and display metrics (`--verbose` shows all spans)
62
+ - `evalcraft diff <old> <new>` — compare two cassettes side-by-side (`--json` for machine-readable output)
63
+ - `evalcraft eval <cassette>` — run assertions with `--max-cost`, `--max-tokens`, `--max-latency`, `--tool` flags; exits 1 on failure (CI-friendly)
64
+ - `evalcraft info <cassette>` — inspect cassette metadata, metrics, tool sequence, and spans (`--json` for raw JSON)
65
+ - `evalcraft mock <cassette>` — generate ready-to-use `MockLLM` and `MockTool` Python fixtures from a recorded cassette
66
+
67
+ #### Project infrastructure
68
+ - MIT license
69
+ - Python 3.9–3.13 support
70
+ - Optional dependency groups: `[pytest]`, `[openai]`, `[anthropic]`, `[langchain]`, `[crewai]`, `[all]`, `[dev]`
71
+ - Hatchling build system
72
+ - Ruff linting, mypy strict type checking, pytest-asyncio for async tests
73
+ - GitHub Actions CI and PyPI publish workflows
74
+ - 260 tests
75
+
76
+ [0.1.0]: https://github.com/beyhangl/evalcraft/releases/tag/v0.1.0
@@ -0,0 +1,86 @@
1
+ # Contributing to Evalcraft
2
+
3
+ Thanks for your interest! Here's everything you need to get started.
4
+
5
+ ## Quick setup
6
+
7
+ ```bash
8
+ git clone https://github.com/beyhangl/evalcraft
9
+ cd evalcraft
10
+ pip install -e ".[dev]"
11
+ pytest
12
+ ```
13
+
14
+ ## Before opening a PR
15
+
16
+ - **For bug fixes** — open a PR directly. Include a failing test that your fix resolves.
17
+ - **For new features** — open an issue first to discuss the approach. Significant changes without prior discussion may be declined.
18
+ - **For adapters** (OpenAI, LangChain, etc.) — new framework adapters are welcome; discuss in an issue first.
19
+
20
+ ## Development workflow
21
+
22
+ ```bash
23
+ # Format
24
+ ruff format .
25
+
26
+ # Lint
27
+ ruff check .
28
+
29
+ # Type check
30
+ mypy evalcraft/
31
+
32
+ # Run tests
33
+ pytest
34
+
35
+ # Run tests with coverage
36
+ pytest --cov=evalcraft --cov-report=term-missing
37
+ ```
38
+
39
+ All three checks (format, lint, type check) must pass before a PR is merged. CI enforces this.
40
+
41
+ ## Code conventions
42
+
43
+ - **Python 3.9+** — no syntax or stdlib features above 3.9 unless gated
44
+ - **Line length** — 100 characters (configured in `pyproject.toml`)
45
+ - **Types** — strict mypy; all public functions need type annotations
46
+ - **Tests** — every new feature or bug fix needs a test in `tests/`
47
+ - **Cassette fixtures** — test cassettes live in `tests/cassettes/`
48
+
49
+ ## Adding a new framework adapter
50
+
51
+ 1. Create `evalcraft/adapters/<framework>_adapter.py` (matching the existing naming, e.g. `openai_adapter.py`)
52
+ 2. Add the optional dependency to `pyproject.toml` under `[project.optional-dependencies]`
53
+ 3. Add integration tests under `tests/` (e.g. `tests/test_<framework>_adapter.py`)
54
+ 4. Update the README framework support table
55
+
56
+ ## Project structure
57
+
58
+ ```
59
+ evalcraft/
60
+ ├── core/ # Cassette, Span, data model
61
+ ├── capture/ # CaptureContext, recorder
62
+ ├── replay/ # ReplayEngine, network guard
63
+ ├── eval/scorers/ # assert_tool_called, assert_cost_under, etc.
64
+ ├── mock/ # MockLLM, MockTool
65
+ ├── adapters/ # OpenAI, LangChain, etc.
66
+ ├── cli/ # evalcraft CLI (click)
67
+ └── pytest_plugin/ # pytest fixtures and markers
68
+ tests/
69
+ ├── cassettes/ # Fixture cassettes
70
+ └── ...
71
+ ```
72
+
73
+ ## Commit messages
74
+
75
+ Use conventional commits — `fix:`, `feat:`, `docs:`, `chore:`, `refactor:`. Keep the subject line under 72 chars.
76
+
77
+ ## Releasing (maintainers only)
78
+
79
+ 1. Bump `version` in `pyproject.toml`
80
+ 2. Update the changelog
81
+ 3. Tag: `git tag v0.x.y && git push --tags`
82
+ 4. CI publishes to PyPI automatically on tag push
83
+
84
+ ## License
85
+
86
+ By contributing you agree your code will be released under the [MIT License](LICENSE).
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Beyhan Gül
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.