pytest-assay 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_assay-0.1.0/.claude/CLAUDE.md +120 -0
- pytest_assay-0.1.0/.claude/README.md +4 -0
- pytest_assay-0.1.0/.claude/plans/.gitkeep +0 -0
- pytest_assay-0.1.0/.claude/plans/custom_evaluators.plan.md +79 -0
- pytest_assay-0.1.0/.claude/plans/remove_generator.plan.md +52 -0
- pytest_assay-0.1.0/.claude/rules/general.md +5 -0
- pytest_assay-0.1.0/.claude/rules/python-testing.md +79 -0
- pytest_assay-0.1.0/.claude/rules/python.md +305 -0
- pytest_assay-0.1.0/.claude/settings.json +71 -0
- pytest_assay-0.1.0/.codecov.yaml +9 -0
- pytest_assay-0.1.0/.coderabbit.yaml +225 -0
- pytest_assay-0.1.0/.cursor/README.md +21 -0
- pytest_assay-0.1.0/.cursor/mcp.json.example +18 -0
- pytest_assay-0.1.0/.cursor/plans/.gitkeep +0 -0
- pytest_assay-0.1.0/.cursor/rules/cursor-rules.mdc +67 -0
- pytest_assay-0.1.0/.cursor/rules/general.mdc +43 -0
- pytest_assay-0.1.0/.cursor/rules/python-testing.mdc +69 -0
- pytest_assay-0.1.0/.cursor/rules/python.mdc +309 -0
- pytest_assay-0.1.0/.cursor/settings.json +11 -0
- pytest_assay-0.1.0/.cursorignore +13 -0
- pytest_assay-0.1.0/.devcontainer/Dockerfile +46 -0
- pytest_assay-0.1.0/.devcontainer/README.md +37 -0
- pytest_assay-0.1.0/.devcontainer/devcontainer.json +141 -0
- pytest_assay-0.1.0/.env.example +4 -0
- pytest_assay-0.1.0/.github/copilot-instructions.md +158 -0
- pytest_assay-0.1.0/.github/dependabot.yml +11 -0
- pytest_assay-0.1.0/.github/images/banner_pytest-assay.png +0 -0
- pytest_assay-0.1.0/.github/workflows/build.yaml +128 -0
- pytest_assay-0.1.0/.gitignore +21 -0
- pytest_assay-0.1.0/.mcp.json.example +20 -0
- pytest_assay-0.1.0/.vscode/README.md +14 -0
- pytest_assay-0.1.0/.vscode/extensions.json +40 -0
- pytest_assay-0.1.0/.vscode/mcp.json +16 -0
- pytest_assay-0.1.0/.vscode/settings.json +111 -0
- pytest_assay-0.1.0/LICENSE +21 -0
- pytest_assay-0.1.0/PKG-INFO +95 -0
- pytest_assay-0.1.0/README.md +58 -0
- pytest_assay-0.1.0/pyproject.toml +152 -0
- pytest_assay-0.1.0/src/pytest_assay/__init__.py +5 -0
- pytest_assay-0.1.0/src/pytest_assay/config.py +25 -0
- pytest_assay-0.1.0/src/pytest_assay/evaluators/README.md +5 -0
- pytest_assay-0.1.0/src/pytest_assay/evaluators/__init__.py +6 -0
- pytest_assay-0.1.0/src/pytest_assay/evaluators/bradleyterry.py +551 -0
- pytest_assay-0.1.0/src/pytest_assay/evaluators/pairwise.py +191 -0
- pytest_assay-0.1.0/src/pytest_assay/logger.py +23 -0
- pytest_assay-0.1.0/src/pytest_assay/models.py +52 -0
- pytest_assay-0.1.0/src/pytest_assay/plugin.py +312 -0
- pytest_assay-0.1.0/src/pytest_assay/py.typed +0 -0
- pytest_assay-0.1.0/tests/README.md +5 -0
- pytest_assay-0.1.0/tests/__init__.py +0 -0
- pytest_assay-0.1.0/tests/_ollama.py +5 -0
- pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_bradleyterryevaluator.json +96 -0
- pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_bradleyterryevaluator.readout.json +30 -0
- pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_pairwiseevaluator.json +96 -0
- pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_pairwiseevaluator.readout.json +30 -0
- pytest_assay-0.1.0/tests/conftest.py +62 -0
- pytest_assay-0.1.0/tests/evaluators/__init__.py +0 -0
- pytest_assay-0.1.0/tests/evaluators/conftest.py +87 -0
- pytest_assay-0.1.0/tests/evaluators/test_bradleyterry.py +531 -0
- pytest_assay-0.1.0/tests/evaluators/test_pairwise.py +309 -0
- pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_bradleyterryevaluator.json +97 -0
- pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_bradleyterryevaluator.readout.json +30 -0
- pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_pairwiseevaluator.json +97 -0
- pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_pairwiseevaluator.readout.json +30 -0
- pytest_assay-0.1.0/tests/test_config.py +72 -0
- pytest_assay-0.1.0/tests/test_logger.py +33 -0
- pytest_assay-0.1.0/tests/test_models.py +191 -0
- pytest_assay-0.1.0/tests/test_plugin.py +947 -0
- pytest_assay-0.1.0/tests/test_plugin_integration.py +155 -0
- pytest_assay-0.1.0/uv.lock +3290 -0
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
Pytest-assay is a framework for the evaluation of Pydantic AI agents. By adding the `@pytest.mark.assay` decorator to a test, you can run an assay resulting in a readout report. The assay compares the current agent responses against previously recorded baseline responses, e.g. from the main branch. The implementation is based on pytest hooks which capture `Agent.run()` responses.
|
|
8
|
+
|
|
9
|
+
## Development Commands
|
|
10
|
+
|
|
11
|
+
### Environment Setup
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Install Ollama (required for local models)
|
|
15
|
+
# See https://ollama.com for installation instructions
|
|
16
|
+
|
|
17
|
+
# Pull the default model
|
|
18
|
+
ollama pull qwen3:8b
|
|
19
|
+
|
|
20
|
+
# Install dependencies
|
|
21
|
+
uv sync
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Testing
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Run all tests
|
|
28
|
+
uv run pytest
|
|
29
|
+
|
|
30
|
+
# Run tests with verbose output
|
|
31
|
+
uv run pytest -v
|
|
32
|
+
|
|
33
|
+
# Run specific test file
|
|
34
|
+
uv run pytest tests/test_plugin.py
|
|
35
|
+
|
|
36
|
+
# Run specific test
|
|
37
|
+
uv run pytest tests/test_plugin_integration.py::test_integration_pairwiseevaluator
|
|
38
|
+
|
|
39
|
+
# Run tests in parallel
|
|
40
|
+
uv run pytest -n auto
|
|
41
|
+
|
|
42
|
+
# Run tests with coverage report
|
|
43
|
+
uv run pytest --cov=src/pytest_assay --cov-report=term-missing
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Code Quality
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Format code
|
|
50
|
+
uv run ruff format .
|
|
51
|
+
|
|
52
|
+
# Check and fix linting issues (ALWAYS run with --fix)
|
|
53
|
+
uv run ruff check --fix .
|
|
54
|
+
|
|
55
|
+
# Type checking (ALWAYS run after code changes)
|
|
56
|
+
uv run pyright .
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Before Committing
|
|
60
|
+
|
|
61
|
+
Run these checks:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# 1. Format code
|
|
65
|
+
uv run ruff format .
|
|
66
|
+
|
|
67
|
+
# 2. Check and fix linting issues
|
|
68
|
+
uv run ruff check --fix .
|
|
69
|
+
|
|
70
|
+
# 3. Type checking
|
|
71
|
+
uv run pyright .
|
|
72
|
+
|
|
73
|
+
# 4. Run tests
|
|
74
|
+
uv run pytest -n auto
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## MCP Servers
|
|
78
|
+
|
|
79
|
+
This project uses Model Context Protocol (MCP) servers to extend AI capabilities. These are automatically invoked when relevant.
|
|
80
|
+
|
|
81
|
+
### Context7 Documentation Server
|
|
82
|
+
|
|
83
|
+
**When to use:**
|
|
84
|
+
- Looking up library documentation (e.g., "How do I use pydantic-ai streaming?")
|
|
85
|
+
- Checking API references for dependencies
|
|
86
|
+
- Finding code examples from official docs
|
|
87
|
+
- Verifying correct usage of third-party packages
|
|
88
|
+
|
|
89
|
+
**Examples:**
|
|
90
|
+
- "What's the latest pydantic-ai agent syntax?"
|
|
91
|
+
- "Show me httpx async client examples"
|
|
92
|
+
- "How do I configure pytest-asyncio?"
|
|
93
|
+
|
|
94
|
+
### GitHub Repository Server
|
|
95
|
+
|
|
96
|
+
**When to use:**
|
|
97
|
+
- Checking open/closed issues in this repository
|
|
98
|
+
- Reviewing pull requests and their status
|
|
99
|
+
- Reading issue comments and discussions
|
|
100
|
+
- Finding related issues or PRs
|
|
101
|
+
- Understanding project history and decisions
|
|
102
|
+
|
|
103
|
+
**Examples:**
|
|
104
|
+
- "What are the open issues about curiosity?"
|
|
105
|
+
- "Show me recent PRs related to PDF support"
|
|
106
|
+
- "Are there any issues about MLX integration?"
|
|
107
|
+
- "What's the status of issue #13?"
|
|
108
|
+
|
|
109
|
+
### Best Practices
|
|
110
|
+
|
|
111
|
+
- **Be specific:** "Check issue #15" is better than "check issues"
|
|
112
|
+
- **Context first:** Read codebase before checking issues
|
|
113
|
+
- **Combine sources:** Use Context7 for "how to use X" and GitHub for "what's our approach to X"
|
|
114
|
+
|
|
115
|
+
## Coding Standards
|
|
116
|
+
|
|
117
|
+
See the rules files for detailed coding standards:
|
|
118
|
+
|
|
119
|
+
- `.claude/rules/python.md` - Python coding standards, type hints, async patterns
|
|
120
|
+
- `.claude/rules/python-testing.md` - Testing conventions, markers, coverage requirements
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
If read-write file system access is configured for Claude Code via `/allowed-tools`, it is advised to run [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) in a dev container. For more details, refer to the [Dev Container](../.devcontainer/README.md) documentation.
|
|
2
|
+
|
|
3
|
+
## MCP servers
|
|
4
|
+
MCP servers are configured in the `./.mcp.json` file. The file cannot be moved to `.claude/mcp.json`.
|
|
File without changes
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Custom Evaluators Design Plan
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
A user writing `MyCustomEvaluator` would need to do:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from pytest_assay.plugin import AGENT_RESPONSES_KEY, BASELINE_DATASET_KEY
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
These stash keys are **internal implementation details** -- not exported in `__all__`, and they couple any custom evaluator directly to the plugin's storage mechanism. The `Evaluator` protocol receiving a raw `pytest.Item` is the root cause: it forces evaluators to know *how* to extract data rather than just *receiving* it.
|
|
12
|
+
|
|
13
|
+
## Design: Pass an `EvaluatorInput` Instead of `Item`
|
|
14
|
+
|
|
15
|
+
Introduce a data class that the plugin constructs and passes to evaluators, replacing the raw `Item`:
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
# models.py
|
|
19
|
+
class EvaluatorInput(BaseModel):
|
|
20
|
+
baseline_dataset: Dataset | None
|
|
21
|
+
agent_responses: list[AgentRunResult[Any]]
|
|
22
|
+
|
|
23
|
+
class Evaluator(Protocol):
|
|
24
|
+
def __call__(self, input: EvaluatorInput) -> Coroutine[Any, Any, Readout]: ...
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
The plugin's `_run_evaluation` would change from:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
readout = asyncio.run(evaluator(item))
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
to:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
eval_input = EvaluatorInput(
|
|
37
|
+
baseline_dataset=item.stash.get(BASELINE_DATASET_KEY, None),
|
|
38
|
+
agent_responses=item.stash.get(AGENT_RESPONSES_KEY, []),
|
|
39
|
+
)
|
|
40
|
+
readout = asyncio.run(evaluator(eval_input))
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The stash keys stay private. Evaluators receive exactly the data they need.
|
|
44
|
+
|
|
45
|
+
## What Changes
|
|
46
|
+
|
|
47
|
+
| Component | Change |
|
|
48
|
+
|---|---|
|
|
49
|
+
| `Evaluator` protocol | `__call__(self, input: EvaluatorInput)` instead of `__call__(self, item: Item)` |
|
|
50
|
+
| `_run_evaluation` in plugin.py | Build `EvaluatorInput` from stash, pass it |
|
|
51
|
+
| `PairwiseEvaluator.__call__` | Accept `EvaluatorInput`, drop the stash key import |
|
|
52
|
+
| `BradleyTerryEvaluator.__call__` | Accept `EvaluatorInput`, drop the stash key import |
|
|
53
|
+
| Public exports | Add `EvaluatorInput`, `Evaluator`, `Readout` to `__all__` |
|
|
54
|
+
|
|
55
|
+
## What a Custom Evaluator Looks Like
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from pytest_assay.models import Evaluator, EvaluatorInput, Readout
|
|
59
|
+
|
|
60
|
+
class MyCustomEvaluator:
|
|
61
|
+
async def __call__(self, input: EvaluatorInput) -> Readout:
|
|
62
|
+
# input.baseline_dataset has the baseline cases
|
|
63
|
+
# input.agent_responses has the captured AgentRunResult objects
|
|
64
|
+
score = my_scoring_logic(input.baseline_dataset, input.agent_responses)
|
|
65
|
+
return Readout(passed=score > 0.5, details={"score": score})
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
No internal imports needed. Fully decoupled from pytest stash mechanics.
|
|
69
|
+
|
|
70
|
+
## Why Not Alternatives
|
|
71
|
+
|
|
72
|
+
- **Export the stash keys as public API**: Leaks an implementation detail. If you ever change how data flows (e.g. stop using stash), every custom evaluator breaks.
|
|
73
|
+
- **Accessor helper functions** (`get_baseline(item)`, `get_responses(item)`): Better than raw keys, but still couples evaluators to `pytest.Item`. Harder to unit-test evaluators in isolation.
|
|
74
|
+
- **The `EvaluatorInput` approach**: Evaluators become plain async functions on data. Easy to test, no pytest dependency, no internal imports. The plugin owns the extraction logic.
|
|
75
|
+
|
|
76
|
+
## Open Questions
|
|
77
|
+
|
|
78
|
+
1. Should `EvaluatorInput` also carry metadata (test name, assay path, mode) so evaluators can customize behavior per-test?
|
|
79
|
+
2. `AgentRunResult` comes from `pydantic-ai` -- is that an acceptable public API surface, or should responses be further abstracted (e.g. to plain strings)?
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Design Analysis: `generator` in `@pytest.mark.assay` vs. External Dataset Creation
|
|
2
|
+
|
|
3
|
+
## Current Design
|
|
4
|
+
|
|
5
|
+
The `generator` callable is passed as a marker kwarg. The plugin calls it inside `pytest_runtest_setup` only if no serialized dataset file exists yet. The resulting `Dataset` is wrapped into an `AssayContext` and injected into the test via `item.funcargs["context"]`.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Option A: Keep `generator` in the Marker (current)
|
|
10
|
+
|
|
11
|
+
**Pros:**
|
|
12
|
+
- **Declarative, self-contained test definition.** The marker is a single source of truth: evaluator *and* data source live together. You can read the decorator and understand the full assay without chasing fixtures.
|
|
13
|
+
- **Lazy, conditional generation.** The plugin only calls the generator when no baseline file exists. This logic is invisible to the test author -- they don't need `if not path.exists()` boilerplate.
|
|
14
|
+
- **Symmetry with `evaluator`.** Both the data source and the evaluation strategy are declared in the same place. Moving one out breaks that symmetry.
|
|
15
|
+
|
|
16
|
+
**Cons:**
|
|
17
|
+
- **Generator signature is constrained.** It must be a zero-argument callable returning `Dataset`. If a generator needs runtime information (e.g. fixtures, config, parametrize values), there's no clean way to pass it.
|
|
18
|
+
- **Magic injection of `context`.** The test receives `context: AssayContext` as a parameter, but it's not a real pytest fixture -- it's stuffed into `funcargs` by a hook. This is surprising to pytest users who expect fixtures to be defined via `@pytest.fixture` or `conftest.py`.
|
|
19
|
+
- **Harder to share datasets.** Two tests that share the same dataset must both reference the same generator function. With fixtures, you'd just depend on the same fixture name.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Option B: Dataset Created in a Fixture / Test Body
|
|
24
|
+
|
|
25
|
+
**Pros:**
|
|
26
|
+
- **Standard pytest idiom.** A `@pytest.fixture` returning `AssayContext` is immediately understandable. No hidden `funcargs` injection.
|
|
27
|
+
- **Full access to other fixtures.** A fixture can depend on `tmp_path`, database connections, parametrize values, or any other fixture -- something a zero-arg generator cannot do.
|
|
28
|
+
- **Easier dataset sharing.** Multiple tests reuse the same fixture by name; no need to import and wire the same callable.
|
|
29
|
+
|
|
30
|
+
**Cons:**
|
|
31
|
+
- **Loses conditional loading.** The plugin currently skips generation when a baseline file already exists. Moving generation to a fixture means either (a) the fixture must replicate that logic, or (b) the plugin still needs a hook to intercept and replace the dataset -- defeating the purpose.
|
|
32
|
+
- **Splits the assay definition.** The marker would declare the evaluator, but the data source would live elsewhere (fixture or test body). You'd need to look in two places to understand a single assay.
|
|
33
|
+
- **Test body pollution.** If the dataset is built inside the test function, the test mixes setup concerns (data generation) with execution concerns (running the agent). Fixtures avoid this, but then you're back to the "two places" problem.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Option C: Hybrid -- Fixture-Based with Marker Override
|
|
38
|
+
|
|
39
|
+
The marker could accept an *optional* `generator`. If absent, the plugin looks for a `context` fixture (standard pytest resolution). If present, the marker generator takes priority.
|
|
40
|
+
|
|
41
|
+
**Pros:** Best of both worlds -- simple cases stay declarative, complex cases use fixtures.
|
|
42
|
+
**Cons:** Two code paths to maintain and document. Users must understand when each applies.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Conclusion
|
|
47
|
+
|
|
48
|
+
The current design is the right call for this project. The key reason: **the plugin needs control over *when* and *whether* to generate the dataset** (skip if baseline file exists, generate otherwise). That conditional logic is core to the assay lifecycle and belongs in the plugin, not in user-written fixtures. Forcing fixture authors to replicate that `if file.exists()` check would be error-prone and leak implementation details.
|
|
49
|
+
|
|
50
|
+
The zero-arg constraint on the generator is acceptable because dataset generation is inherently a pure function -- it defines *what to evaluate*, not *how to run* the test. If you ever need runtime-dependent datasets (e.g. parametrized topics), the cleaner extension would be to let the generator accept an optional config dict via the marker (`generator_kwargs={"n_cases": 5}`) rather than moving to fixtures.
|
|
51
|
+
|
|
52
|
+
The one thing worth improving: document that `context` is injected by the plugin, not a regular fixture. A brief note in the marker docstring or a type stub would reduce confusion.
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
## Plan Mode
|
|
2
|
+
|
|
3
|
+
- Make the plan extremely concise. Sacrifice grammar for the sake of being concise.
|
|
4
|
+
- At the end of each plan, give me a list of unresolved questions to answer, if any. Make the questions extremely concise.
|
|
5
|
+
- When in *plan mode*, you are writing the final plan as markdown file to `~/.claude/plans/` by default. When writing a plan to `~/.claude/plans/` you MUST always copy it to `./.claude/plans/` in the project root. For example, `~/.claude/plans/rosy-dancing-finch.md` should be copied to `./.claude/plans/rosy-dancing-finch.md`.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Testing with Pytest
|
|
2
|
+
|
|
3
|
+
## Testing Principles
|
|
4
|
+
|
|
5
|
+
- **Reuse, Don't Replicate**: Tests should reuse as much functional code as possible. Avoid reimplementing application logic within a test. Import and call the actual functions and classes you intend to test.
|
|
6
|
+
- **Mock Fundamental Processes**: When isolating code for a unit test, mock the most fundamental external interaction. For example, mock the `asyncio.create_subprocess_exec` call for a command-line tool, not a higher-level function that wraps it. This ensures you are testing your application's error handling and response parsing logic.
|
|
7
|
+
- **Cover All Failure Modes**: Every test suite should cover not just the "happy path" but also all conceivable failure modes. Use `pytest.raises` to verify that your code correctly handles non-zero return codes, missing commands (`FileNotFoundError`), network errors, and other exceptional conditions.
|
|
8
|
+
|
|
9
|
+
## Test Structure
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
@pytest.mark.asyncio # For async tests
|
|
13
|
+
async def test_feature() -> None:
|
|
14
|
+
"""Test description."""
|
|
15
|
+
# Arrange
|
|
16
|
+
...
|
|
17
|
+
|
|
18
|
+
# Act
|
|
19
|
+
result = await some_function()
|
|
20
|
+
|
|
21
|
+
# Assert
|
|
22
|
+
assert result is not None
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Running Tests
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# Run all tests
|
|
29
|
+
uv run pytest
|
|
30
|
+
|
|
31
|
+
# Run tests with verbose output
|
|
32
|
+
uv run pytest -v
|
|
33
|
+
|
|
34
|
+
# Run specific test file
|
|
35
|
+
uv run pytest tests/test_plugin.py
|
|
36
|
+
|
|
37
|
+
# Run specific test
|
|
38
|
+
uv run pytest tests/test_plugin_integration.py::test_integration_pairwiseevaluator -v
|
|
39
|
+
|
|
40
|
+
# Run tests in parallel
|
|
41
|
+
uv run pytest -n auto
|
|
42
|
+
|
|
43
|
+
# Run tests with coverage report
|
|
44
|
+
uv run pytest --cov=src/pytest_assay --cov-report=term-missing
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Markers
|
|
48
|
+
|
|
49
|
+
- `@pytest.mark.ollama` - Requires local Ollama (skipped in CI)
|
|
50
|
+
|
|
51
|
+
## VCR Cassettes
|
|
52
|
+
|
|
53
|
+
- Location: `tests/cassettes/`
|
|
54
|
+
- Record mode: `none` (playback only by default)
|
|
55
|
+
- Hostname normalization in `conftest.py` handles `host.docker.internal` → `localhost`
|
|
56
|
+
- Deterministic tests: set `temperature=0.0` in `MODEL_SETTINGS`
|
|
57
|
+
|
|
58
|
+
## Coverage Requirements
|
|
59
|
+
|
|
60
|
+
**After writing or modifying tests**, verify coverage targets are met:
|
|
61
|
+
|
|
62
|
+
1. **Read coverage targets** from `.codecov.yaml` to determine:
|
|
63
|
+
- `coverage.status.project.default.target` - minimum overall project coverage
|
|
64
|
+
- `coverage.status.project.default.threshold` - allowed coverage drop tolerance
|
|
65
|
+
- `coverage.status.patch.default.target` - minimum coverage for new/modified code
|
|
66
|
+
|
|
67
|
+
2. **Run coverage report**:
|
|
68
|
+
```bash
|
|
69
|
+
# Full project coverage
|
|
70
|
+
uv run pytest --cov=src/pytest_assay --cov-report=term-missing
|
|
71
|
+
|
|
72
|
+
# Specific module coverage
|
|
73
|
+
uv run pytest --cov=src/pytest_assay/MODULE_NAME --cov-report=term-missing tests/test_MODULE_NAME.py
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
3. **Verify targets are met**:
|
|
77
|
+
- New code must meet the **patch target** from `.codecov.yaml`
|
|
78
|
+
- Overall coverage must not drop below **project target minus threshold**
|
|
79
|
+
- Focus coverage on critical paths: error handling, edge cases, and main functionality
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# Python Coding Standards
|
|
2
|
+
|
|
3
|
+
## Python Version
|
|
4
|
+
|
|
5
|
+
- **Required:** Python 3.12 (no 3.13+ features)
|
|
6
|
+
- **Check:** `requires-python = ">=3.12,<3.13"` in pyproject.toml
|
|
7
|
+
- Avoid features introduced in Python 3.13
|
|
8
|
+
|
|
9
|
+
## Code Style & Formatting
|
|
10
|
+
|
|
11
|
+
### Ruff Configuration
|
|
12
|
+
Use Ruff for formatting and linting (configured in `pyproject.toml`):
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# Format code
|
|
16
|
+
uv run ruff format .
|
|
17
|
+
|
|
18
|
+
# Check and fix linting issues (ALWAYS run with --fix)
|
|
19
|
+
uv run ruff check --fix .
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Important:** Always use `--fix` to automatically resolve fixable issues. Run this after any code changes.
|
|
23
|
+
|
|
24
|
+
**Key rules enabled:**
|
|
25
|
+
- `E`, `F` - pycodestyle, pyflakes (essential errors)
|
|
26
|
+
- `I` - isort (import sorting)
|
|
27
|
+
- `UP` - pyupgrade (modern Python syntax)
|
|
28
|
+
- `ANN` - type annotations (required)
|
|
29
|
+
- `B` - bugbear (common bugs)
|
|
30
|
+
- `PL` - pylint rules
|
|
31
|
+
|
|
32
|
+
**Allowed exceptions:**
|
|
33
|
+
- `SIM108` - Allow if-else blocks instead of forcing ternary operators
|
|
34
|
+
- `PLR2004` - Allow magic values (constants without named variables)
|
|
35
|
+
- `PLR0915` - Allow long functions
|
|
36
|
+
- `PLR0912` - Allow many branches
|
|
37
|
+
- `PLR0913` - Allow many arguments
|
|
38
|
+
|
|
39
|
+
### Line Length
|
|
40
|
+
- Maximum: 150 characters (configured in ruff)
|
|
41
|
+
- Prefer shorter lines when reasonable
|
|
42
|
+
|
|
43
|
+
### Import Order
|
|
44
|
+
```python
|
|
45
|
+
# 1. Standard library
|
|
46
|
+
from collections.abc import Generator
|
|
47
|
+
import os
|
|
48
|
+
|
|
49
|
+
# 2. Third-party packages
|
|
50
|
+
import pytest
|
|
51
|
+
from pydantic_ai import Agent
|
|
52
|
+
|
|
53
|
+
# 3. Local imports
|
|
54
|
+
from pytest_assay.config import config
|
|
55
|
+
from pytest_assay.models import Readout
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Type Hints
|
|
59
|
+
|
|
60
|
+
### Required
|
|
61
|
+
Type hints are **required** for all functions (enforced by Pyright):
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
# Good
|
|
65
|
+
def search_web(query: str, max_results: int = 5) -> list[dict[str, str]]:
|
|
66
|
+
...
|
|
67
|
+
|
|
68
|
+
async def fetch_content(url: str) -> str:
|
|
69
|
+
...
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Pyright Configuration
|
|
73
|
+
- `typeCheckingMode: "basic"`
|
|
74
|
+
- Error-level rules:
|
|
75
|
+
- `reportOptionalMemberAccess`
|
|
76
|
+
- `reportOptionalSubscript`
|
|
77
|
+
- `reportOptionalCall`
|
|
78
|
+
- `reportGeneralTypeIssues`
|
|
79
|
+
- `reportReturnType`
|
|
80
|
+
|
|
81
|
+
### Type Checking
|
|
82
|
+
```bash
|
|
83
|
+
# ALWAYS run after code changes
|
|
84
|
+
uv run pyright .
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Important:** Always run pyright to catch type errors. This must pass before committing any Python code.
|
|
88
|
+
|
|
89
|
+
### Modern Syntax
|
|
90
|
+
Use Python 3.12+ type syntax:
|
|
91
|
+
```python
|
|
92
|
+
# Good (3.12+)
|
|
93
|
+
def process(items: list[str]) -> dict[str, int]:
|
|
94
|
+
...
|
|
95
|
+
|
|
96
|
+
# Avoid (old style)
|
|
97
|
+
from typing import List, Dict
|
|
98
|
+
def process(items: List[str]) -> Dict[str, int]:
|
|
99
|
+
...
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Async/Await Patterns
|
|
103
|
+
|
|
104
|
+
### When to Use Async
|
|
105
|
+
- Network I/O (web searches, API calls)
|
|
106
|
+
- File I/O with async libraries
|
|
107
|
+
- Concurrent operations
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
# Good - concurrent operations
|
|
111
|
+
async def fetch_multiple(urls: list[str]) -> list[str]:
|
|
112
|
+
async with httpx.AsyncClient() as client:
|
|
113
|
+
tasks = [client.get(url) for url in urls]
|
|
114
|
+
responses = await asyncio.gather(*tasks)
|
|
115
|
+
return [r.text for r in responses]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Pydantic AI Agents
|
|
119
|
+
All agents are async:
|
|
120
|
+
```python
|
|
121
|
+
from pydantic_ai import Agent
|
|
122
|
+
|
|
123
|
+
agent = Agent(
|
|
124
|
+
model=model,
|
|
125
|
+
output_type=WebSearchQuery,
|
|
126
|
+
system_prompt=QUERY_INSTRUCTIONS,
|
|
127
|
+
retries=5,
|
|
128
|
+
instrument=True,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Usage
|
|
132
|
+
async with agent:
|
|
133
|
+
result = await agent.run(user_prompt="Generate query")
|
|
134
|
+
print(result.output)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Dependencies
|
|
138
|
+
|
|
139
|
+
### Management
|
|
140
|
+
Use `uv` (not pip or poetry):
|
|
141
|
+
```bash
|
|
142
|
+
# Install dependencies
|
|
143
|
+
uv sync
|
|
144
|
+
|
|
145
|
+
# Add new dependency
|
|
146
|
+
# Edit pyproject.toml manually, then:
|
|
147
|
+
uv sync
|
|
148
|
+
|
|
149
|
+
# Run command in venv
|
|
150
|
+
uv run pytest
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Adding Dependencies
|
|
154
|
+
1. Add to `pyproject.toml` under the `dependencies` list in the `[project]` table
|
|
155
|
+
2. Specify minimum version: `"package>=1.2.3"`
|
|
156
|
+
3. Run `uv sync` to update lock file
|
|
157
|
+
4. Test that it works
|
|
158
|
+
|
|
159
|
+
## Error Handling & Logging
|
|
160
|
+
|
|
161
|
+
### Logging
|
|
162
|
+
Use `loguru` for structured logging:
|
|
163
|
+
```python
|
|
164
|
+
from pytest_assay.logger import logger
|
|
165
|
+
|
|
166
|
+
# Info
|
|
167
|
+
logger.info("Starting web search for topic: {}", topic)
|
|
168
|
+
|
|
169
|
+
# Debug (verbose)
|
|
170
|
+
logger.debug("Received {} results", len(results))
|
|
171
|
+
|
|
172
|
+
# Error with context
|
|
173
|
+
try:
|
|
174
|
+
result = await fetch_content(url)
|
|
175
|
+
except Exception as e:
|
|
176
|
+
logger.error("Failed to fetch {}: {}", url, e)
|
|
177
|
+
raise
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Error Handling
|
|
181
|
+
```python
|
|
182
|
+
# Good - explicit handling
|
|
183
|
+
try:
|
|
184
|
+
result = await risky_operation()
|
|
185
|
+
except ValueError as e:
|
|
186
|
+
logger.error("Invalid input: {}", e)
|
|
187
|
+
return default_value
|
|
188
|
+
except httpx.HTTPError as e:
|
|
189
|
+
logger.error("Network error: {}", e)
|
|
190
|
+
raise
|
|
191
|
+
|
|
192
|
+
# Avoid silent failures
|
|
193
|
+
try:
|
|
194
|
+
...
|
|
195
|
+
except Exception:
|
|
196
|
+
pass # Bad - hides errors
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Common Patterns
|
|
200
|
+
|
|
201
|
+
### Pydantic Models
|
|
202
|
+
Use for structured data:
|
|
203
|
+
```python
|
|
204
|
+
from pydantic import BaseModel, Field
|
|
205
|
+
|
|
206
|
+
class WebSearchQuery(BaseModel):
|
|
207
|
+
query: str = Field(max_length=100)
|
|
208
|
+
aspect: str
|
|
209
|
+
rationale: str
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Configuration
|
|
213
|
+
Access via centralized config:
|
|
214
|
+
```python
|
|
215
|
+
from pytest_assay.config import config
|
|
216
|
+
|
|
217
|
+
# Good
|
|
218
|
+
max_loops = config.max_research_loops
|
|
219
|
+
model_name = config.model.value
|
|
220
|
+
|
|
221
|
+
# Avoid hardcoded values
|
|
222
|
+
max_loops = 5 # Bad
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Agent Definitions
|
|
226
|
+
Pattern for module-level agent definitions:
|
|
227
|
+
```python
|
|
228
|
+
AGENT_NAME = Agent(
|
|
229
|
+
model=model,
|
|
230
|
+
output_type=OutputType,
|
|
231
|
+
system_prompt=PROMPT_CONSTANT,
|
|
232
|
+
retries=5,
|
|
233
|
+
instrument=True, # For logfire tracking
|
|
234
|
+
)
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Module and Package Structure
|
|
238
|
+
|
|
239
|
+
### `__init__.py` Files
|
|
240
|
+
`__init__.py` files should be kept minimal. Their primary purpose is to define a package and expose its public API.
|
|
241
|
+
|
|
242
|
+
- **Good**: Use for imports, `__all__`, and package-level docstrings.
|
|
243
|
+
```python
|
|
244
|
+
# src/pytest_assay/mcp/__init__.py
|
|
245
|
+
"""MCP server implementations for pytest-assay."""
|
|
246
|
+
from .server import date_server
|
|
247
|
+
|
|
248
|
+
__all__ = ["date_server"]
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
- **Bad**: Avoid defining functions, classes, or complex logic directly in `__init__.py`. This can lead to import side effects and makes the code harder to navigate. Move such code into separate modules (e.g., `server.py`) and import them.
|
|
252
|
+
|
|
253
|
+
## Docstrings
|
|
254
|
+
|
|
255
|
+
Use Google-style docstrings (not Sphinx style):
|
|
256
|
+
|
|
257
|
+
### Formatting Rules
|
|
258
|
+
- **No backticks**: Do not use backticks in docstrings. Write `--assay-mode` as --assay-mode, `None` as None, `True` as True, etc.
|
|
259
|
+
- Google style uses plain text for parameter names, options, and values
|
|
260
|
+
- For single-line docstrings, place initial and final triple quotes on the same line:
|
|
261
|
+
```python
|
|
262
|
+
def add(a: int, b: int) -> int:
|
|
263
|
+
"""Add two integers."""
|
|
264
|
+
return a + b
|
|
265
|
+
```
|
|
266
|
+
- For multi-line docstrings, place initial and final triple quotes on their own lines:
|
|
267
|
+
```python
|
|
268
|
+
def complex_function(param1: str, param2: int = 5) -> dict[str, Any]:
|
|
269
|
+
"""
|
|
270
|
+
Short one-line description.
|
|
271
|
+
|
|
272
|
+
Longer description if needed, explaining the function's purpose,
|
|
273
|
+
behavior, and any important details.
|
|
274
|
+
|
|
275
|
+
Args:
|
|
276
|
+
param1: Description of param1.
|
|
277
|
+
param2: Description of param2. Defaults to 5.
|
|
278
|
+
|
|
279
|
+
Returns:
|
|
280
|
+
Description of return value.
|
|
281
|
+
|
|
282
|
+
Raises:
|
|
283
|
+
ValueError: When param2 is negative.
|
|
284
|
+
"""
|
|
285
|
+
...
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Before Committing
|
|
289
|
+
|
|
290
|
+
Run these checks:
|
|
291
|
+
```bash
|
|
292
|
+
# 1. Format code
|
|
293
|
+
uv run ruff format .
|
|
294
|
+
|
|
295
|
+
# 2. Check and fix linting issues
|
|
296
|
+
uv run ruff check --fix .
|
|
297
|
+
|
|
298
|
+
# 3. Type checking
|
|
299
|
+
uv run pyright .
|
|
300
|
+
|
|
301
|
+
# 4. Run tests
|
|
302
|
+
uv run pytest -n auto
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
**Note:** The `--fix` flag automatically resolves fixable linting issues. Always verify the changes it makes are appropriate.
|