pytest-assay 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. pytest_assay-0.1.0/.claude/CLAUDE.md +120 -0
  2. pytest_assay-0.1.0/.claude/README.md +4 -0
  3. pytest_assay-0.1.0/.claude/plans/.gitkeep +0 -0
  4. pytest_assay-0.1.0/.claude/plans/custom_evaluators.plan.md +79 -0
  5. pytest_assay-0.1.0/.claude/plans/remove_generator.plan.md +52 -0
  6. pytest_assay-0.1.0/.claude/rules/general.md +5 -0
  7. pytest_assay-0.1.0/.claude/rules/python-testing.md +79 -0
  8. pytest_assay-0.1.0/.claude/rules/python.md +305 -0
  9. pytest_assay-0.1.0/.claude/settings.json +71 -0
  10. pytest_assay-0.1.0/.codecov.yaml +9 -0
  11. pytest_assay-0.1.0/.coderabbit.yaml +225 -0
  12. pytest_assay-0.1.0/.cursor/README.md +21 -0
  13. pytest_assay-0.1.0/.cursor/mcp.json.example +18 -0
  14. pytest_assay-0.1.0/.cursor/plans/.gitkeep +0 -0
  15. pytest_assay-0.1.0/.cursor/rules/cursor-rules.mdc +67 -0
  16. pytest_assay-0.1.0/.cursor/rules/general.mdc +43 -0
  17. pytest_assay-0.1.0/.cursor/rules/python-testing.mdc +69 -0
  18. pytest_assay-0.1.0/.cursor/rules/python.mdc +309 -0
  19. pytest_assay-0.1.0/.cursor/settings.json +11 -0
  20. pytest_assay-0.1.0/.cursorignore +13 -0
  21. pytest_assay-0.1.0/.devcontainer/Dockerfile +46 -0
  22. pytest_assay-0.1.0/.devcontainer/README.md +37 -0
  23. pytest_assay-0.1.0/.devcontainer/devcontainer.json +141 -0
  24. pytest_assay-0.1.0/.env.example +4 -0
  25. pytest_assay-0.1.0/.github/copilot-instructions.md +158 -0
  26. pytest_assay-0.1.0/.github/dependabot.yml +11 -0
  27. pytest_assay-0.1.0/.github/images/banner_pytest-assay.png +0 -0
  28. pytest_assay-0.1.0/.github/workflows/build.yaml +128 -0
  29. pytest_assay-0.1.0/.gitignore +21 -0
  30. pytest_assay-0.1.0/.mcp.json.example +20 -0
  31. pytest_assay-0.1.0/.vscode/README.md +14 -0
  32. pytest_assay-0.1.0/.vscode/extensions.json +40 -0
  33. pytest_assay-0.1.0/.vscode/mcp.json +16 -0
  34. pytest_assay-0.1.0/.vscode/settings.json +111 -0
  35. pytest_assay-0.1.0/LICENSE +21 -0
  36. pytest_assay-0.1.0/PKG-INFO +95 -0
  37. pytest_assay-0.1.0/README.md +58 -0
  38. pytest_assay-0.1.0/pyproject.toml +152 -0
  39. pytest_assay-0.1.0/src/pytest_assay/__init__.py +5 -0
  40. pytest_assay-0.1.0/src/pytest_assay/config.py +25 -0
  41. pytest_assay-0.1.0/src/pytest_assay/evaluators/README.md +5 -0
  42. pytest_assay-0.1.0/src/pytest_assay/evaluators/__init__.py +6 -0
  43. pytest_assay-0.1.0/src/pytest_assay/evaluators/bradleyterry.py +551 -0
  44. pytest_assay-0.1.0/src/pytest_assay/evaluators/pairwise.py +191 -0
  45. pytest_assay-0.1.0/src/pytest_assay/logger.py +23 -0
  46. pytest_assay-0.1.0/src/pytest_assay/models.py +52 -0
  47. pytest_assay-0.1.0/src/pytest_assay/plugin.py +312 -0
  48. pytest_assay-0.1.0/src/pytest_assay/py.typed +0 -0
  49. pytest_assay-0.1.0/tests/README.md +5 -0
  50. pytest_assay-0.1.0/tests/__init__.py +0 -0
  51. pytest_assay-0.1.0/tests/_ollama.py +5 -0
  52. pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_bradleyterryevaluator.json +96 -0
  53. pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_bradleyterryevaluator.readout.json +30 -0
  54. pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_pairwiseevaluator.json +96 -0
  55. pytest_assay-0.1.0/tests/assays/test_plugin_integration/test_integration_pairwiseevaluator.readout.json +30 -0
  56. pytest_assay-0.1.0/tests/conftest.py +62 -0
  57. pytest_assay-0.1.0/tests/evaluators/__init__.py +0 -0
  58. pytest_assay-0.1.0/tests/evaluators/conftest.py +87 -0
  59. pytest_assay-0.1.0/tests/evaluators/test_bradleyterry.py +531 -0
  60. pytest_assay-0.1.0/tests/evaluators/test_pairwise.py +309 -0
  61. pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_bradleyterryevaluator.json +97 -0
  62. pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_bradleyterryevaluator.readout.json +30 -0
  63. pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_pairwiseevaluator.json +97 -0
  64. pytest_assay-0.1.0/tests/pytest_assay/test_plugin_integration/test_integration_pairwiseevaluator.readout.json +30 -0
  65. pytest_assay-0.1.0/tests/test_config.py +72 -0
  66. pytest_assay-0.1.0/tests/test_logger.py +33 -0
  67. pytest_assay-0.1.0/tests/test_models.py +191 -0
  68. pytest_assay-0.1.0/tests/test_plugin.py +947 -0
  69. pytest_assay-0.1.0/tests/test_plugin_integration.py +155 -0
  70. pytest_assay-0.1.0/uv.lock +3290 -0
@@ -0,0 +1,120 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ Pytest-assay is a framework for the evaluation of Pydantic AI agents. By adding the `@pytest.mark.assay` decorator to a test, you can run an assay resulting in a readout report. The assay compares the current agent responses against previously recorded baseline responses, e.g. from the main branch. The implementation is based on pytest hooks which capture `Agent.run()` responses.
8
+
9
+ ## Development Commands
10
+
11
+ ### Environment Setup
12
+
13
+ ```bash
14
+ # Install Ollama (required for local models)
15
+ # See https://ollama.com for installation instructions
16
+
17
+ # Pull the default model
18
+ ollama pull qwen3:8b
19
+
20
+ # Install dependencies
21
+ uv sync
22
+ ```
23
+
24
+ ### Testing
25
+
26
+ ```bash
27
+ # Run all tests
28
+ uv run pytest
29
+
30
+ # Run tests with verbose output
31
+ uv run pytest -v
32
+
33
+ # Run specific test file
34
+ uv run pytest tests/test_plugin.py
35
+
36
+ # Run specific test
37
+ uv run pytest tests/test_plugin.py::TEST_NAME
38
+
39
+ # Run tests in parallel
40
+ uv run pytest -n auto
41
+
42
+ # Run tests with coverage report
43
+ uv run pytest --cov=src/pytest_assay --cov-report=term-missing
44
+ ```
45
+
46
+ ### Code Quality
47
+
48
+ ```bash
49
+ # Format code
50
+ uv run ruff format .
51
+
52
+ # Check and fix linting issues (ALWAYS run with --fix)
53
+ uv run ruff check --fix .
54
+
55
+ # Type checking (ALWAYS run after code changes)
56
+ uv run pyright .
57
+ ```
58
+
59
+ ### Before Committing
60
+
61
+ Run these checks:
62
+
63
+ ```bash
64
+ # 1. Format code
65
+ uv run ruff format .
66
+
67
+ # 2. Check and fix linting issues
68
+ uv run ruff check --fix .
69
+
70
+ # 3. Type checking
71
+ uv run pyright .
72
+
73
+ # 4. Run tests
74
+ uv run pytest -n auto
75
+ ```
76
+
77
+ ## MCP Servers
78
+
79
+ This project uses Model Context Protocol (MCP) servers to extend AI capabilities. These are automatically invoked when relevant.
80
+
81
+ ### Context7 Documentation Server
82
+
83
+ **When to use:**
84
+ - Looking up library documentation (e.g., "How do I use pydantic-ai streaming?")
85
+ - Checking API references for dependencies
86
+ - Finding code examples from official docs
87
+ - Verifying correct usage of third-party packages
88
+
89
+ **Examples:**
90
+ - "What's the latest pydantic-ai agent syntax?"
91
+ - "Show me httpx async client examples"
92
+ - "How do I configure pytest-asyncio?"
93
+
94
+ ### GitHub Repository Server
95
+
96
+ **When to use:**
97
+ - Checking open/closed issues in this repository
98
+ - Reviewing pull requests and their status
99
+ - Reading issue comments and discussions
100
+ - Finding related issues or PRs
101
+ - Understanding project history and decisions
102
+
103
+ **Examples:**
104
+ - "What are the open issues about curiosity?"
105
+ - "Show me recent PRs related to PDF support"
106
+ - "Are there any issues about MLX integration?"
107
+ - "What's the status of issue #13?"
108
+
109
+ ### Best Practices
110
+
111
+ - **Be specific:** "Check issue #15" is better than "check issues"
112
+ - **Context first:** Read codebase before checking issues
113
+ - **Combine sources:** Use Context7 for "how to use X" and GitHub for "what's our approach to X"
114
+
115
+ ## Coding Standards
116
+
117
+ See the rules files for detailed coding standards:
118
+
119
+ - `.claude/rules/python.md` - Python coding standards, type hints, async patterns
120
+ - `.claude/rules/python-testing.md` - Testing conventions, markers, coverage requirements
@@ -0,0 +1,4 @@
1
+ If read-write file system access is configured for Claude Code via `/allowed-tools`, it is advised to run [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) in a dev container. For more details, refer to the [Dev Container](../.devcontainer/README.md) documentation.
2
+
3
+ ## MCP servers
4
+ MCP servers are configured in the `./.mcp.json` file. The file cannot be moved to `.claude/mcp.json`.
File without changes
@@ -0,0 +1,79 @@
1
+ # Custom Evaluators Design Plan
2
+
3
+ ## Problem
4
+
5
+ A user writing `MyCustomEvaluator` would need to do:
6
+
7
+ ```python
8
+ from pytest_assay.plugin import AGENT_RESPONSES_KEY, BASELINE_DATASET_KEY
9
+ ```
10
+
11
+ These stash keys are **internal implementation details** -- not exported in `__all__`, and they couple any custom evaluator directly to the plugin's storage mechanism. The `Evaluator` protocol receiving a raw `pytest.Item` is the root cause: it forces evaluators to know *how* to extract data rather than just *receiving* it.
12
+
13
+ ## Design: Pass an `EvaluatorInput` Instead of `Item`
14
+
15
+ Introduce a data class that the plugin constructs and passes to evaluators, replacing the raw `Item`:
16
+
17
+ ```python
18
+ # models.py
19
+ class EvaluatorInput(BaseModel):
20
+ baseline_dataset: Dataset | None
21
+ agent_responses: list[AgentRunResult[Any]]
22
+
23
+ class Evaluator(Protocol):
24
+ def __call__(self, input: EvaluatorInput) -> Coroutine[Any, Any, Readout]: ...
25
+ ```
26
+
27
+ The plugin's `_run_evaluation` would change from:
28
+
29
+ ```python
30
+ readout = asyncio.run(evaluator(item))
31
+ ```
32
+
33
+ to:
34
+
35
+ ```python
36
+ eval_input = EvaluatorInput(
37
+ baseline_dataset=item.stash.get(BASELINE_DATASET_KEY, None),
38
+ agent_responses=item.stash.get(AGENT_RESPONSES_KEY, []),
39
+ )
40
+ readout = asyncio.run(evaluator(eval_input))
41
+ ```
42
+
43
+ The stash keys stay private. Evaluators receive exactly the data they need.
44
+
45
+ ## What Changes
46
+
47
+ | Component | Change |
48
+ |---|---|
49
+ | `Evaluator` protocol | `__call__(self, input: EvaluatorInput)` instead of `__call__(self, item: Item)` |
50
+ | `_run_evaluation` in plugin.py | Build `EvaluatorInput` from stash, pass it |
51
+ | `PairwiseEvaluator.__call__` | Accept `EvaluatorInput`, drop the stash key import |
52
+ | `BradleyTerryEvaluator.__call__` | Accept `EvaluatorInput`, drop the stash key import |
53
+ | Public exports | Add `EvaluatorInput`, `Evaluator`, `Readout` to `__all__` |
54
+
55
+ ## What a Custom Evaluator Looks Like
56
+
57
+ ```python
58
+ from pytest_assay.models import Evaluator, EvaluatorInput, Readout
59
+
60
+ class MyCustomEvaluator:
61
+ async def __call__(self, input: EvaluatorInput) -> Readout:
62
+ # input.baseline_dataset has the baseline cases
63
+ # input.agent_responses has the captured AgentRunResult objects
64
+ score = my_scoring_logic(input.baseline_dataset, input.agent_responses)
65
+ return Readout(passed=score > 0.5, details={"score": score})
66
+ ```
67
+
68
+ No internal imports needed. Fully decoupled from pytest stash mechanics.
69
+
70
+ ## Why Not Alternatives
71
+
72
+ - **Export the stash keys as public API**: Leaks an implementation detail. If you ever change how data flows (e.g. stop using stash), every custom evaluator breaks.
73
+ - **Accessor helper functions** (`get_baseline(item)`, `get_responses(item)`): Better than raw keys, but still couples evaluators to `pytest.Item`. Harder to unit-test evaluators in isolation.
74
+ - **The `EvaluatorInput` approach**: Evaluators become plain async functions on data. Easy to test, no pytest dependency, no internal imports. The plugin owns the extraction logic.
75
+
76
+ ## Open Questions
77
+
78
+ 1. Should `EvaluatorInput` also carry metadata (test name, assay path, mode) so evaluators can customize behavior per-test?
79
+ 2. `AgentRunResult` comes from `pydantic-ai` -- is that an acceptable public API surface, or should responses be further abstracted (e.g. to plain strings)?
@@ -0,0 +1,52 @@
1
+ # Design Analysis: `generator` in `@pytest.mark.assay` vs. External Dataset Creation
2
+
3
+ ## Current Design
4
+
5
+ The `generator` callable is passed as a marker kwarg. The plugin calls it inside `pytest_runtest_setup` only if no serialized dataset file exists yet. The resulting `Dataset` is wrapped into an `AssayContext` and injected into the test via `item.funcargs["context"]`.
6
+
7
+ ---
8
+
9
+ ## Option A: Keep `generator` in the Marker (current)
10
+
11
+ **Pros:**
12
+ - **Declarative, self-contained test definition.** The marker is a single source of truth: evaluator *and* data source live together. You can read the decorator and understand the full assay without chasing fixtures.
13
+ - **Lazy, conditional generation.** The plugin only calls the generator when no baseline file exists. This logic is invisible to the test author -- they don't need `if not path.exists()` boilerplate.
14
+ - **Symmetry with `evaluator`.** Both the data source and the evaluation strategy are declared in the same place. Moving one out breaks that symmetry.
15
+
16
+ **Cons:**
17
+ - **Generator signature is constrained.** It must be a zero-argument callable returning `Dataset`. If a generator needs runtime information (e.g. fixtures, config, parametrize values), there's no clean way to pass it.
18
+ - **Magic injection of `context`.** The test receives `context: AssayContext` as a parameter, but it's not a real pytest fixture -- it's stuffed into `funcargs` by a hook. This is surprising to pytest users who expect fixtures to be defined via `@pytest.fixture` or `conftest.py`.
19
+ - **Harder to share datasets.** Two tests that share the same dataset must both reference the same generator function. With fixtures, you'd just depend on the same fixture name.
20
+
21
+ ---
22
+
23
+ ## Option B: Dataset Created in a Fixture / Test Body
24
+
25
+ **Pros:**
26
+ - **Standard pytest idiom.** A `@pytest.fixture` returning `AssayContext` is immediately understandable. No hidden `funcargs` injection.
27
+ - **Full access to other fixtures.** A fixture can depend on `tmp_path`, database connections, parametrize values, or any other fixture -- something a zero-arg generator cannot do.
28
+ - **Easier dataset sharing.** Multiple tests reuse the same fixture by name; no need to import and wire the same callable.
29
+
30
+ **Cons:**
31
+ - **Loses conditional loading.** The plugin currently skips generation when a baseline file already exists. Moving generation to a fixture means either (a) the fixture must replicate that logic, or (b) the plugin still needs a hook to intercept and replace the dataset -- defeating the purpose.
32
+ - **Splits the assay definition.** The marker would declare the evaluator, but the data source would live elsewhere (fixture or test body). You'd need to look in two places to understand a single assay.
33
+ - **Test body pollution.** If the dataset is built inside the test function, the test mixes setup concerns (data generation) with execution concerns (running the agent). Fixtures avoid this, but then you're back to the "two places" problem.
34
+
35
+ ---
36
+
37
+ ## Option C: Hybrid -- Fixture-Based with Marker Override
38
+
39
+ The marker could accept an *optional* `generator`. If absent, the plugin looks for a `context` fixture (standard pytest resolution). If present, the marker generator takes priority.
40
+
41
+ **Pros:** Best of both worlds -- simple cases stay declarative, complex cases use fixtures.
42
+ **Cons:** Two code paths to maintain and document. Users must understand when each applies.
43
+
44
+ ---
45
+
46
+ ## Conclusion
47
+
48
+ The current design is the right call for this project. The key reason: **the plugin needs control over *when* and *whether* to generate the dataset** (skip if baseline file exists, generate otherwise). That conditional logic is core to the assay lifecycle and belongs in the plugin, not in user-written fixtures. Forcing fixture authors to replicate that `if file.exists()` check would be error-prone and leak implementation details.
49
+
50
+ The zero-arg constraint on the generator is acceptable because dataset generation is inherently a pure function -- it defines *what to evaluate*, not *how to run* the test. If you ever need runtime-dependent datasets (e.g. parametrized topics), the cleaner extension would be to let the generator accept an optional config dict via the marker (`generator_kwargs={"n_cases": 5}`) rather than moving to fixtures.
51
+
52
+ The one thing worth improving: document that `context` is injected by the plugin, not a regular fixture. A brief note in the marker docstring or a type stub would reduce confusion.
@@ -0,0 +1,5 @@
1
+ ## Plan Mode
2
+
3
+ - Make the plan extremely concise. Sacrifice grammar for the sake of being concise.
4
+ - At the end of each plan, give me a list of unresolved questions to answer, if any. Make the questions extremely concise.
5
+ - When in *plan mode*, you are writing the final plan as markdown file to `~/.claude/plans/` by default. When writing a plan to `~/.claude/plans/` you MUST always copy it to `./.claude/plans/` in the project root. For example, `~/.claude/plans/rosy-dancing-finch.md` should be copied to `./.claude/plans/rosy-dancing-finch.md`.
@@ -0,0 +1,79 @@
1
+ # Testing with Pytest
2
+
3
+ ## Testing Principles
4
+
5
+ - **Reuse, Don't Replicate**: Tests should reuse as much functional code as possible. Avoid reimplementing application logic within a test. Import and call the actual functions and classes you intend to test.
6
+ - **Mock Fundamental Processes**: When isolating code for a unit test, mock the most fundamental external interaction. For example, mock the `asyncio.create_subprocess_exec` call for a command-line tool, not a higher-level function that wraps it. This ensures you are testing your application's error handling and response parsing logic.
7
+ - **Cover All Failure Modes**: Every test suite should cover not just the "happy path" but also all conceivable failure modes. Use `pytest.raises` to verify that your code correctly handles non-zero return codes, missing commands (`FileNotFoundError`), network errors, and other exceptional conditions.
8
+
9
+ ## Test Structure
10
+
11
+ ```python
12
+ @pytest.mark.asyncio # For async tests
13
+ async def test_feature() -> None:
14
+ """Test description."""
15
+ # Arrange
16
+ ...
17
+
18
+ # Act
19
+ result = await some_function()
20
+
21
+ # Assert
22
+ assert result is not None
23
+ ```
24
+
25
+ ## Running Tests
26
+
27
+ ```bash
28
+ # Run all tests
29
+ uv run pytest
30
+
31
+ # Run tests with verbose output
32
+ uv run pytest -v
33
+
34
+ # Run specific test file
35
+ uv run pytest tests/test_plugin.py
36
+
37
+ # Run specific test
38
+ uv run pytest tests/test_plugin.py::TEST_NAME -v
39
+
40
+ # Run tests in parallel
41
+ uv run pytest -n auto
42
+
43
+ # Run tests with coverage report
44
+ uv run pytest --cov=src/pytest_assay --cov-report=term-missing
45
+ ```
46
+
47
+ ## Markers
48
+
49
+ - `@pytest.mark.ollama` - Requires local Ollama (skipped in CI)
50
+
51
+ ## VCR Cassettes
52
+
53
+ - Location: `tests/cassettes/`
54
+ - Record mode: `none` (playback only by default)
55
+ - Hostname normalization in `conftest.py` handles `host.docker.internal` → `localhost`
56
+ - Deterministic tests: set `temperature=0.0` in `MODEL_SETTINGS`
57
+
58
+ ## Coverage Requirements
59
+
60
+ **After writing or modifying tests**, verify coverage targets are met:
61
+
62
+ 1. **Read coverage targets** from `.codecov.yaml` to determine:
63
+ - `coverage.status.project.default.target` - minimum overall project coverage
64
+ - `coverage.status.project.default.threshold` - allowed coverage drop tolerance
65
+ - `coverage.status.patch.default.target` - minimum coverage for new/modified code
66
+
67
+ 2. **Run coverage report**:
68
+ ```bash
69
+ # Full project coverage
70
+ uv run pytest --cov=src/pytest_assay --cov-report=term-missing
71
+
72
+ # Specific module coverage
73
+ uv run pytest --cov=src/pytest_assay/MODULE_NAME --cov-report=term-missing tests/test_MODULE_NAME.py
74
+ ```
75
+
76
+ 3. **Verify targets are met**:
77
+ - New code must meet the **patch target** from `.codecov.yaml`
78
+ - Overall coverage must not drop below **project target minus threshold**
79
+ - Focus coverage on critical paths: error handling, edge cases, and main functionality
@@ -0,0 +1,305 @@
1
+ # Python Coding Standards
2
+
3
+ ## Python Version
4
+
5
+ - **Required:** Python 3.12 (no 3.13+ features)
6
+ - **Check:** `requires-python = ">=3.12,<3.13"` in pyproject.toml
7
+ - Avoid features introduced in Python 3.13
8
+
9
+ ## Code Style & Formatting
10
+
11
+ ### Ruff Configuration
12
+ Use Ruff for formatting and linting (configured in `pyproject.toml`):
13
+
14
+ ```bash
15
+ # Format code
16
+ uv run ruff format .
17
+
18
+ # Check and fix linting issues (ALWAYS run with --fix)
19
+ uv run ruff check --fix .
20
+ ```
21
+
22
+ **Important:** Always use `--fix` to automatically resolve fixable issues. Run this after any code changes.
23
+
24
+ **Key rules enabled:**
25
+ - `E`, `F` - pycodestyle, pyflakes (essential errors)
26
+ - `I` - isort (import sorting)
27
+ - `UP` - pyupgrade (modern Python syntax)
28
+ - `ANN` - type annotations (required)
29
+ - `B` - bugbear (common bugs)
30
+ - `PL` - pylint rules
31
+
32
+ **Allowed exceptions:**
33
+ - `SIM108` - Allow if-else blocks instead of forcing ternary operators
34
+ - `PLR2004` - Allow magic values (constants without named variables)
35
+ - `PLR0915` - Allow long functions
36
+ - `PLR0912` - Allow many branches
37
+ - `PLR0913` - Allow many arguments
38
+
39
+ ### Line Length
40
+ - Maximum: 150 characters (configured in ruff)
41
+ - Prefer shorter lines when reasonable
42
+
43
+ ### Import Order
44
+ ```python
45
+ # 1. Standard library
46
+ from collections.abc import Generator
47
+ import os
48
+
49
+ # 2. Third-party packages
50
+ import pytest
51
+ from pydantic_ai import Agent
52
+
53
+ # 3. Local imports
54
+ from pytest_assay.config import config
55
+ from pytest_assay.models import WebSearchQuery
56
+ ```
57
+
58
+ ## Type Hints
59
+
60
+ ### Required
61
+ Type hints are **required** for all functions (enforced by Pyright):
62
+
63
+ ```python
64
+ # Good
65
+ def search_web(query: str, max_results: int = 5) -> list[dict[str, str]]:
66
+ ...
67
+
68
+ async def fetch_content(url: str) -> str:
69
+ ...
70
+ ```
71
+
72
+ ### Pyright Configuration
73
+ - `typeCheckingMode: "basic"`
74
+ - Error-level rules:
75
+ - `reportOptionalMemberAccess`
76
+ - `reportOptionalSubscript`
77
+ - `reportOptionalCall`
78
+ - `reportGeneralTypeIssues`
79
+ - `reportReturnType`
80
+
81
+ ### Type Checking
82
+ ```bash
83
+ # ALWAYS run after code changes
84
+ uv run pyright .
85
+ ```
86
+
87
+ **Important:** Always run pyright to catch type errors. This must pass before committing any Python code.
88
+
89
+ ### Modern Syntax
90
+ Use Python 3.12+ type syntax:
91
+ ```python
92
+ # Good (3.12+)
93
+ def process(items: list[str]) -> dict[str, int]:
94
+ ...
95
+
96
+ # Avoid (old style)
97
+ from typing import List, Dict
98
+ def process(items: List[str]) -> Dict[str, int]:
99
+ ...
100
+ ```
101
+
102
+ ## Async/Await Patterns
103
+
104
+ ### When to Use Async
105
+ - Network I/O (web searches, API calls)
106
+ - File I/O with async libraries
107
+ - Concurrent operations
108
+
109
+ ```python
110
+ # Good - concurrent operations
111
+ async def fetch_multiple(urls: list[str]) -> list[str]:
112
+ async with httpx.AsyncClient() as client:
113
+ tasks = [client.get(url) for url in urls]
114
+ responses = await asyncio.gather(*tasks)
115
+ return [r.text for r in responses]
116
+ ```
117
+
118
+ ### Pydantic AI Agents
119
+ All agents are async:
120
+ ```python
121
+ from pydantic_ai import Agent
122
+
123
+ agent = Agent(
124
+ model=model,
125
+ output_type=WebSearchQuery,
126
+ system_prompt=QUERY_INSTRUCTIONS,
127
+ retries=5,
128
+ instrument=True,
129
+ )
130
+
131
+ # Usage
132
+ async with agent:
133
+ result = await agent.run(user_prompt="Generate query")
134
+ print(result.output)
135
+ ```
136
+
137
+ ## Dependencies
138
+
139
+ ### Management
140
+ Use `uv` (not pip or poetry):
141
+ ```bash
142
+ # Install dependencies
143
+ uv sync
144
+
145
+ # Add new dependency
146
+ # Edit pyproject.toml manually, then:
147
+ uv sync
148
+
149
+ # Run command in venv
150
+ uv run pytest
151
+ ```
152
+
153
+ ### Adding Dependencies
154
+ 1. Add to `pyproject.toml` under `[project.dependencies]`
155
+ 2. Specify minimum version: `"package>=1.2.3"`
156
+ 3. Run `uv sync` to update lock file
157
+ 4. Test that it works
158
+
159
+ ## Error Handling & Logging
160
+
161
+ ### Logging
162
+ Use `loguru` for structured logging:
163
+ ```python
164
+ from pytest_assay.logger import logger
165
+
166
+ # Info
167
+ logger.info("Starting web search for topic: {}", topic)
168
+
169
+ # Debug (verbose)
170
+ logger.debug("Received {} results", len(results))
171
+
172
+ # Error with context
173
+ try:
174
+ result = await fetch_content(url)
175
+ except Exception as e:
176
+ logger.error("Failed to fetch {}: {}", url, e)
177
+ raise
178
+ ```
179
+
180
+ ### Error Handling
181
+ ```python
182
+ # Good - explicit handling
183
+ try:
184
+ result = await risky_operation()
185
+ except ValueError as e:
186
+ logger.error("Invalid input: {}", e)
187
+ return default_value
188
+ except httpx.HTTPError as e:
189
+ logger.error("Network error: {}", e)
190
+ raise
191
+
192
+ # Avoid silent failures
193
+ try:
194
+ ...
195
+ except Exception:
196
+ pass # Bad - hides errors
197
+ ```
198
+
199
+ ## Common Patterns
200
+
201
+ ### Pydantic Models
202
+ Use for structured data:
203
+ ```python
204
+ from pydantic import BaseModel, Field
205
+
206
+ class WebSearchQuery(BaseModel):
207
+ query: str = Field(max_length=100)
208
+ aspect: str
209
+ rationale: str
210
+ ```
211
+
212
+ ### Configuration
213
+ Access via centralized config:
214
+ ```python
215
+ from pytest_assay.config import config
216
+
217
+ # Good
218
+ max_loops = config.max_research_loops
219
+ model_name = config.model.value
220
+
221
+ # Avoid hardcoded values
222
+ max_loops = 5 # Bad
223
+ ```
224
+
225
+ ### Agent Definitions
226
+ Pattern used in `agents.py`:
227
+ ```python
228
+ AGENT_NAME = Agent(
229
+ model=model,
230
+ output_type=OutputType,
231
+ system_prompt=PROMPT_CONSTANT,
232
+ retries=5,
233
+ instrument=True, # For logfire tracking
234
+ )
235
+ ```
236
+
237
+ ## Module and Package Structure
238
+
239
+ ### `__init__.py` Files
240
+ `__init__.py` files should be kept minimal. Their primary purpose is to define a package and expose its public API.
241
+
242
+ - **Good**: Use for imports, `__all__`, and package-level docstrings.
243
+ ```python
244
+ # src/pytest-assay/mcp/__init__.py
245
+ """MCP server implementations for pytest-assay."""
246
+ from .server import date_server
247
+
248
+ __all__ = ["date_server"]
249
+ ```
250
+
251
+ - **Bad**: Avoid defining functions, classes, or complex logic directly in `__init__.py`. This can lead to import side effects and makes the code harder to navigate. Move such code into separate modules (e.g., `server.py`) and import them.
252
+
253
+ ## Docstrings
254
+
255
+ Use Google-style docstrings (not Sphinx style):
256
+
257
+ ### Formatting Rules
258
+ - **No backticks**: Do not use backticks in docstrings. Write `--assay-mode` as --assay-mode, `None` as None, `True` as True, etc.
259
+ - Google style uses plain text for parameter names, options, and values
260
+ - For single-line docstrings, place initial and final triple quotes on the same line:
261
+ ```python
262
+ def add(a: int, b: int) -> int:
263
+ """Add two integers."""
264
+ return a + b
265
+ ```
266
+ - For multi-line docstrings, place initial and final triple quotes on their own lines:
267
+ ```python
268
+ def complex_function(param1: str, param2: int = 5) -> dict[str, Any]:
269
+ """
270
+ Short one-line description.
271
+
272
+ Longer description if needed, explaining the function's purpose,
273
+ behavior, and any important details.
274
+
275
+ Args:
276
+ param1: Description of param1.
277
+ param2: Description of param2. Defaults to 5.
278
+
279
+ Returns:
280
+ Description of return value.
281
+
282
+ Raises:
283
+ ValueError: When param2 is negative.
284
+ """
285
+ ...
286
+ ```
287
+
288
+ ## Before Committing
289
+
290
+ Run these checks:
291
+ ```bash
292
+ # 1. Format code
293
+ uv run ruff format .
294
+
295
+ # 2. Check and fix linting issues
296
+ uv run ruff check --fix .
297
+
298
+ # 3. Type checking
299
+ uv run pyright .
300
+
301
+ # 4. Run tests
302
+ uv run pytest -n auto
303
+ ```
304
+
305
+ **Note:** The `--fix` flag automatically resolves fixable linting issues. Always verify the changes it makes are appropriate.