serenecode 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {serenecode-0.2.0 → serenecode-0.3.0}/.gitignore +5 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/PKG-INFO +135 -22
- {serenecode-0.2.0 → serenecode-0.3.0}/README.md +132 -21
- {serenecode-0.2.0 → serenecode-0.3.0}/SERENECODE.md +184 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/pyproject.toml +4 -2
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/coverage_adapter.py +5 -2
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/crosshair_adapter.py +62 -51
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/hypothesis_adapter.py +5 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/compositional.py +4 -4
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/coverage.py +0 -4
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/properties.py +2 -2
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/spec_traceability.py +6 -4
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/structural.py +1256 -6
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/symbolic.py +2 -2
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/types.py +2 -2
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/cli.py +149 -9
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/config.py +121 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/core/pipeline.py +2 -1
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/init.py +42 -0
- serenecode-0.3.0/src/serenecode/mcp/__init__.py +23 -0
- serenecode-0.3.0/src/serenecode/mcp/resources.py +118 -0
- serenecode-0.3.0/src/serenecode/mcp/schemas.py +106 -0
- serenecode-0.3.0/src/serenecode/mcp/server.py +234 -0
- serenecode-0.3.0/src/serenecode/mcp/tools.py +858 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/models.py +1 -4
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/templates/content.py +45 -0
- serenecode-0.3.0/tests/e2e/test_cli_branches.py +478 -0
- serenecode-0.3.0/tests/e2e/test_init_command.py +298 -0
- serenecode-0.3.0/tests/e2e/test_mcp_command.py +43 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/edge_cases/async_functions.py +1 -1
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/valid/full_module.py +1 -1
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_coverage_adapter.py +165 -0
- serenecode-0.3.0/tests/integration/test_crosshair_adapter_helpers.py +687 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_file_adapter.py +25 -0
- serenecode-0.3.0/tests/integration/test_hypothesis_adapter_helpers.py +711 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_module_loader.py +22 -0
- serenecode-0.3.0/tests/integration/test_resources.py +96 -0
- serenecode-0.3.0/tests/integration/test_schemas.py +135 -0
- serenecode-0.3.0/tests/integration/test_server.py +70 -0
- serenecode-0.3.0/tests/integration/test_tools.py +647 -0
- serenecode-0.3.0/tests/unit/checker/test_compositional_helpers.py +286 -0
- serenecode-0.3.0/tests/unit/checker/test_structural.py +2180 -0
- serenecode-0.3.0/tests/unit/checker/test_structural_helpers.py +811 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_symbolic.py +79 -1
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_types.py +37 -1
- serenecode-0.3.0/tests/unit/test_api.py +272 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/test_config.py +51 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/test_models.py +26 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/test_pipeline.py +30 -0
- serenecode-0.3.0/tests/unit/test_source_discovery.py +395 -0
- serenecode-0.3.0/tests/unit/test_templates_content.py +74 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/uv.lock +577 -2
- serenecode-0.2.0/.env +0 -1
- serenecode-0.2.0/tests/e2e/test_init_command.py +0 -109
- serenecode-0.2.0/tests/unit/checker/test_structural.py +0 -1083
- serenecode-0.2.0/tests/unit/test_api.py +0 -154
- serenecode-0.2.0/tests/unit/test_source_discovery.py +0 -208
- {serenecode-0.2.0 → serenecode-0.3.0}/CLAUDE.md +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/LICENSE +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/DOSAGE_CALC_SPEC.md +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-regular/dosage_calc.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-regular/test_dosage_calc.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/CLAUDE.md +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/SERENECODE.md +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/SPEC.md +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/pyproject.toml +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/src/dosage/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/src/dosage/core/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/src/dosage/core/dosage.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/src/dosage/core/models.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/src/dosage/core/safety.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/tests/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/tests/unit/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/tests/unit/test_dosage.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/tests/unit/test_models.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/tests/unit/test_safety.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/examples/dosage-serenecode/uv.lock +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/serenecode.jpg +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/local_fs.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/module_loader.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/adapters/mypy_adapter.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/checker/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/contracts/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/contracts/predicates.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/core/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/core/exceptions.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/ports/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/ports/coverage_analyzer.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/ports/file_system.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/ports/property_tester.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/ports/symbolic_checker.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/ports/type_checker.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/reporter.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/source_discovery.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/src/serenecode/templates/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/conftest.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/e2e/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/e2e/test_check_command.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/e2e/test_cli.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/e2e/test_init.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/e2e/test_report_command.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/e2e/test_status_command.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/edge_cases/aliased_import.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/edge_cases/empty_module.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/edge_cases/from_import.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/invalid/broken_postcondition.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/invalid/io_in_core.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/invalid/missing_contracts.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/invalid/missing_invariant.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/invalid/missing_types.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/valid/class_with_invariant.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/fixtures/valid/simple_function.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_adapter_internals.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_checkers_real_code.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_crosshair_adapter.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_example_projects.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_hypothesis_adapter.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_local_fs.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/integration/test_mypy_adapter.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_compositional.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_coverage.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_properties.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_spec_traceability.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/checker/test_structural_hypothesis.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/contracts/__init__.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/contracts/test_predicates.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/contracts/test_predicates_hypothesis.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/test_models_hypothesis.py +0 -0
- {serenecode-0.2.0 → serenecode-0.3.0}/tests/unit/test_reporter.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: serenecode
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Verification framework for AI-generated Python — test coverage, property testing, and symbolic execution
|
|
5
5
|
Project-URL: Homepage, https://github.com/helgster77/serenecode
|
|
6
6
|
Project-URL: Repository, https://github.com/helgster77/serenecode
|
|
@@ -29,6 +29,8 @@ Requires-Dist: mypy>=1.0
|
|
|
29
29
|
Provides-Extra: dev
|
|
30
30
|
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
31
31
|
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
32
|
+
Provides-Extra: mcp
|
|
33
|
+
Requires-Dist: mcp>=1.0; extra == 'mcp'
|
|
32
34
|
Description-Content-Type: text/markdown
|
|
33
35
|
|
|
34
36
|
<p align="center">
|
|
@@ -37,9 +39,11 @@ Description-Content-Type: text/markdown
|
|
|
37
39
|
|
|
38
40
|
<h3 align="center">A Framework for AI-Driven Development of Verifiable Systems</h3>
|
|
39
41
|
|
|
40
|
-
SereneCode is a spec-to-verified-implementation framework for AI-generated Python. It ensures that every requirement in your spec is implemented, tested, and formally verified — closing the gap between what you asked for and what the AI built. The workflow starts from a spec with traceable requirements (REQ-xxx), enforces that the AI writes verifiable code with contracts and tests, then verifies at multiple levels — from structural checks and test coverage through property-based testing to symbolic execution with an SMT solver. You choose the verification depth during interactive setup: lightweight for internal tools, balanced for production systems, strict for safety-critical code.
|
|
42
|
+
SereneCode is a spec-to-verified-implementation framework for AI-generated Python. It ensures that every requirement in your spec is implemented, tested, and formally verified — closing the gap between what you asked for and what the AI built. The workflow starts from a spec with traceable requirements (REQ-xxx), enforces that the AI writes verifiable code with contracts and tests, then verifies at multiple levels — from structural checks and test coverage through property-based testing to symbolic execution with an SMT solver. You choose the verification depth during interactive setup: lightweight for internal tools, balanced for production systems, strict for safety-critical code.
|
|
41
43
|
|
|
42
|
-
|
|
44
|
+
SereneCode also ships a built-in **MCP server** so verification runs *inside* your AI assistant's edit loop, not just at the end. Once registered with Claude Code, Cursor, Cline, or Continue, the agent calls verification tools after every function it writes — getting structured findings, contract suggestions, and counterexamples back as JSON, fixing them mid-turn, and only reporting the work complete when the result is clean. AI agents write code fast but can miss requirements and skip edge cases; SereneCode closes that gap with spec traceability, test-existence enforcement, formal verification, and an MCP-driven inner loop the agent can drive on itself.
|
|
45
|
+
|
|
46
|
+
> **This framework was bootstrapped with AI under its own rules.** SereneCode's SERENECODE.md was written before the first line of code, and the codebase has been developed under those conventions from the start — including the MCP server, which the same AI agents now use to verify their own work mid-edit. The current tree passes its own `serenecode check src --level 6 --allow-code-execution` end-to-end via the bare CLI (718 functions checked, 557 passed, 161 exempt; ~6 minutes wall time), an internal strict-config Level 6 self-check in the test suite (`pytest tests/integration/test_example_projects.py::test_serenecode_repo_passes_strict_level_6`, which exercises L4-L6 against `strict_config` over the full source tree), `mypy src examples/dosage-serenecode/src`, the shipped dosage example's own `serenecode check src --level 6 --allow-code-execution`, and the full `pytest` suite (1,393 passing tests, 16 skipped). The verification output is transparent about scope: exempt modules (adapters, CLI, ports, MCP server, `__init__.py`) and functions excluded from deep verification (non-primitive parameter types) are reported as "exempt" rather than silently omitted.
|
|
43
47
|
|
|
44
48
|
---
|
|
45
49
|
|
|
@@ -49,6 +53,34 @@ AI writes code fast. But *fast* and *correct* aren't the same thing. When you're
|
|
|
49
53
|
|
|
50
54
|
The problem is that formal verification has always been expensive — too slow, too manual, too specialized. SereneCode makes it tractable by controlling the process from the start: a convention file tells the AI to write verification-ready code, a structural linter checks it followed the rules, and CrossHair + Z3 search for contract violations via symbolic execution.
|
|
51
55
|
|
|
56
|
+
### Common AI failure modes SereneCode is built to catch
|
|
57
|
+
|
|
58
|
+
After enough hours pair-programming with coding agents, the same handful of mistakes show up over and over. They're not random bugs — they're systematic patterns that follow from how a language model writes code: optimize for the happy path, infer intent from limited context, finish the visible task, leave implicit assumptions implicit. SereneCode treats each one as a verification target rather than a code-review target.
|
|
59
|
+
|
|
60
|
+
- **Skipping requirements without realizing it.** Given a spec with twelve requirements, the agent writes code that handles eight of them confidently and quietly omits the four it didn't see a clean place for. There's no error, no TODO — just missing behavior. SereneCode's spec traceability (REQ-xxx tags, `Implements:` / `Verifies:` references, the `serenecode_orphans` and `serenecode_req_status` tools) makes it impossible for a requirement to be silently dropped: every REQ in SPEC.md must be both implemented and tested or it shows up as an orphan.
|
|
61
|
+
|
|
62
|
+
- **Happy-path tests only.** Asked to "add tests," the agent writes a handful of cases that walk the obvious path through the function. Edge cases (empty input, off-by-one boundaries, the exact threshold value, the negative number, the unicode string) are routinely missed because they require imagining what could go wrong. L3 coverage catches uncovered branches; L4 Hypothesis property testing generates inputs the agent never thought of and runs them against the contracts.
|
|
63
|
+
|
|
64
|
+
- **Stub residue and "I'll come back to this."** A function the agent didn't quite know how to write often ships as `pass`, `...`, or `raise NotImplementedError("TODO")` — and then never gets revisited because the test suite doesn't fail on it. L1's stub-residue check flags these immediately.
|
|
65
|
+
|
|
66
|
+
- **Weak or tautological postconditions.** Asked to add a contract, the agent reaches for whatever satisfies the structural checker without actually constraining behavior: `lambda result: isinstance(result, int)` on a function that already returns `int`, or `lambda result: True`. These pass every check but verify nothing. L1 flags both patterns.
|
|
67
|
+
|
|
68
|
+
- **Silent exception handling.** `try: risky() / except Exception: pass` is a load-bearing anti-pattern in agent-generated code. The agent encounters an error during testing, decides the cleanest fix is to swallow it, and ships code where real failures vanish into the void. L1 flags any handler whose body is `pass`, `...`, `continue`, `break`, or a bare `return` and demands a meaningful response or an explicit `# silent-except: <reason>` opt-out.
|
|
69
|
+
|
|
70
|
+
- **Mutable default arguments.** `def f(x=[])` is a Python footgun every senior developer learned to avoid the hard way. Agents reproduce it cheerfully because they pattern-match on shape, not language semantics. L1 catches it by default.
|
|
71
|
+
|
|
72
|
+
- **Bare `assert` as a runtime check.** Agents reach for `assert x > 0` to "validate input" — but assertions disappear under `python -O`. The check vanishes silently in any production environment that strips them. L1 flags asserts in non-test source.
|
|
73
|
+
|
|
74
|
+
- **`print()` debug residue.** Trace prints from the agent's own debugging session ship to production because nobody pruned them. L1 catches `print()` in core modules.
|
|
75
|
+
|
|
76
|
+
- **Unsafe deserialization and shell calls.** `eval`, `exec`, `pickle.loads` on untrusted input, and `subprocess.run(..., shell=True)` are calls a security-aware human writes only with a comment explaining why. Agents reach for them when the simple solution looks fastest. L1 flags every one, requiring an `# allow-dangerous: <reason>` opt-out for the rare legitimate case.
|
|
77
|
+
|
|
78
|
+
- **Tests that pass but verify nothing.** A `def test_foo(): foo()` with no `assert` runs successfully and counts as "covered" — but it only checks that the function doesn't raise. L1's no-assertions-in-tests check fires on any `test_*` function with no `assert`, `pytest.raises`, `pytest.fail`, or `self.assertX` call.
|
|
79
|
+
|
|
80
|
+
- **Architectural drift.** Asked to "add a feature," the agent puts I/O in core, business logic in adapters, and circular imports between them. The system still works in tests because everything is loaded together — but the layering rule that made the code testable in the first place is gone. L6 compositional checks enforce dependency direction, interface compliance, and contract presence at module boundaries.
|
|
81
|
+
|
|
82
|
+
None of these failures are unique to AI; humans make them too. What's unique is the *rate* at which an agent produces them and the *confidence* with which the agent reports the work as done. The structural checker, the contracts, the property tester, and the symbolic search exist to make each pattern impossible to ship without an explicit, reviewed override.
|
|
83
|
+
|
|
52
84
|
SereneCode is designed for **building new verifiable systems from scratch with AI**, not for retrofitting verification onto large existing codebases. The conventions go in before the first line of code, and every module is written with verification in mind from day one. That's what makes it work. SereneCode is a best-effort tool, not a guarantee — see the [Disclaimer](#disclaimer) for important limitations on what it can and cannot assure.
|
|
53
85
|
|
|
54
86
|
### Choosing the Right Level
|
|
@@ -69,6 +101,8 @@ Pick the level that matches the stakes. Safety-critical code should start at Str
|
|
|
69
101
|
|
|
70
102
|
## See It In Action: The Medical Dosage Calculator
|
|
71
103
|
|
|
104
|
+
> **This is a hypothetical example for demonstration purposes only.** The medical dosage calculator is not a real, clinically validated, or regulator-approved tool. It is not derived from any actual drug-dosing protocol, has not been reviewed by medical professionals, and must not be used for any clinical decision-making. Its purpose is solely to illustrate how SereneCode shapes the way an AI agent writes verifiable code in a context where contracts and bounded symbolic search would matter. Building real medical software requires domain experts, regulated processes, and assurance methodologies that go far beyond what this framework provides.
|
|
105
|
+
|
|
72
106
|
We built the same medical dosage calculator twice from the same spec — once with plain AI, once with SereneCode — to show the difference.
|
|
73
107
|
|
|
74
108
|
Both versions implement four functions: dose calculation with weight-based dosing and max caps, renal function adjustment with tiered CrCl thresholds, daily safety checks with explicit total-versus-threshold calculations, and contraindication detection across current medications.
|
|
@@ -114,6 +148,8 @@ This creates SERENECODE.md (project conventions including spec traceability) and
|
|
|
114
148
|
|
|
115
149
|
A lightweight AST-based checker that validates code follows SERENECODE.md conventions in seconds. Missing a postcondition? No class invariant? No test file for a module? Caught before you waste time on heavy verification.
|
|
116
150
|
|
|
151
|
+
L1 also catches AI-failure-mode patterns that compile and look correct but represent real bugs: stub residue (`pass`/`...`/`raise NotImplementedError` left as a function body), mutable default arguments, bare `assert` in non-test source, `print()` in core, dangerous calls (`eval`, `exec`, `pickle.loads`, `os.system`, `subprocess` with `shell=True`), `TODO`/`FIXME`/`XXX`/`HACK` markers in tracked files, tests with no assertions, silent exception handlers, and tautological postconditions. Each rule has a per-rule opt-out comment for legitimate exceptions; see SERENECODE.md "Code Quality Standards" for the full list.
|
|
152
|
+
|
|
117
153
|
```bash
|
|
118
154
|
serenecode check src/ --structural # structural conventions
|
|
119
155
|
serenecode check src/ --spec SPEC.md # + spec traceability
|
|
@@ -144,29 +180,91 @@ The full pipeline is thorough but not instant. Larger systems will take longer,
|
|
|
144
180
|
|
|
145
181
|
Levels 3-6 import and execute project modules so coverage.py, Hypothesis, and CrossHair can exercise real code. Deep runs therefore require explicit `--allow-code-execution` and should only be used on trusted code.
|
|
146
182
|
|
|
183
|
+
### 4. The MCP Server — Verification Inside the Agent's Edit Loop
|
|
184
|
+
|
|
185
|
+
SereneCode ships a built-in MCP (Model Context Protocol) server that exposes the entire verification pipeline as tools that any MCP-speaking AI coding assistant can call *while it writes code*, not just at the end. Instead of waiting for `serenecode check` to run at the end of a feature, the agent calls `serenecode_check_function` after every function it writes, sees structured findings inline, fixes them, and only reports the work complete when the result is clean. This collapses the feedback loop from minutes-after-writing to seconds-while-writing and turns SereneCode from a batch tool you run *at* the agent into a peer tool the agent uses *on itself*.
|
|
186
|
+
|
|
187
|
+
**Setup (one-time):**
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
uv add 'serenecode[mcp]'
|
|
191
|
+
claude mcp add serenecode -- uv run serenecode mcp # read-only (L1, L2)
|
|
192
|
+
claude mcp add serenecode -- uv run serenecode mcp --allow-code-execution # all six levels
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
The same `serenecode mcp` stdio server works in Claude Code, Cursor, Cline, Continue, and any other MCP client.
|
|
196
|
+
|
|
197
|
+
**Tools the agent can call:**
|
|
198
|
+
|
|
199
|
+
| Tool | What it does |
|
|
200
|
+
|---|---|
|
|
201
|
+
| `serenecode_check` | Run the full pipeline on a project root |
|
|
202
|
+
| `serenecode_check_file` | Pipeline scoped to one source file |
|
|
203
|
+
| `serenecode_check_function` | Pipeline scoped to one function — the inner-loop tool |
|
|
204
|
+
| `serenecode_verify_fixed` | Re-run on one function and report whether a specific finding is gone |
|
|
205
|
+
| `serenecode_suggest_contracts` | Derive `@require`/`@ensure` decorators from a function signature |
|
|
206
|
+
| `serenecode_uncovered` | L3 coverage findings for one function (uncovered lines + mock advice) |
|
|
207
|
+
| `serenecode_suggest_test` | Test scaffold for an uncovered function |
|
|
208
|
+
| `serenecode_validate_spec` | Validate a SPEC.md is well-formed |
|
|
209
|
+
| `serenecode_list_reqs` | List REQ-xxx identifiers in a SPEC.md |
|
|
210
|
+
| `serenecode_req_status` | Implementation/verification status of one REQ |
|
|
211
|
+
| `serenecode_orphans` | REQs with no implementation or no test |
|
|
212
|
+
|
|
213
|
+
**Read-only resources** the agent can fetch without "calling" anything: `serenecode://config` (active SerenecodeConfig as JSON), `serenecode://findings/last-run` (most recent CheckResponse from this server session), `serenecode://exempt-modules` (the exempt path patterns for the active config), `serenecode://reqs` (parsed REQ-xxx list from the project's SPEC.md).
|
|
214
|
+
|
|
215
|
+
The server-level `--allow-code-execution` flag mirrors the CLI: without it, Levels 3-6 tools return a structured error rather than importing project code. `serenecode init` writes a copy-pasteable MCP setup snippet into the generated CLAUDE.md so newly initialized projects ship with the registration command and recommended workflow. See SERENECODE.md "MCP Integration" for the full descriptions and the agent-side workflow.
|
|
216
|
+
|
|
147
217
|
Scoped targets keep their package/import context across verification levels. In practice that means commands like `serenecode check src/core/ --level 4 --allow-code-execution` and `serenecode check src/core/models.py --level 3 --allow-code-execution` use the same local import roots and architectural module paths as a project-wide run instead of breaking relative imports or scoped core-module rules. Those scoped core/exemption rules are matched on path segments, not raw substrings, so names like `notcli.py`, `viewmodels.py`, and `transports/` do not accidentally change policy classification. Standalone files with non-importable names are also targeted correctly for CrossHair via `file.py:line` references.
|
|
148
218
|
|
|
149
219
|
---
|
|
150
220
|
|
|
151
221
|
## The AI Agent Loop
|
|
152
222
|
|
|
153
|
-
SereneCode is designed for spec-driven development with AI agents:
|
|
223
|
+
SereneCode is designed for spec-driven development with AI agents. The loop has two layers — a fast inner loop the agent drives on itself through the MCP server, and an outer batch loop for full project verification:
|
|
154
224
|
|
|
155
225
|
```
|
|
156
|
-
|
|
157
|
-
serenecode
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
serenecode
|
|
161
|
-
serenecode
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
226
|
+
─── one-time setup ──────────────────────────────────────────────────────────
|
|
227
|
+
serenecode init → spec mode + verification level
|
|
228
|
+
(also offers MCP server setup)
|
|
229
|
+
claude mcp add serenecode -- uv run \ → register MCP server with the AI
|
|
230
|
+
serenecode mcp --allow-code-execution tool (Claude Code, Cursor, ...)
|
|
231
|
+
serenecode spec SPEC.md → validate spec is ready
|
|
232
|
+
(REQ-xxx format, no gaps)
|
|
233
|
+
|
|
234
|
+
─── inner loop (per function, driven by the agent through MCP) ──────────────
|
|
235
|
+
AI reads SERENECODE.md + SPEC.md → conventions and what to build
|
|
236
|
+
AI calls serenecode_suggest_contracts → derive @require/@ensure for the
|
|
237
|
+
function it's about to write
|
|
238
|
+
AI writes the function → with Implements: REQ-xxx tag
|
|
239
|
+
AI calls serenecode_check_function → L1-L4 scoped to that function
|
|
240
|
+
AI reads structured findings → missing contracts, mutable
|
|
241
|
+
defaults, weak postconditions,
|
|
242
|
+
uncovered branches, etc.
|
|
243
|
+
AI fixes them and calls verify_fixed → confirms each finding is gone
|
|
244
|
+
before moving on to the next
|
|
245
|
+
function
|
|
246
|
+
|
|
247
|
+
─── outer loop (per feature, batch verification) ─────────────────────────────
|
|
248
|
+
serenecode check src/ --spec SPEC.md → did the AI follow conventions?
|
|
249
|
+
--structural all REQs covered?
|
|
250
|
+
serenecode check src/ --level 5 \ → deep verification: coverage,
|
|
251
|
+
--allow-code-execution \ property testing, symbolic search
|
|
252
|
+
--spec SPEC.md
|
|
253
|
+
AI calls serenecode_orphans / → which REQs are unimplemented or
|
|
254
|
+
serenecode_req_status untested?
|
|
255
|
+
AI fixes the gaps → adds implementations, tests,
|
|
256
|
+
stronger contracts
|
|
257
|
+
Repeat until verified → all REQs implemented + tested,
|
|
258
|
+
no counterexamples within bounds
|
|
165
259
|
```
|
|
166
260
|
|
|
261
|
+
The inner loop is what the MCP server enables. Before MCP, the agent had to finish writing, exit its turn, wait for `serenecode check` to run, parse the output, and iterate. With MCP, every function the agent writes gets validated *before* it moves to the next one — `serenecode_check_function` returns structured JSON within seconds, the agent fixes any findings inline, and only reports the overall task complete when the result is clean. This collapses an iteration loop that used to span multiple turns into a sequence of tool calls inside a single turn.
|
|
262
|
+
|
|
263
|
+
The outer loop still matters: cross-module compositional analysis, full coverage runs, and spec-traceability sweeps over the whole codebase aren't function-scoped, so they live at the batch level. The CLI handles those, and the same pipeline runs identically in CI.
|
|
264
|
+
|
|
167
265
|
AI-generated code won't always pass verification on the first try — and that's the point. SereneCode gives the coding agent structured feedback on exactly what failed and why: missing requirement implementations, counterexamples, violated contracts, untested modules, and suggested fixes. When there are many findings, SereneCode suggests the agent spawn subagents to address groups of related issues in parallel. **The value isn't in one-shotting perfection — it's in the loop that converges on verified completeness and correctness.**
|
|
168
266
|
|
|
169
|
-
Works in Claude Code, works in the terminal, works in CI:
|
|
267
|
+
Works in Claude Code, works in Cursor / Cline / Continue (via the same MCP server), works in the terminal, works in CI:
|
|
170
268
|
|
|
171
269
|
```python
|
|
172
270
|
import serenecode
|
|
@@ -181,6 +279,8 @@ for failure in result.failures:
|
|
|
181
279
|
print(detail.suggestion) # proposed fix direction
|
|
182
280
|
```
|
|
183
281
|
|
|
282
|
+
The library API (`serenecode.check`), the CLI, and the MCP server (`serenecode_check`, `serenecode_check_function`) all call into the same pipeline, so verification semantics are identical whether an agent is calling tools, a developer is running the CLI, or CI is invoking the Python API.
|
|
283
|
+
|
|
184
284
|
---
|
|
185
285
|
|
|
186
286
|
## Built With Its Own Medicine
|
|
@@ -189,11 +289,11 @@ SereneCode isn't just a tool that *tells* you to write verified code. It *is* ve
|
|
|
189
289
|
|
|
190
290
|
The SERENECODE.md convention file was the first artifact created — before any Python was written. The framework has been developed under those conventions with AI as a first-class contributor, and the repository continuously checks itself with:
|
|
191
291
|
|
|
192
|
-
- `pytest` across the full suite (currently
|
|
292
|
+
- `pytest` across the full suite (currently 1,393 passing tests, 16 skipped)
|
|
193
293
|
- `mypy --strict` across `src/` and `examples/dosage-serenecode/src/`
|
|
194
294
|
- SereneCode's own structural, type, property, symbolic, and compositional passes
|
|
195
295
|
|
|
196
|
-
On the current tree, `serenecode check src --level 6 --allow-code-execution` runs all
|
|
296
|
+
On the current tree, the bare CLI invocation `serenecode check src --level 6 --allow-code-execution` runs the full L1-L6 pipeline end-to-end against the framework's own source — 718 functions checked, 557 passed, 161 exempt, 0 failures, ~6 minutes wall time. A separate integration test, `test_serenecode_repo_passes_strict_level_6`, runs the same source tree through `run_pipeline` with `strict_config()` and `start_level=4`, which strips every path-based exemption and forces every adapter, CLI handler, MCP tool, and `__init__.py` through L4-L6. SereneCode also passes that strict-config self-check end-to-end: 0 L1 findings across all 466 strict-checked functions, 0 L3 coverage gaps across the strict-checked subset (~3.5 minutes), and 0 L4-L6 findings. The exempt items in the default-config run include adapter modules (which handle I/O and are integration-tested), port interfaces (Protocols that define abstract contracts), CLI entry points, the MCP server package, and functions whose parameter types are too complex for automated strategy generation or symbolic execution. Exempt items are visible in the output — they are not silently omitted.
|
|
197
297
|
|
|
198
298
|
At Level 5, CrossHair and Z3 search for counterexamples across the codebase's symbolic-friendly contracted top-level functions. Functions with non-primitive parameters (custom dataclasses, Protocol implementations, Callable types) are reported as exempt because the solver cannot generate inputs for them. Level 6 adds structural compositional analysis: dependency direction, circular dependency detection, interface compliance, contract presence at module boundaries, aliased cross-module call resolution, and architectural invariants. Interface compliance follows explicit `Protocol` inheritance and checks substitutability, including extra required parameters and incompatible return annotations. Together, they provide both deep per-function verification and system-level structural guarantees — but the structural checks at L6 verify contract *presence*, not logical *sufficiency* across call chains.
|
|
199
299
|
|
|
@@ -202,17 +302,27 @@ At Level 5, CrossHair and Z3 search for counterexamples across the codebase's sy
|
|
|
202
302
|
## Quick Start
|
|
203
303
|
|
|
204
304
|
```bash
|
|
205
|
-
# Install from PyPI
|
|
206
|
-
|
|
305
|
+
# Install from PyPI (add the [mcp] extra to enable the MCP server).
|
|
306
|
+
# Note: the MCP server ships in the next release; until it's published
|
|
307
|
+
# to PyPI, install from the source checkout instead:
|
|
308
|
+
# git clone https://github.com/helgster77/serenecode && cd serenecode
|
|
309
|
+
# uv sync --extra mcp # or: pip install -e '.[mcp]'
|
|
310
|
+
pip install 'serenecode[mcp]'
|
|
207
311
|
|
|
208
312
|
# Initialize — interactive setup (spec mode + verification level)
|
|
209
313
|
serenecode init
|
|
210
314
|
|
|
315
|
+
# Register the MCP server with your AI coding tool so verification
|
|
316
|
+
# runs inside the agent's edit loop, not just at the end:
|
|
317
|
+
claude mcp add serenecode -- uv run serenecode mcp --allow-code-execution
|
|
318
|
+
# (Cursor, Cline, Continue, and other MCP clients work the same way)
|
|
319
|
+
|
|
211
320
|
# Place your spec in the project directory, then start a coding session.
|
|
212
321
|
# Your agent reads SERENECODE.md, converts the spec to REQ-xxx format,
|
|
213
|
-
# validates it, creates an implementation plan, and builds from it
|
|
322
|
+
# validates it, creates an implementation plan, and builds from it —
|
|
323
|
+
# calling serenecode_check_function after every function it writes.
|
|
214
324
|
|
|
215
|
-
# Verify structure + spec traceability:
|
|
325
|
+
# Verify structure + spec traceability from the CLI:
|
|
216
326
|
serenecode check src/ --spec SPEC.md --structural
|
|
217
327
|
|
|
218
328
|
# Go deep — test coverage, property testing, symbolic verification:
|
|
@@ -239,6 +349,8 @@ serenecode check [<path>] [--level 1-6] [--allow-code-execution] # run ve
|
|
|
239
349
|
serenecode status [<path>] [--format human|json] # verification status
|
|
240
350
|
serenecode report [<path>] [--format human|json|html] # generate reports
|
|
241
351
|
[--output FILE] [--allow-code-execution] # write to file
|
|
352
|
+
serenecode mcp [--allow-code-execution] # boot the MCP server
|
|
353
|
+
[--project-root DIR] # over stdio
|
|
242
354
|
```
|
|
243
355
|
|
|
244
356
|
**Exit codes:** 0 = passed, 1 = structural, 2 = types, 3 = coverage, 4 = properties, 5 = symbolic, 6 = compositional, 10 = internal error or deep verification refused without explicit trust.
|
|
@@ -253,7 +365,7 @@ SereneCode is honest about what it can and can't do:
|
|
|
253
365
|
|
|
254
366
|
**Contracts are only as good as you write them.** A function with weak postconditions will pass verification even if the implementation is subtly wrong. SereneCode checks that contracts exist and hold, but can't check that they fully capture your intent. Tautological contracts like `lambda self: True` are now flagged by the conventions and should not be used — they provide no verification value.
|
|
255
367
|
|
|
256
|
-
**Exempt items are visible, not hidden.** Modules exempt from structural checking (adapters, CLI, ports, `__init__.py`) and functions excluded from deep verification (non-primitive parameter types, adapter code) are reported as "exempt" in the output rather than being silently omitted. This makes the verification scope transparent: the tool reports passed, failed, skipped, and exempt counts separately so you can see exactly what was and wasn't deeply verified. Previous versions silently omitted these, inflating the apparent scope.
|
|
368
|
+
**Exempt items are visible, not hidden.** Modules exempt from structural checking (adapters, CLI, ports, MCP server, `__init__.py`) and functions excluded from deep verification (non-primitive parameter types, adapter code) are reported as "exempt" in the output rather than being silently omitted. This makes the verification scope transparent: the tool reports passed, failed, skipped, and exempt counts separately so you can see exactly what was and wasn't deeply verified. Previous versions silently omitted these, inflating the apparent scope.
|
|
257
369
|
|
|
258
370
|
**Runtime checks can be disabled.** icontract decorators are checked on every call by default, but can be disabled via environment variables for performance in production. This is a feature, not a bug — but it means runtime guarantees depend on configuration.
|
|
259
371
|
|
|
@@ -274,7 +386,8 @@ SereneCode is honest about what it can and can't do:
|
|
|
274
386
|
SereneCode follows hexagonal architecture — the same pattern it enforces on your code:
|
|
275
387
|
|
|
276
388
|
```
|
|
277
|
-
CLI / Library API
|
|
389
|
+
CLI / Library API / MCP ← composition roots (interactive init, spec validation,
|
|
390
|
+
│ MCP server for AI agents)
|
|
278
391
|
│
|
|
279
392
|
├──▸ Pipeline ← orchestrates L1 → L2 → L3 → L4 → L5 → L6
|
|
280
393
|
│ ├──▸ Structural Checker (ast)
|
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
|
|
5
5
|
<h3 align="center">A Framework for AI-Driven Development of Verifiable Systems</h3>
|
|
6
6
|
|
|
7
|
-
SereneCode is a spec-to-verified-implementation framework for AI-generated Python. It ensures that every requirement in your spec is implemented, tested, and formally verified — closing the gap between what you asked for and what the AI built. The workflow starts from a spec with traceable requirements (REQ-xxx), enforces that the AI writes verifiable code with contracts and tests, then verifies at multiple levels — from structural checks and test coverage through property-based testing to symbolic execution with an SMT solver. You choose the verification depth during interactive setup: lightweight for internal tools, balanced for production systems, strict for safety-critical code.
|
|
7
|
+
SereneCode is a spec-to-verified-implementation framework for AI-generated Python. It ensures that every requirement in your spec is implemented, tested, and formally verified — closing the gap between what you asked for and what the AI built. The workflow starts from a spec with traceable requirements (REQ-xxx), enforces that the AI writes verifiable code with contracts and tests, then verifies at multiple levels — from structural checks and test coverage through property-based testing to symbolic execution with an SMT solver. You choose the verification depth during interactive setup: lightweight for internal tools, balanced for production systems, strict for safety-critical code.
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
SereneCode also ships a built-in **MCP server** so verification runs *inside* your AI assistant's edit loop, not just at the end. Once registered with Claude Code, Cursor, Cline, or Continue, the agent calls verification tools after every function it writes — getting structured findings, contract suggestions, and counterexamples back as JSON, fixing them mid-turn, and only reporting the work complete when the result is clean. AI agents write code fast but can miss requirements and skip edge cases; SereneCode closes that gap with spec traceability, test-existence enforcement, formal verification, and an MCP-driven inner loop the agent can drive on itself.
|
|
10
|
+
|
|
11
|
+
> **This framework was bootstrapped with AI under its own rules.** SereneCode's SERENECODE.md was written before the first line of code, and the codebase has been developed under those conventions from the start — including the MCP server, which the same AI agents now use to verify their own work mid-edit. The current tree passes its own `serenecode check src --level 6 --allow-code-execution` end-to-end via the bare CLI (718 functions checked, 557 passed, 161 exempt; ~6 minutes wall time), an internal strict-config Level 6 self-check in the test suite (`pytest tests/integration/test_example_projects.py::test_serenecode_repo_passes_strict_level_6`, which exercises L4-L6 against `strict_config` over the full source tree), `mypy src examples/dosage-serenecode/src`, the shipped dosage example's own `serenecode check src --level 6 --allow-code-execution`, and the full `pytest` suite (1,393 passing tests, 16 skipped). The verification output is transparent about scope: exempt modules (adapters, CLI, ports, MCP server, `__init__.py`) and functions excluded from deep verification (non-primitive parameter types) are reported as "exempt" rather than silently omitted.
|
|
10
12
|
|
|
11
13
|
---
|
|
12
14
|
|
|
@@ -16,6 +18,34 @@ AI writes code fast. But *fast* and *correct* aren't the same thing. When you're
|
|
|
16
18
|
|
|
17
19
|
The problem is that formal verification has always been expensive — too slow, too manual, too specialized. SereneCode makes it tractable by controlling the process from the start: a convention file tells the AI to write verification-ready code, a structural linter checks it followed the rules, and CrossHair + Z3 search for contract violations via symbolic execution.
|
|
18
20
|
|
|
21
|
+
### Common AI failure modes SereneCode is built to catch
|
|
22
|
+
|
|
23
|
+
After enough hours pair-programming with coding agents, the same handful of mistakes show up over and over. They're not random bugs — they're systematic patterns that follow from how a language model writes code: optimize for the happy path, infer intent from limited context, finish the visible task, leave implicit assumptions implicit. SereneCode treats each one as a verification target rather than a code-review target.
|
|
24
|
+
|
|
25
|
+
- **Skipping requirements without realizing it.** Given a spec with twelve requirements, the agent writes code that handles eight of them confidently and quietly omits the four it didn't see a clean place for. There's no error, no TODO — just missing behavior. SereneCode's spec traceability (REQ-xxx tags, `Implements:` / `Verifies:` references, the `serenecode_orphans` and `serenecode_req_status` tools) makes it impossible for a requirement to be silently dropped: every REQ in SPEC.md must be both implemented and tested or it shows up as an orphan.
|
|
26
|
+
|
|
27
|
+
- **Happy-path tests only.** Asked to "add tests," the agent writes a handful of cases that walk the obvious path through the function. Edge cases (empty input, off-by-one boundaries, the exact threshold value, the negative number, the unicode string) are routinely missed because they require imagining what could go wrong. L3 coverage catches uncovered branches; L4 Hypothesis property testing generates inputs the agent never thought of and runs them against the contracts.
|
|
28
|
+
|
|
29
|
+
- **Stub residue and "I'll come back to this."** A function the agent didn't quite know how to write often ships as `pass`, `...`, or `raise NotImplementedError("TODO")` — and then never gets revisited because the test suite doesn't fail on it. L1's stub-residue check flags these immediately.
|
|
30
|
+
|
|
31
|
+
- **Weak or tautological postconditions.** Asked to add a contract, the agent reaches for whatever satisfies the structural checker without actually constraining behavior: `lambda result: isinstance(result, int)` on a function that already returns `int`, or `lambda result: True`. These pass every check but verify nothing. L1 flags both patterns.
|
|
32
|
+
|
|
33
|
+
- **Silent exception handling.** `try: risky() / except Exception: pass` is a load-bearing anti-pattern in agent-generated code. The agent encounters an error during testing, decides the cleanest fix is to swallow it, and ships code where real failures vanish into the void. L1 flags any handler whose body is `pass`, `...`, `continue`, `break`, or a bare `return` and demands a meaningful response or an explicit `# silent-except: <reason>` opt-out.
|
|
34
|
+
|
|
35
|
+
- **Mutable default arguments.** `def f(x=[])` is a Python footgun every senior developer learned to avoid the hard way. Agents reproduce it cheerfully because they pattern-match on shape, not language semantics. L1 catches it by default.
|
|
36
|
+
|
|
37
|
+
- **Bare `assert` as a runtime check.** Agents reach for `assert x > 0` to "validate input" — but assertions disappear under `python -O`. The check vanishes silently in any production environment that strips them. L1 flags asserts in non-test source.
|
|
38
|
+
|
|
39
|
+
- **`print()` debug residue.** Trace prints from the agent's own debugging session ship to production because nobody pruned them. L1 catches `print()` in core modules.
|
|
40
|
+
|
|
41
|
+
- **Unsafe deserialization and shell calls.** `eval`, `exec`, `pickle.loads` on untrusted input, and `subprocess.run(..., shell=True)` are calls a security-aware human writes only with a comment explaining why. Agents reach for them when the simple solution looks fastest. L1 flags every one, requiring an `# allow-dangerous: <reason>` opt-out for the rare legitimate case.
|
|
42
|
+
|
|
43
|
+
- **Tests that pass but verify nothing.** A `def test_foo(): foo()` with no `assert` runs successfully and counts as "covered" — but it only checks that the function doesn't raise. L1's no-assertions-in-tests check fires on any `test_*` function with no `assert`, `pytest.raises`, `pytest.fail`, or `self.assertX` call.
|
|
44
|
+
|
|
45
|
+
- **Architectural drift.** Asked to "add a feature," the agent puts I/O in core, business logic in adapters, and circular imports between them. The system still works in tests because everything is loaded together — but the layering rule that made the code testable in the first place is gone. L6 compositional checks enforce dependency direction, interface compliance, and contract presence at module boundaries.
|
|
46
|
+
|
|
47
|
+
None of these failures are unique to AI; humans make them too. What's unique is the *rate* at which an agent produces them and the *confidence* with which the agent reports the work as done. The structural checker, the contracts, the property tester, and the symbolic search exist to make each pattern impossible to ship without an explicit, reviewed override.
|
|
48
|
+
|
|
19
49
|
SereneCode is designed for **building new verifiable systems from scratch with AI**, not for retrofitting verification onto large existing codebases. The conventions go in before the first line of code, and every module is written with verification in mind from day one. That's what makes it work. SereneCode is a best-effort tool, not a guarantee — see the [Disclaimer](#disclaimer) for important limitations on what it can and cannot assure.
|
|
20
50
|
|
|
21
51
|
### Choosing the Right Level
|
|
@@ -36,6 +66,8 @@ Pick the level that matches the stakes. Safety-critical code should start at Str
|
|
|
36
66
|
|
|
37
67
|
## See It In Action: The Medical Dosage Calculator
|
|
38
68
|
|
|
69
|
+
> **This is a hypothetical example for demonstration purposes only.** The medical dosage calculator is not a real, clinically validated, or regulator-approved tool. It is not derived from any actual drug-dosing protocol, has not been reviewed by medical professionals, and must not be used for any clinical decision-making. Its purpose is solely to illustrate how SereneCode shapes the way an AI agent writes verifiable code in a context where contracts and bounded symbolic search would matter. Building real medical software requires domain experts, regulated processes, and assurance methodologies that go far beyond what this framework provides.
|
|
70
|
+
|
|
39
71
|
We built the same medical dosage calculator twice from the same spec — once with plain AI, once with SereneCode — to show the difference.
|
|
40
72
|
|
|
41
73
|
Both versions implement four functions: dose calculation with weight-based dosing and max caps, renal function adjustment with tiered CrCl thresholds, daily safety checks with explicit total-versus-threshold calculations, and contraindication detection across current medications.
|
|
@@ -81,6 +113,8 @@ This creates SERENECODE.md (project conventions including spec traceability) and
|
|
|
81
113
|
|
|
82
114
|
A lightweight AST-based checker that validates code follows SERENECODE.md conventions in seconds. Missing a postcondition? No class invariant? No test file for a module? Caught before you waste time on heavy verification.
|
|
83
115
|
|
|
116
|
+
L1 also catches AI-failure-mode patterns that compile and look correct but represent real bugs: stub residue (`pass`/`...`/`raise NotImplementedError` left as a function body), mutable default arguments, bare `assert` in non-test source, `print()` in core, dangerous calls (`eval`, `exec`, `pickle.loads`, `os.system`, `subprocess` with `shell=True`), `TODO`/`FIXME`/`XXX`/`HACK` markers in tracked files, tests with no assertions, silent exception handlers, and tautological postconditions. Each rule has a per-rule opt-out comment for legitimate exceptions; see SERENECODE.md "Code Quality Standards" for the full list.
|
|
117
|
+
|
|
84
118
|
```bash
|
|
85
119
|
serenecode check src/ --structural # structural conventions
|
|
86
120
|
serenecode check src/ --spec SPEC.md # + spec traceability
|
|
@@ -111,29 +145,91 @@ The full pipeline is thorough but not instant. Larger systems will take longer,
|
|
|
111
145
|
|
|
112
146
|
Levels 3-6 import and execute project modules so coverage.py, Hypothesis, and CrossHair can exercise real code. Deep runs therefore require explicit `--allow-code-execution` and should only be used on trusted code.
|
|
113
147
|
|
|
148
|
+
### 4. The MCP Server — Verification Inside the Agent's Edit Loop
|
|
149
|
+
|
|
150
|
+
SereneCode ships a built-in MCP (Model Context Protocol) server that exposes the entire verification pipeline as tools any MCP-speaking AI coding assistant can call *while it writes code*, not just at the end. Instead of waiting for `serenecode check` to run at the end of a feature, the agent calls `serenecode_check_function` after every function it writes, sees structured findings inline, fixes them, and only reports the work complete when the result is clean. This collapses the feedback loop from minutes-after-writing to seconds-while-writing and turns SereneCode from a batch tool you run *at* the agent into a peer tool the agent uses *on itself*.
|
|
151
|
+
|
|
152
|
+
**Setup (one-time):**
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
uv add 'serenecode[mcp]'
|
|
156
|
+
claude mcp add serenecode -- uv run serenecode mcp # read-only (L1, L2)
|
|
157
|
+
claude mcp add serenecode -- uv run serenecode mcp --allow-code-execution  # OR (pick one): all six levels
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
The same `serenecode mcp` stdio server works in Claude Code, Cursor, Cline, Continue, and any other MCP client.
|
|
161
|
+
|
|
162
|
+
**Tools the agent can call:**
|
|
163
|
+
|
|
164
|
+
| Tool | What it does |
|
|
165
|
+
|---|---|
|
|
166
|
+
| `serenecode_check` | Run the full pipeline on a project root |
|
|
167
|
+
| `serenecode_check_file` | Pipeline scoped to one source file |
|
|
168
|
+
| `serenecode_check_function` | Pipeline scoped to one function — the inner-loop tool |
|
|
169
|
+
| `serenecode_verify_fixed` | Re-run on one function and report whether a specific finding is gone |
|
|
170
|
+
| `serenecode_suggest_contracts` | Derive `@require`/`@ensure` decorators from a function signature |
|
|
171
|
+
| `serenecode_uncovered` | L3 coverage findings for one function (uncovered lines + mock advice) |
|
|
172
|
+
| `serenecode_suggest_test` | Test scaffold for an uncovered function |
|
|
173
|
+
| `serenecode_validate_spec` | Validate a SPEC.md is well-formed |
|
|
174
|
+
| `serenecode_list_reqs` | List REQ-xxx identifiers in a SPEC.md |
|
|
175
|
+
| `serenecode_req_status` | Implementation/verification status of one REQ |
|
|
176
|
+
| `serenecode_orphans` | REQs with no implementation or no test |
|
|
177
|
+
|
|
178
|
+
**Read-only resources** the agent can fetch without "calling" anything: `serenecode://config` (active SerenecodeConfig as JSON), `serenecode://findings/last-run` (most recent CheckResponse from this server session), `serenecode://exempt-modules` (the exempt path patterns for the active config), `serenecode://reqs` (parsed REQ-xxx list from the project's SPEC.md).
|
|
179
|
+
|
|
180
|
+
The server-level `--allow-code-execution` flag mirrors the CLI: without it, tools that need Levels 3-6 return a structured error rather than importing project code. `serenecode init` writes a copy-pasteable MCP setup snippet into the generated CLAUDE.md so newly initialized projects ship with the registration command and recommended workflow. See SERENECODE.md "MCP Integration" for the full tool descriptions and the agent-side workflow.
|
|
181
|
+
|
|
114
182
|
Scoped targets keep their package/import context across verification levels. In practice that means commands like `serenecode check src/core/ --level 4 --allow-code-execution` and `serenecode check src/core/models.py --level 3 --allow-code-execution` use the same local import roots and architectural module paths as a project-wide run instead of breaking relative imports or scoped core-module rules. Those scoped core/exemption rules are matched on path segments, not raw substrings, so names like `notcli.py`, `viewmodels.py`, and `transports/` do not accidentally change policy classification. Standalone files with non-importable names are also targeted correctly for CrossHair via `file.py:line` references.
|
|
115
183
|
|
|
116
184
|
---
|
|
117
185
|
|
|
118
186
|
## The AI Agent Loop
|
|
119
187
|
|
|
120
|
-
SereneCode is designed for spec-driven development with AI agents:
|
|
188
|
+
SereneCode is designed for spec-driven development with AI agents. The loop has two layers — a fast inner loop the agent drives on itself through the MCP server, and an outer batch loop for full project verification:
|
|
121
189
|
|
|
122
190
|
```
|
|
123
|
-
|
|
124
|
-
serenecode
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
serenecode
|
|
128
|
-
serenecode
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
191
|
+
─── one-time setup ──────────────────────────────────────────────────────────
|
|
192
|
+
serenecode init → spec mode + verification level
|
|
193
|
+
(also offers MCP server setup)
|
|
194
|
+
claude mcp add serenecode -- uv run \ → register MCP server with the AI
|
|
195
|
+
serenecode mcp --allow-code-execution tool (Claude Code, Cursor, ...)
|
|
196
|
+
serenecode spec SPEC.md → validate spec is ready
|
|
197
|
+
(REQ-xxx format, no gaps)
|
|
198
|
+
|
|
199
|
+
─── inner loop (per function, driven by the agent through MCP) ──────────────
|
|
200
|
+
AI reads SERENECODE.md + SPEC.md → conventions and what to build
|
|
201
|
+
AI calls serenecode_suggest_contracts → derive @require/@ensure for the
|
|
202
|
+
function it's about to write
|
|
203
|
+
AI writes the function → with Implements: REQ-xxx tag
|
|
204
|
+
AI calls serenecode_check_function → L1-L4 scoped to that function
|
|
205
|
+
AI reads structured findings → missing contracts, mutable
|
|
206
|
+
defaults, weak postconditions,
|
|
207
|
+
uncovered branches, etc.
|
|
208
|
+
AI fixes them and calls verify_fixed → confirms each finding is gone
|
|
209
|
+
before moving on to the next
|
|
210
|
+
function
|
|
211
|
+
|
|
212
|
+
─── outer loop (per feature, batch verification) ─────────────────────────────
|
|
213
|
+
serenecode check src/ --spec SPEC.md → did the AI follow conventions?
|
|
214
|
+
--structural all REQs covered?
|
|
215
|
+
serenecode check src/ --level 5 \ → deep verification: coverage,
|
|
216
|
+
--allow-code-execution \ property testing, symbolic search
|
|
217
|
+
--spec SPEC.md
|
|
218
|
+
AI calls serenecode_orphans / → which REQs are unimplemented or
|
|
219
|
+
serenecode_req_status untested?
|
|
220
|
+
AI fixes the gaps → adds implementations, tests,
|
|
221
|
+
stronger contracts
|
|
222
|
+
Repeat until verified → all REQs implemented + tested,
|
|
223
|
+
no counterexamples within bounds
|
|
132
224
|
```
|
|
133
225
|
|
|
226
|
+
The inner loop is what the MCP server enables. Before MCP, the agent had to finish writing, exit its turn, wait for `serenecode check` to run, parse the output, and iterate. With MCP, every function the agent writes gets validated *before* it moves to the next one — `serenecode_check_function` returns structured JSON in milliseconds, the agent fixes any findings inline, and only reports the overall task complete when the result is clean. This collapses an iteration loop that used to span multiple turns into a sequence of tool calls inside a single turn.
|
|
227
|
+
|
|
228
|
+
The outer loop still matters: cross-module compositional analysis, full coverage runs, and spec-traceability sweeps over the whole codebase aren't function-scoped, so they live at the batch level. The CLI handles those, and the same pipeline runs identically in CI.
|
|
229
|
+
|
|
134
230
|
AI-generated code won't always pass verification on the first try — and that's the point. SereneCode gives the coding agent structured feedback on exactly what failed and why: missing requirement implementations, counterexamples, violated contracts, untested modules, and suggested fixes. When there are many findings, SereneCode suggests the agent spawn subagents to address groups of related issues in parallel. **The value isn't in one-shotting perfection — it's in the loop that converges on verified completeness and correctness.**
|
|
135
231
|
|
|
136
|
-
Works in Claude Code, works in the terminal, works in CI:
|
|
232
|
+
Works in Claude Code, works in Cursor / Cline / Continue (via the same MCP server), works in the terminal, works in CI:
|
|
137
233
|
|
|
138
234
|
```python
|
|
139
235
|
import serenecode
|
|
@@ -148,6 +244,8 @@ for failure in result.failures:
|
|
|
148
244
|
print(detail.suggestion) # proposed fix direction
|
|
149
245
|
```
|
|
150
246
|
|
|
247
|
+
The library API (`serenecode.check`), the CLI, and the MCP server (`serenecode_check`, `serenecode_check_function`) all call into the same pipeline, so verification semantics are identical whether an agent is calling tools, a developer is running the CLI, or CI is invoking the Python API.
|
|
248
|
+
|
|
151
249
|
---
|
|
152
250
|
|
|
153
251
|
## Built With Its Own Medicine
|
|
@@ -156,11 +254,11 @@ SereneCode isn't just a tool that *tells* you to write verified code. It *is* ve
|
|
|
156
254
|
|
|
157
255
|
The SERENECODE.md convention file was the first artifact created — before any Python was written. The framework has been developed under those conventions with AI as a first-class contributor, and the repository continuously checks itself with:
|
|
158
256
|
|
|
159
|
-
- `pytest` across the full suite (currently
|
|
257
|
+
- `pytest` across the full suite (currently 1,393 passing tests, 16 skipped)
|
|
160
258
|
- `mypy --strict` across `src/` and `examples/dosage-serenecode/src/`
|
|
161
259
|
- SereneCode's own structural, type, property, symbolic, and compositional passes
|
|
162
260
|
|
|
163
|
-
On the current tree, `serenecode check src --level 6 --allow-code-execution` runs all
|
|
261
|
+
On the current tree, the bare CLI invocation `serenecode check src --level 6 --allow-code-execution` runs the full L1-L6 pipeline end-to-end against the framework's own source — 718 functions checked, 557 passed, 161 exempt, 0 failures, ~6 minutes wall time. A separate integration test, `test_serenecode_repo_passes_strict_level_6`, runs the same source tree through `run_pipeline` with `strict_config()` and `start_level=4`, which strips every path-based exemption and forces every adapter, CLI handler, MCP tool, and `__init__.py` through L4-L6. SereneCode also passes that strict-config self-check end-to-end: 0 L1 findings across all 466 strict-checked functions, 0 L3 coverage gaps across the strict-checked subset (~3.5 minutes), and 0 L4-L6 findings. The exempt items in the default-config run include adapter modules (which handle I/O and are integration-tested), port interfaces (Protocols that define abstract contracts), CLI entry points, the MCP server package, and functions whose parameter types are too complex for automated strategy generation or symbolic execution. Exempt items are visible in the output — they are not silently omitted.
|
|
164
262
|
|
|
165
263
|
At Level 5, CrossHair and Z3 search for counterexamples across the codebase's symbolic-friendly contracted top-level functions. Functions with non-primitive parameters (custom dataclasses, Protocol implementations, Callable types) are reported as exempt because the solver cannot generate inputs for them. Level 6 adds structural compositional analysis: dependency direction, circular dependency detection, interface compliance, contract presence at module boundaries, aliased cross-module call resolution, and architectural invariants. Interface compliance follows explicit `Protocol` inheritance and checks substitutability, including extra required parameters and incompatible return annotations. Together, they provide both deep per-function verification and system-level structural guarantees — but the structural checks at L6 verify contract *presence*, not logical *sufficiency* across call chains.
|
|
166
264
|
|
|
@@ -169,17 +267,27 @@ At Level 5, CrossHair and Z3 search for counterexamples across the codebase's sy
|
|
|
169
267
|
## Quick Start
|
|
170
268
|
|
|
171
269
|
```bash
|
|
172
|
-
# Install from PyPI
|
|
173
|
-
|
|
270
|
+
# Install from PyPI (add the [mcp] extra to enable the MCP server).
|
|
271
|
+
# Note: the MCP server ships in the next release; until it's published
|
|
272
|
+
# to PyPI, install from the source checkout instead:
|
|
273
|
+
# git clone https://github.com/helgster77/serenecode && cd serenecode
|
|
274
|
+
# uv sync --extra mcp # or: pip install -e '.[mcp]'
|
|
275
|
+
pip install 'serenecode[mcp]'
|
|
174
276
|
|
|
175
277
|
# Initialize — interactive setup (spec mode + verification level)
|
|
176
278
|
serenecode init
|
|
177
279
|
|
|
280
|
+
# Register the MCP server with your AI coding tool so verification
|
|
281
|
+
# runs inside the agent's edit loop, not just at the end:
|
|
282
|
+
claude mcp add serenecode -- uv run serenecode mcp --allow-code-execution
|
|
283
|
+
# (Cursor, Cline, Continue, and other MCP clients work the same way)
|
|
284
|
+
|
|
178
285
|
# Place your spec in the project directory, then start a coding session.
|
|
179
286
|
# Your agent reads SERENECODE.md, converts the spec to REQ-xxx format,
|
|
180
|
-
# validates it, creates an implementation plan, and builds from it
|
|
287
|
+
# validates it, creates an implementation plan, and builds from it —
|
|
288
|
+
# calling serenecode_check_function after every function it writes.
|
|
181
289
|
|
|
182
|
-
# Verify structure + spec traceability:
|
|
290
|
+
# Verify structure + spec traceability from the CLI:
|
|
183
291
|
serenecode check src/ --spec SPEC.md --structural
|
|
184
292
|
|
|
185
293
|
# Go deep — test coverage, property testing, symbolic verification:
|
|
@@ -206,6 +314,8 @@ serenecode check [<path>] [--level 1-6] [--allow-code-execution] # run ve
|
|
|
206
314
|
serenecode status [<path>] [--format human|json] # verification status
|
|
207
315
|
serenecode report [<path>] [--format human|json|html] # generate reports
|
|
208
316
|
[--output FILE] [--allow-code-execution] # write to file
|
|
317
|
+
serenecode mcp [--allow-code-execution] # boot the MCP server
|
|
318
|
+
[--project-root DIR] # over stdio
|
|
209
319
|
```
|
|
210
320
|
|
|
211
321
|
**Exit codes:** 0 = passed, 1 = structural, 2 = types, 3 = coverage, 4 = properties, 5 = symbolic, 6 = compositional, 10 = internal error or deep verification refused without explicit trust.
|
|
@@ -220,7 +330,7 @@ SereneCode is honest about what it can and can't do:
|
|
|
220
330
|
|
|
221
331
|
**Contracts are only as good as you write them.** A function with weak postconditions will pass verification even if the implementation is subtly wrong. SereneCode checks that contracts exist and hold, but can't check that they fully capture your intent. Tautological contracts like `lambda self: True` are now flagged by the conventions and should not be used — they provide no verification value.
|
|
222
332
|
|
|
223
|
-
**Exempt items are visible, not hidden.** Modules exempt from structural checking (adapters, CLI, ports, `__init__.py`) and functions excluded from deep verification (non-primitive parameter types, adapter code) are reported as "exempt" in the output rather than being silently omitted. This makes the verification scope transparent: the tool reports passed, failed, skipped, and exempt counts separately so you can see exactly what was and wasn't deeply verified. Previous versions silently omitted these, inflating the apparent scope.
|
|
333
|
+
**Exempt items are visible, not hidden.** Modules exempt from structural checking (adapters, CLI, ports, MCP server, `__init__.py`) and functions excluded from deep verification (non-primitive parameter types, adapter code) are reported as "exempt" in the output rather than being silently omitted. This makes the verification scope transparent: the tool reports passed, failed, skipped, and exempt counts separately so you can see exactly what was and wasn't deeply verified. Previous versions silently omitted these, inflating the apparent scope.
|
|
224
334
|
|
|
225
335
|
**Runtime checks can be disabled.** icontract decorators are checked on every call by default, but can be disabled via environment variables for performance in production. This is a feature, not a bug — but it means runtime guarantees depend on configuration.
|
|
226
336
|
|
|
@@ -241,7 +351,8 @@ SereneCode is honest about what it can and can't do:
|
|
|
241
351
|
SereneCode follows hexagonal architecture — the same pattern it enforces on your code:
|
|
242
352
|
|
|
243
353
|
```
|
|
244
|
-
CLI / Library API
|
|
354
|
+
CLI / Library API / MCP ← composition roots (interactive init, spec validation,
|
|
355
|
+
│ MCP server for AI agents)
|
|
245
356
|
│
|
|
246
357
|
├──▸ Pipeline ← orchestrates L1 → L2 → L3 → L4 → L5 → L6
|
|
247
358
|
│ ├──▸ Structural Checker (ast)
|