sorkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. sorkit-0.1.0/.claude/skills/experiment-loop.md +84 -0
  2. sorkit-0.1.0/.github/workflows/ci.yml +30 -0
  3. sorkit-0.1.0/.github/workflows/publish.yml +30 -0
  4. sorkit-0.1.0/.gitignore +31 -0
  5. sorkit-0.1.0/CLAUDE.md +75 -0
  6. sorkit-0.1.0/LICENSE.md +21 -0
  7. sorkit-0.1.0/PKG-INFO +741 -0
  8. sorkit-0.1.0/README.md +712 -0
  9. sorkit-0.1.0/examples/sentiment/README.md +99 -0
  10. sorkit-0.1.0/examples/sentiment/fixtures/golden_set.json +55 -0
  11. sorkit-0.1.0/examples/sentiment/sor.yaml +54 -0
  12. sorkit-0.1.0/examples/sentiment/src/__init__.py +0 -0
  13. sorkit-0.1.0/examples/sentiment/src/api.py +22 -0
  14. sorkit-0.1.0/examples/sentiment/src/classifier.py +32 -0
  15. sorkit-0.1.0/examples/sentiment/tests/__init__.py +0 -0
  16. sorkit-0.1.0/examples/sentiment/tests/conftest.py +16 -0
  17. sorkit-0.1.0/examples/sentiment/tests/test_api_contract.py +45 -0
  18. sorkit-0.1.0/examples/sentiment/tests/test_classifier_accuracy.py +84 -0
  19. sorkit-0.1.0/examples/sentiment/tests/test_classifier_contract.py +58 -0
  20. sorkit-0.1.0/pyproject.toml +50 -0
  21. sorkit-0.1.0/src/sorkit/__init__.py +3 -0
  22. sorkit-0.1.0/src/sorkit/__main__.py +11 -0
  23. sorkit-0.1.0/src/sorkit/audit.py +328 -0
  24. sorkit-0.1.0/src/sorkit/config.py +298 -0
  25. sorkit-0.1.0/src/sorkit/frozen.py +35 -0
  26. sorkit-0.1.0/src/sorkit/init.py +366 -0
  27. sorkit-0.1.0/src/sorkit/notify.py +160 -0
  28. sorkit-0.1.0/src/sorkit/oracle.py +201 -0
  29. sorkit-0.1.0/src/sorkit/ratchet.py +270 -0
  30. sorkit-0.1.0/src/sorkit/results.py +141 -0
  31. sorkit-0.1.0/src/sorkit/server.py +465 -0
  32. sorkit-0.1.0/tests/__init__.py +0 -0
  33. sorkit-0.1.0/tests/conftest.py +58 -0
  34. sorkit-0.1.0/tests/test_audit.py +348 -0
  35. sorkit-0.1.0/tests/test_config.py +260 -0
  36. sorkit-0.1.0/tests/test_frozen.py +53 -0
  37. sorkit-0.1.0/tests/test_init.py +242 -0
  38. sorkit-0.1.0/tests/test_notify.py +174 -0
  39. sorkit-0.1.0/tests/test_oracle.py +212 -0
  40. sorkit-0.1.0/tests/test_ratchet.py +357 -0
  41. sorkit-0.1.0/tests/test_results.py +163 -0
  42. sorkit-0.1.0/tests/test_server.py +271 -0
@@ -0,0 +1,84 @@
1
+ # Skill: Autonomous Experiment Loop
2
+
3
+ ## When to Use
4
+ When implementing any layer of this project autonomously.
5
+ This skill defines the experiment protocol — how to iterate, evaluate, and ratchet.
6
+
7
+ ## Protocol
8
+
9
+ ### Before You Start
10
+
11
+ 1. Read `CLAUDE.md` to confirm which layer you're working on
12
+ 2. Read the oracle tests for your layer (see the contracts/scored_tests in sor.yaml)
13
+ 3. Read the current mutation surface files to understand the starting state
14
+ 4. Check `results.tsv` for previous experiment history (if any)
15
+
16
+ ### Experiment Loop
17
+
18
+ Each iteration follows this exact sequence. Do not deviate.
19
+
20
+ #### Step 1: Plan the Change
21
+
22
+ Before editing any code, write a one-line hypothesis:
23
+
24
+ ```
25
+ HYPOTHESIS: [what you're changing] should [expected effect] because [reasoning]
26
+ ```
27
+
28
+ #### Step 2: Implement
29
+
30
+ Edit ONLY files in the current layer's mutation surface (see CLAUDE.md).
31
+ Do NOT touch frozen files. Do NOT touch files from other layers.
32
+ Keep changes atomic — one idea per iteration.
33
+
34
+ #### Step 3: Run the Ratchet
35
+
36
+ ```bash
37
+ ./scripts/ratchet.sh <layer_number> "brief hypothesis description"
38
+ ```
39
+
40
+ The ratchet will:
41
+ - Run the oracle
42
+ - Compare scores to previous best
43
+ - Git commit if improved, git reset if not
44
+ - Check all stopping conditions
45
+ - Notify if a stopping condition is hit
46
+
47
+ #### Step 4: Parse the Output
48
+
49
+ The ratchet prints exactly one of:
50
+ - `KEEP score={X} prev={Y}` — improvement, committed
51
+ - `DISCARD score={X} best={Y}` — no improvement, reverted
52
+ - `DISCARD FAIL` — tests failed, reverted
53
+ - `STOP:{reason} score={X} attempts={N} kept={K}` — stopping condition hit
54
+
55
+ #### Step 5: Decide Next Experiment
56
+
57
+ Review `results.tsv` to see what you've tried. Pick a different approach.
58
+ Do NOT retry a failed hypothesis with only minor variations more than once.
59
+
60
+ If you have 3+ consecutive failures, read the test output more carefully:
61
+ ```bash
62
+ tail -n 50 run.log
63
+ ```
64
+
65
+ ### Stopping Conditions
66
+
67
+ Stop the loop and report to the human if you see any `STOP:` output:
68
+ - `TARGET_MET` — scored layer reached its target composite score
69
+ - `ALL_PASS` — pass/fail layer succeeded
70
+ - `PLATEAU` — too many consecutive non-improvements
71
+ - `DIMINISHING` — improvements too small to matter
72
+ - `MAX_ATTEMPTS` — hard ceiling reached
73
+ - `CONSECUTIVE_FAILURES` — too many crashes in a row
74
+ - `ORACLE_ERROR` — the oracle itself is broken (needs human fix)
75
+
76
+ ### Results TSV Format
77
+
78
+ Tab-separated, appended by the ratchet:
79
+
80
+ ```
81
+ timestamp layer hypothesis score outcome
82
+ 2026-03-12T10:30:00 0 hybrid BM25+cosine 0.6/0.4 0.72 KEEP
83
+ 2026-03-12T10:35:00 0 pure cosine similarity 0.65 DISCARD
84
+ ```
@@ -0,0 +1,30 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up Python ${{ matrix.python-version }}
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python-version }}
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install -e ".[dev]"
28
+
29
+ - name: Test
30
+ run: pytest -v
@@ -0,0 +1,30 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+ environment: pypi
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+
23
+ - name: Install build tools
24
+ run: pip install build
25
+
26
+ - name: Build package
27
+ run: python -m build
28
+
29
+ - name: Publish to PyPI
30
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,31 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ *.egg
6
+ dist/
7
+ build/
8
+ *.whl
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+
14
+ # Testing
15
+ .pytest_cache/
16
+ .coverage
17
+ htmlcov/
18
+
19
+ # IDE
20
+ .idea/
21
+ .vscode/
22
+ *.swp
23
+ *.swo
24
+
25
+ # OS
26
+ .DS_Store
27
+ Thumbs.db
28
+
29
+ # SOR runtime
30
+ run.log
31
+ reports/
sorkit-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,75 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## What This Is
6
+
7
+ sorkit is a Python MCP server (`pip install sorkit`) implementing the Surface-Oracle-Ratchet pattern for autonomous code optimization. An AI agent edits designated files (surfaces), is evaluated by frozen tests (oracles), and advances via git commit on improvement / git reset on failure (ratchet).
8
+
9
+ ## Commands
10
+
11
+ ```bash
12
+ # Install (editable with dev deps)
13
+ pip install -e ".[dev]"
14
+
15
+ # Run all tests
16
+ python -m pytest
17
+
18
+ # Run a single test file
19
+ python -m pytest tests/test_config.py
20
+
21
+ # Run a single test by name
22
+ python -m pytest -k test_loads_layers
23
+
24
+ # Run with verbose output
25
+ python -m pytest -v
26
+
27
+ # Run the MCP server
28
+ sorkit
29
+
30
+ # Build for distribution
31
+ python -m build
32
+ ```
33
+
34
+ ## Architecture
35
+
36
+ The package lives in `src/sorkit/` with this data flow:
37
+
38
+ ```
39
+ sor.yaml → config.py → server.py (9 MCP tools)
40
+
41
+ oracle.py (run tests, extract metrics)
42
+
43
+ ratchet.py (compare → git commit/reset → check stops)
44
+
45
+ results.py (append to results.tsv)
46
+
47
+ notify.py (file/Slack/email/desktop)
48
+ ```
49
+
50
+ **config.py** — Dataclass model for `sor.yaml`. `load_config()` parses YAML into `SorConfig` with layers, each containing `OracleConfig`, `MetricConfig`, `ThresholdConfig`. `resolve_threshold()` cascades layer overrides to defaults. `resolve_layer_index()` accepts name or numeric index.
51
+
52
+ **server.py** — FastMCP server (`from fastmcp import FastMCP`). Nine `@mcp.tool()` functions. `sor_init` uses a two-call pattern: called without a config, it returns a template; called with a config, it saves the config and generates artifacts. `sor_ratchet` is the core loop tool.
53
+
54
+ **oracle.py** — Async subprocess runner. Contracts run first (`-x --tb=short -q`); scored tests add `-s`, which disables pytest's output capture so that `print()` output reaches stdout where the oracle can read it. `_extract_metric()` uses regex `^{PATTERN}:\s+(\S+)` to pull floats from stdout. Returns `OracleResult` with composite score.
55
+
56
+ **ratchet.py** — `ratchet_once()` is the convergence engine. Checks 7 stopping conditions: TARGET_MET, ALL_PASS, PLATEAU, DIMINISHING, MAX_ATTEMPTS, CONSECUTIVE_FAILURES, ORACLE_ERROR. Git operations use `asyncio.create_subprocess_exec`.
57
+
58
+ **init.py** — Generates `CLAUDE.md`, `.claude/skills/experiment-loop.md`, and `results.tsv` from config. `config_from_dict()` handles both template format (with `_value`/`_description` keys) and plain dicts.
59
+
60
+ **results.py** — `ResultsStore` reads/writes a TSV file. Methods like `get_best_score()`, `get_consecutive_failures()`, `get_consecutive_non_improvements()` drive stopping condition checks.
61
+
62
+ **frozen.py** — `get_frozen_paths()` computes the full frozen set: `always_frozen` + surfaces from all layers below the current one.
63
+
64
+ **audit.py** — Analysis tools over `ResultsStore`: score progression with running best, hypothesis grouping with keep/discard/fail counts, full audit reports with convergence estimates.
65
+
66
+ ## Testing Patterns
67
+
68
+ - All tests are async-compatible (`asyncio_mode = "auto"` in pyproject.toml)
69
+ - Shared fixture `sor_project` (in `tests/conftest.py`) creates a tmp dir with a valid `sor.yaml`
70
+ - Server tests use `async with Client(mcp) as client` then `result = await client.call_tool(name, args)` — access output via `result.content[0].text`
71
+ - Oracle/ratchet tests mock `asyncio.create_subprocess_exec` to avoid real subprocess calls
72
+
73
+ ## Example
74
+
75
+ `examples/sentiment/` is a self-contained demo: a naive sentiment classifier (~40% accuracy) that an agent optimizes against 50 labeled examples. Layer 1 is scored (classifier); Layer 2 is pass/fail (API stub). Run the example's tests from that directory with `python -m pytest tests/ -s`.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 2Lines
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.