autoresearch-core 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autoresearch_core-0.1.1/.github/workflows/ci.yml +33 -0
- autoresearch_core-0.1.1/.github/workflows/publish.yml +43 -0
- autoresearch_core-0.1.1/.gitignore +7 -0
- autoresearch_core-0.1.1/CHANGELOG.md +23 -0
- autoresearch_core-0.1.1/LICENSE +21 -0
- autoresearch_core-0.1.1/PKG-INFO +102 -0
- autoresearch_core-0.1.1/README.md +74 -0
- autoresearch_core-0.1.1/autoresearch_core/__init__.py +31 -0
- autoresearch_core-0.1.1/autoresearch_core/contract.py +52 -0
- autoresearch_core-0.1.1/autoresearch_core/failures.py +26 -0
- autoresearch_core-0.1.1/autoresearch_core/gates.py +33 -0
- autoresearch_core-0.1.1/autoresearch_core/policy.py +45 -0
- autoresearch_core-0.1.1/autoresearch_core/ports.py +34 -0
- autoresearch_core-0.1.1/autoresearch_core/promote.py +48 -0
- autoresearch_core-0.1.1/autoresearch_core/py.typed +1 -0
- autoresearch_core-0.1.1/autoresearch_core/types.py +79 -0
- autoresearch_core-0.1.1/autoresearch_core/verdict.py +64 -0
- autoresearch_core-0.1.1/pyproject.toml +41 -0
- autoresearch_core-0.1.1/tests/test_contract.py +71 -0
- autoresearch_core-0.1.1/tests/test_failures.py +18 -0
- autoresearch_core-0.1.1/tests/test_gates.py +43 -0
- autoresearch_core-0.1.1/tests/test_parity.py +58 -0
- autoresearch_core-0.1.1/tests/test_policy.py +40 -0
- autoresearch_core-0.1.1/tests/test_ports.py +26 -0
- autoresearch_core-0.1.1/tests/test_promote.py +47 -0
- autoresearch_core-0.1.1/tests/test_types.py +35 -0
- autoresearch_core-0.1.1/tests/test_verdict.py +29 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
fail-fast: false
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
- run: python -m pip install --upgrade pip
|
|
21
|
+
- run: pip install -e ".[dev]"
|
|
22
|
+
- run: pytest -q --cov=autoresearch_core --cov-report=term-missing --cov-fail-under=95
|
|
23
|
+
|
|
24
|
+
build:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
steps:
|
|
27
|
+
- uses: actions/checkout@v4
|
|
28
|
+
- uses: actions/setup-python@v5
|
|
29
|
+
with:
|
|
30
|
+
python-version: "3.12"
|
|
31
|
+
- run: pip install build twine
|
|
32
|
+
- run: python -m build
|
|
33
|
+
- run: twine check dist/*
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Publishes on a version tag (e.g. v0.1.1) via PyPI Trusted Publishing (OIDC).
|
|
4
|
+
# One-time setup on PyPI: project → Publishing → add a Trusted Publisher with
|
|
5
|
+
# owner: ca1773130n repo: autoresearch-core
|
|
6
|
+
# workflow: publish.yml environment: pypi
|
|
7
|
+
# No API token is stored anywhere.
|
|
8
|
+
|
|
9
|
+
on:
|
|
10
|
+
push:
|
|
11
|
+
tags: ["v*"]
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
build:
|
|
15
|
+
name: Build distribution
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.12"
|
|
22
|
+
- run: python -m pip install --upgrade build
|
|
23
|
+
- run: python -m build
|
|
24
|
+
- uses: actions/upload-artifact@v4
|
|
25
|
+
with:
|
|
26
|
+
name: dist
|
|
27
|
+
path: dist/
|
|
28
|
+
|
|
29
|
+
publish:
|
|
30
|
+
name: Publish to PyPI
|
|
31
|
+
needs: build
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
environment:
|
|
34
|
+
name: pypi
|
|
35
|
+
url: https://pypi.org/p/autoresearch-core
|
|
36
|
+
permissions:
|
|
37
|
+
id-token: write # OIDC for Trusted Publishing
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/download-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: dist
|
|
42
|
+
path: dist/
|
|
43
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `autoresearch-core` are documented here. The format
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/), and the project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.1.1] - 2026-06-03
|
|
8
|
+
### Fixed
|
|
9
|
+
- Reject non-finite metric values (`1e999` → `inf`, `nan`) in `parse_metrics_line`
|
|
10
|
+
and `validate_metric_spec`, matching JS `JSON.parse` semantics.
|
|
11
|
+
- `approach_hash` collapses internal whitespace before hashing.
|
|
12
|
+
- `build_dead_end_record` raises unless the verdict is a deterministic refutation.
|
|
13
|
+
- `resolve_gates` tolerates a non-mapping `research_gates` config value.
|
|
14
|
+
- `ExperimentResult` defensively copies `metrics` so the frozen instance is immutable.
|
|
15
|
+
|
|
16
|
+
## [0.1.0] - 2026-06-03
|
|
17
|
+
### Added
|
|
18
|
+
- Initial release: `MetricSpec`, deterministic `Verdict` (`measure`), failure
|
|
19
|
+
classifier (H2/H3/H4), gate model, decision policy (`decide`,
|
|
20
|
+
`detect_plateau`, `should_promote_dead_end`), promotion record shapes, and the
|
|
21
|
+
adapter `Protocol`s (`Spawn`, `Retriever`, `KnowledgeGraph`, `ExperimentRunner`,
|
|
22
|
+
`Store`). Pure-Python, zero runtime dependencies; behaviour parity-tested
|
|
23
|
+
against the GRD autoresearch loop.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Cameleon X
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autoresearch-core
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Decision-contracts library: deterministic verdict/gate/failure/promotion logic for autoresearch loops.
|
|
5
|
+
Project-URL: Homepage, https://github.com/ca1773130n/autoresearch-core
|
|
6
|
+
Project-URL: Repository, https://github.com/ca1773130n/autoresearch-core
|
|
7
|
+
Project-URL: Issues, https://github.com/ca1773130n/autoresearch-core/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/ca1773130n/autoresearch-core/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Cameleon X <ca1773130n@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: agent,autoresearch,decision-contracts,deterministic,research-loop,verdict
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.11
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# autoresearch-core
|
|
30
|
+
|
|
31
|
+
[CI status](https://github.com/ca1773130n/autoresearch-core/actions/workflows/ci.yml)
|
|
32
|
+
[PyPI version](https://pypi.org/project/autoresearch-core/)
|
|
33
|
+
[Supported Python versions](https://pypi.org/project/autoresearch-core/)
|
|
34
|
+
[License: MIT](LICENSE)
|
|
35
|
+
|
|
36
|
+
A tiny, **pure-Python decision-contracts library** for autoresearch / agentic
|
|
37
|
+
loops: a *deterministic* verdict (metric / comparator / target), failure
|
|
38
|
+
classification, gates, and promotion record shapes — the disciplined decision
|
|
39
|
+
core, with **zero runtime dependencies** and **no I/O**.
|
|
40
|
+
|
|
41
|
+
You bring the loop, the retrieval, the runner, and the storage; you bind them to
|
|
42
|
+
the library's `Protocol`s and call `measure` / `decide` / `should_promote_dead_end`
|
|
43
|
+
at your decision points. The verdict logic is parity-tested against the GRD
|
|
44
|
+
autoresearch loop.
|
|
45
|
+
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install autoresearch-core
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Requires Python 3.11+. No runtime dependencies.
|
|
53
|
+
|
|
54
|
+
## Quickstart
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from autoresearch_core import (
|
|
58
|
+
MetricSpec, ExperimentResult, measure, parse_metrics_line, should_promote_dead_end,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
spec = MetricSpec(metric_key="recall_at_10", comparator=">=", target=0.8)
|
|
62
|
+
|
|
63
|
+
# An experiment prints `__RESULT__ {"recall_at_10": 0.83}` on stdout:
|
|
64
|
+
metrics = parse_metrics_line(stdout) # -> {"recall_at_10": 0.83}
|
|
65
|
+
verdict = measure(spec, ExperimentResult(metrics=metrics, exit_code=0))
|
|
66
|
+
|
|
67
|
+
verdict.verdict # "supported" | "refuted" | "inconclusive" (deterministic)
|
|
68
|
+
verdict.evidence_level # "deterministic"
|
|
69
|
+
should_promote_dead_end(verdict) # True only for a deterministic refutation
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## What it owns (and what it doesn't)
|
|
73
|
+
|
|
74
|
+
**Owns — the decision discipline:**
|
|
75
|
+
- `MetricSpec` + the `__RESULT__` result contract (`parse_metrics_line`, `validate_metric_spec`)
|
|
76
|
+
- `DeterministicVerdict` / `measure` (metric vs target → supported / refuted / inconclusive)
|
|
77
|
+
- failure classification (`classify_run_failure` → `H2` / `H3` / `H4`)
|
|
78
|
+
- gates (`resolve_gates`, `check_gate`)
|
|
79
|
+
- policy (`decide`, `detect_plateau`, `should_promote_dead_end`)
|
|
80
|
+
- promotion record shapes (`DeadEndRecord`, `KnowhowRecord`, `approach_hash`, `should_skip`)
|
|
81
|
+
|
|
82
|
+
**Doesn't own — bind these via `ports.py` `Protocol`s to your own infra:**
|
|
83
|
+
`Spawn`, `Retriever`, `KnowledgeGraph`, `ExperimentRunner`, `Store`.
|
|
84
|
+
|
|
85
|
+
## Verdict authority
|
|
86
|
+
|
|
87
|
+
`DeterministicVerdict` is the default and the reason this package exists. Other
|
|
88
|
+
strategies (an LLM judge, an exit-code check) can be plugged in via the
|
|
89
|
+
`VerdictStrategy` protocol, but **only a deterministic refutation auto-promotes a
|
|
90
|
+
dead-end** — non-deterministic verdicts are advisory. Every verdict records its
|
|
91
|
+
`strategy` and `evidence_level`, so the decision trail stays auditable.
|
|
92
|
+
|
|
93
|
+
## Development
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
pip install -e ".[dev]"
|
|
97
|
+
pytest -q --cov=autoresearch_core
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
|
|
102
|
+
MIT © Cameleon X — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# autoresearch-core
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/ca1773130n/autoresearch-core/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI version](https://pypi.org/project/autoresearch-core/)
|
|
5
|
+
[Supported Python versions](https://pypi.org/project/autoresearch-core/)
|
|
6
|
+
[License: MIT](LICENSE)
|
|
7
|
+
|
|
8
|
+
A tiny, **pure-Python decision-contracts library** for autoresearch / agentic
|
|
9
|
+
loops: a *deterministic* verdict (metric / comparator / target), failure
|
|
10
|
+
classification, gates, and promotion record shapes — the disciplined decision
|
|
11
|
+
core, with **zero runtime dependencies** and **no I/O**.
|
|
12
|
+
|
|
13
|
+
You bring the loop, the retrieval, the runner, and the storage; you bind them to
|
|
14
|
+
the library's `Protocol`s and call `measure` / `decide` / `should_promote_dead_end`
|
|
15
|
+
at your decision points. The verdict logic is parity-tested against the GRD
|
|
16
|
+
autoresearch loop.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install autoresearch-core
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Requires Python 3.11+. No runtime dependencies.
|
|
25
|
+
|
|
26
|
+
## Quickstart
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from autoresearch_core import (
|
|
30
|
+
MetricSpec, ExperimentResult, measure, parse_metrics_line, should_promote_dead_end,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
spec = MetricSpec(metric_key="recall_at_10", comparator=">=", target=0.8)
|
|
34
|
+
|
|
35
|
+
# An experiment prints `__RESULT__ {"recall_at_10": 0.83}` on stdout:
|
|
36
|
+
metrics = parse_metrics_line(stdout) # -> {"recall_at_10": 0.83}
|
|
37
|
+
verdict = measure(spec, ExperimentResult(metrics=metrics, exit_code=0))
|
|
38
|
+
|
|
39
|
+
verdict.verdict # "supported" | "refuted" | "inconclusive" (deterministic)
|
|
40
|
+
verdict.evidence_level # "deterministic"
|
|
41
|
+
should_promote_dead_end(verdict) # True only for a deterministic refutation
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## What it owns (and what it doesn't)
|
|
45
|
+
|
|
46
|
+
**Owns — the decision discipline:**
|
|
47
|
+
- `MetricSpec` + the `__RESULT__` result contract (`parse_metrics_line`, `validate_metric_spec`)
|
|
48
|
+
- `DeterministicVerdict` / `measure` (metric vs target → supported / refuted / inconclusive)
|
|
49
|
+
- failure classification (`classify_run_failure` → `H2` / `H3` / `H4`)
|
|
50
|
+
- gates (`resolve_gates`, `check_gate`)
|
|
51
|
+
- policy (`decide`, `detect_plateau`, `should_promote_dead_end`)
|
|
52
|
+
- promotion record shapes (`DeadEndRecord`, `KnowhowRecord`, `approach_hash`, `should_skip`)
|
|
53
|
+
|
|
54
|
+
**Doesn't own — bind these via `ports.py` `Protocol`s to your own infra:**
|
|
55
|
+
`Spawn`, `Retriever`, `KnowledgeGraph`, `ExperimentRunner`, `Store`.
|
|
56
|
+
|
|
57
|
+
## Verdict authority
|
|
58
|
+
|
|
59
|
+
`DeterministicVerdict` is the default and the reason this package exists. Other
|
|
60
|
+
strategies (an LLM judge, an exit-code check) can be plugged in via the
|
|
61
|
+
`VerdictStrategy` protocol, but **only a deterministic refutation auto-promotes a
|
|
62
|
+
dead-end** — non-deterministic verdicts are advisory. Every verdict records its
|
|
63
|
+
`strategy` and `evidence_level`, so the decision trail stays auditable.
|
|
64
|
+
|
|
65
|
+
## Development
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install -e ".[dev]"
|
|
69
|
+
pytest -q --cov=autoresearch_core
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT © Cameleon X — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""autoresearch-core: pure-Python decision contracts for autoresearch loops."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.1"
|
|
4
|
+
|
|
5
|
+
from .types import (
|
|
6
|
+
Comparator, EvidenceLevel, ExperimentResult, FailureClass, GateCheck, GateState,
|
|
7
|
+
Hypothesis, MetricSpec, Takeaway, Verdict, VerdictRecord,
|
|
8
|
+
)
|
|
9
|
+
from .contract import parse_metrics_line, validate_metric_spec
|
|
10
|
+
from .failures import classify_run_failure
|
|
11
|
+
from .verdict import compare, DeterministicVerdict, VerdictStrategy
|
|
12
|
+
from .gates import resolve_gates, check_gate
|
|
13
|
+
from .policy import (
|
|
14
|
+
decide_branch, should_terminate, detect_plateau, should_promote_dead_end,
|
|
15
|
+
measure, decide,
|
|
16
|
+
)
|
|
17
|
+
from .promote import (
|
|
18
|
+
DeadEndRecord, KnowhowRecord, approach_hash, build_dead_end_record, should_skip,
|
|
19
|
+
)
|
|
20
|
+
from .ports import Spawn, Retriever, KnowledgeGraph, ExperimentRunner, Store
|
|
21
|
+
|
|
22
|
+
# Explicit public API: every name listed here is imported from a submodule
# above, grouped below in the same order as those imports.
__all__ = [
    # .types
    "Comparator", "EvidenceLevel", "ExperimentResult", "FailureClass", "GateCheck", "GateState",
    "Hypothesis", "MetricSpec", "Takeaway", "Verdict", "VerdictRecord",
    # .contract / .failures
    "parse_metrics_line", "validate_metric_spec", "classify_run_failure",
    # .verdict / .gates
    "compare", "DeterministicVerdict", "VerdictStrategy", "resolve_gates", "check_gate",
    # .policy
    "decide_branch", "should_terminate", "detect_plateau", "should_promote_dead_end",
    "measure", "decide",
    # .promote
    "DeadEndRecord", "KnowhowRecord", "approach_hash", "build_dead_end_record", "should_skip",
    # .ports
    "Spawn", "Retriever", "KnowledgeGraph", "ExperimentRunner", "Store",
]
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""The machine-readable experiment-result contract. Parity with GRD runner.ts."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from .types import Comparator, MetricSpec
|
|
9
|
+
|
|
10
|
+
# Captures the JSON object that follows the `__RESULT__` marker. `.` does not
# match newlines, so the object must sit on the marker's line; the greedy `.*`
# runs to the last `}` on that line.
_RESULT_RE = re.compile(r"__RESULT__\s*(\{.*\})")
_COMPARATORS: tuple[Comparator, ...] = (">=", "<=", ">", "<", "==")


def _reject_constant(token: str) -> float:
    """`json.loads` parse_constant hook: refuse NaN / Infinity / -Infinity.

    GRD parity: JS JSON.parse rejects these tokens, so mirror that by raising.
    """
    raise ValueError(f"non-JSON constant: {token}")


def _finite_float(value: object) -> float | None:
    """Return `value` as a finite float, or None if it is not one.

    Booleans are rejected explicitly (`bool` is an `int` subclass). Integers
    too large for a double make `float()` raise OverflowError; they are
    treated as non-finite instead of letting the exception escape.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    try:
        number = float(value)
    except OverflowError:
        return None
    return number if math.isfinite(number) else None


def parse_metrics_line(stdout: str) -> dict[str, float]:
    """Extract {metric: number} from the first `__RESULT__ {json}` occurrence.

    Mirrors GRD: non-numeric values are dropped. Booleans, non-finite floats
    (e.g. `1e999` parses to inf) and integers outside double range are
    dropped as well.

    Returns an empty dict when there is no marker, the payload is not valid
    JSON (including NaN/Infinity tokens or nesting too deep to parse), or the
    payload is not a JSON object. Never raises on malformed experiment output.
    """
    match = _RESULT_RE.search(stdout)
    if not match:
        return {}
    try:
        obj = json.loads(match.group(1), parse_constant=_reject_constant)
    except (ValueError, TypeError, RecursionError):
        # RecursionError: pathologically nested arrays/objects in the payload.
        return {}
    if not isinstance(obj, dict):
        return {}
    out: dict[str, float] = {}
    for key, value in obj.items():
        number = _finite_float(value)
        if number is not None:
            out[str(key)] = number
    return out


def validate_metric_spec(spec: MetricSpec) -> None:
    """Raise ValueError if the spec cannot drive a deterministic verdict.

    Checks, in order: `metric_key` is a non-empty string, `comparator` is one
    of the supported operators, `target` is an int/float (not bool), and
    `target` is representable as a finite float. An integer too large for a
    double is reported as a ValueError rather than leaking OverflowError.
    """
    if not isinstance(spec.metric_key, str) or not spec.metric_key:
        raise ValueError("MetricSpec.metric_key must be a non-empty string")
    if spec.comparator not in _COMPARATORS:
        raise ValueError(f"MetricSpec.comparator must be one of {_COMPARATORS}")
    if not isinstance(spec.target, (int, float)) or isinstance(spec.target, bool):
        raise ValueError("MetricSpec.target must be numeric")
    if _finite_float(spec.target) is None:
        raise ValueError("MetricSpec.target must be finite")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Failure classification. Parity with GRD runner.ts classifyRunFailure."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
from .types import FailureClass
|
|
7
|
+
|
|
8
|
+
# Missing dependency / command signatures (case-insensitive).
_H2_RE = re.compile(
    r"command not found|not found:|ModuleNotFoundError|ImportError", re.IGNORECASE
)
# Missing file or permission signatures (case-insensitive).
_H3_RE = re.compile(
    r"No such file or directory|ENOENT|permission denied", re.IGNORECASE
)


def classify_run_failure(stderr: str, timed_out: bool) -> FailureClass:
    """Map a run's stderr and timeout flag onto a failure class.

    A timeout is always "H4". Otherwise the first matching signature wins,
    H2 (missing dependency) before H3 (missing file / permission). Empty
    stderr yields "none"; any other unrecognised stderr is "H4".
    """
    if timed_out:
        return "H4"
    for pattern, label in ((_H2_RE, "H2"), (_H3_RE, "H3")):
        if pattern.search(stderr):
            return label
    return "H4" if stderr else "none"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Gate model. Parity with GRD gates.ts.
|
|
2
|
+
|
|
3
|
+
The config sub-key is `experiment_execution` (NOT `execute`); it controls the
|
|
4
|
+
runtime gate named `execute`. Any value other than literal False leaves it on.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any, Literal, Mapping
|
|
9
|
+
|
|
10
|
+
from .types import GateCheck, GateState
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def resolve_gates(config: Mapping[str, Any], no_gates: bool) -> GateState:
    """Resolve the runtime gate switches from project config.

    `no_gates=True` turns both gates off regardless of config. Otherwise the
    `research_gates` sub-mapping is consulted: a gate is off only when its key
    (`experiment_execution` for `execute`, `kg_write` for `kg_write`) is the
    literal `False`; missing keys and any other value leave it on.

    A `research_gates` value that is not a mapping is ignored (both gates on).
    Any `Mapping` is honoured, not just `dict`, matching the `config`
    parameter's own type so read-only or custom mappings are not dropped.
    """
    if no_gates:
        return GateState(execute=False, kg_write=False)
    raw = config.get("research_gates")
    rg: Mapping[str, object] = raw if isinstance(raw, Mapping) else {}
    return GateState(
        execute=rg.get("experiment_execution") is not False,
        kg_write=rg.get("kg_write") is not False,
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def check_gate(gates: GateState, gate: Literal["execute", "kg_write"], approved: bool) -> GateCheck:
    """Decide whether to proceed or pause at `gate`. Parity with GRD checkGate
    (which also sets thread.status='paused'/pendingGate — the caller does that).

    Proceeds when the gate is switched off or the step is already approved;
    otherwise returns a paused check naming the pending gate.

    Unknown gate names raise ValueError (fail-fast; GRD would silently proceed).
    A name counts as a gate only if it resolves to a bool on `gates`, so
    incidental attributes such as "__class__" are rejected too.
    """
    current = getattr(gates, gate, None)
    if not isinstance(current, bool):
        raise ValueError(f"unknown gate: {gate!r}")
    if (not current) or approved:
        return GateCheck(proceed=True, pending_gate=None)
    return GateCheck(proceed=False, pending_gate=gate)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Pure decision policy + facades. Parity with GRD verdict.ts + promotion-authority rule."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .types import ExperimentResult, MetricSpec, Verdict, VerdictRecord
|
|
5
|
+
from .verdict import DeterministicVerdict, VerdictStrategy
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def decide_branch(verdict: Verdict) -> str:
    """Pick the next branch: "finalize" for a supported verdict, otherwise "revise"."""
    if verdict == "supported":
        return "finalize"
    return "revise"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def should_terminate(iteration: int, max_iterations: int, last_verdict: Verdict) -> tuple[bool, str]:
    """Return (done, status) for the loop.

    A supported verdict ends the loop with status "supported" (checked first,
    so it wins even when the budget is also spent); reaching `max_iterations`
    ends it with "exhausted"; otherwise the loop stays "active".
    """
    status = "active"
    if last_verdict == "supported":
        status = "supported"
    elif iteration >= max_iterations:
        status = "exhausted"
    return status != "active", status
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def detect_plateau(verdicts: list[Verdict], window: int = 3) -> bool:
    """True when the last `window` verdicts are all non-supported.

    Returns False while fewer than `window` verdicts exist.

    Raises:
        ValueError: if `window` is less than 1. A zero window would slice the
            whole history (`verdicts[-0:]`) and report a plateau for an empty
            list; a negative one would slice from the wrong end.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window!r}")
    if len(verdicts) < window:
        return False
    return all(v != "supported" for v in verdicts[-window:])
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def should_promote_dead_end(record: VerdictRecord) -> bool:
    """Promotion-authority rule: auto-promote a dead-end only when the verdict
    is "refuted" AND its evidence level is "deterministic"."""
    if record.verdict != "refuted":
        return False
    return record.evidence_level == "deterministic"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def measure(
    spec: MetricSpec, result: ExperimentResult, strategy: VerdictStrategy | None = None
) -> VerdictRecord:
    """Facade: evaluate a result under a verdict strategy (deterministic by default).

    The default `DeterministicVerdict` is used only when `strategy` is None.
    The check is an identity test rather than truthiness, so a supplied
    strategy object that happens to be falsy (e.g. defines `__len__` or
    `__bool__`) is still honoured instead of being silently replaced.
    """
    chosen = strategy if strategy is not None else DeterministicVerdict()
    return chosen.evaluate(spec, result)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def decide(iteration: int, max_iterations: int, verdict: Verdict) -> tuple[str, bool, str]:
    """Facade combining `decide_branch` and `should_terminate`.

    Returns (branch, done, status), where branch is "finalize" or "revise" and
    (done, status) is the termination decision for this iteration.
    """
    done, status = should_terminate(iteration, max_iterations, verdict)
    return decide_branch(verdict), done, status
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Adapter protocols. Each project binds these to its own infra (no impl here)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any, Protocol, Sequence, runtime_checkable
|
|
5
|
+
|
|
6
|
+
from .types import ExperimentResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@runtime_checkable
class Spawn(Protocol):
    """Structural type for an async callable that takes a prompt string and
    returns a string. No implementation is provided here."""

    async def __call__(self, prompt: str) -> str: ...
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@runtime_checkable
class Retriever(Protocol):
    """Structural type for an object with an async `retrieve(query, k)` method
    returning a sequence of dicts; `k` defaults to 8."""

    async def retrieve(self, query: str, k: int = 8) -> Sequence[dict[str, Any]]: ...
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@runtime_checkable
class KnowledgeGraph(Protocol):
    """Structural type for a findings store with two async methods:
    `prior_findings(query)` returns a sequence of dicts, and
    `write_finding(finding)` accepts one dict and returns nothing."""

    async def prior_findings(self, query: str) -> Sequence[dict[str, Any]]: ...
    async def write_finding(self, finding: dict[str, Any]) -> None: ...
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@runtime_checkable
class ExperimentRunner(Protocol):
    """Structural type for a synchronous runner: `run(plan, workdir)` takes a
    plan dict and a working-directory string and returns an `ExperimentResult`."""

    def run(self, plan: dict[str, Any], workdir: str) -> ExperimentResult: ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@runtime_checkable
class Store(Protocol):
    """Structural type for synchronous persistence of verdicts and dead-ends.

    `record` is typed `Any` in both save methods, so this protocol does not
    fix the record shape.
    """

    def save_verdict(self, thread_id: str, record: Any) -> None: ...
    def load_dead_end_hashes(self, scope: str) -> set[str]: ...
    def save_dead_end(self, scope: str, record: Any) -> None: ...
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Promotion record shapes (KNOWHOW / DEAD-ENDS). Shape only; projects persist."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from .policy import should_promote_dead_end
|
|
9
|
+
from .types import Hypothesis, VerdictRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class DeadEndRecord:
    """Immutable shape of a dead-end entry (shape only; projects persist it)."""

    approach_hash: str  # dedupe key for the approach statement
    statement: str
    reason: str
    iteration: int
    evidence_level: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class KnowhowRecord:
    """Immutable shape of a know-how entry (shape only; projects persist it)."""

    statement: str
    content: str
    iteration: int
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def approach_hash(statement: str) -> str:
    """Return a stable 16-hex-character key for deduplicating approaches.

    The statement is stripped, lower-cased and has each whitespace run
    collapsed to one space before hashing, so statements that differ only in
    case or spacing share a key. The key is the first 16 hex digits of the
    SHA-256 of the UTF-8 encoded normalized text.
    """
    trimmed = statement.strip().lower()
    normalized = re.sub(r"\s+", " ", trimmed)
    digest = hashlib.sha256(normalized.encode("utf-8"))
    return digest.hexdigest()[:16]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def build_dead_end_record(hypothesis: Hypothesis, record: VerdictRecord) -> DeadEndRecord:
    """Build the dead-end entry for a refuted hypothesis.

    Raises:
        ValueError: if `should_promote_dead_end(record)` is false, i.e. the
            verdict is not a deterministic refutation.
    """
    promotable = should_promote_dead_end(record)
    if not promotable:
        raise ValueError("build_dead_end_record requires a deterministic refutation")
    statement = hypothesis.statement
    return DeadEndRecord(
        approach_hash=approach_hash(statement),
        statement=statement,
        reason=record.detail,
        iteration=hypothesis.iteration,
        evidence_level=record.evidence_level,
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def should_skip(statement: str, dead_end_hashes: set[str]) -> bool:
    """Return True when the statement's approach hash is already in the
    dead-ends set, so the approach is not re-proposed."""
    fingerprint = approach_hash(statement)
    return fingerprint in dead_end_hashes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# PEP 561 marker: autoresearch-core ships inline type annotations.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Core data definitions for autoresearch-core. Pure data, no logic, no I/O."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
# Closed vocabularies used across the package, expressed as string Literals.

# Outcome of evaluating one experiment result against a MetricSpec.
Verdict = Literal["supported", "refuted", "inconclusive"]
# Comparison operators a MetricSpec may carry.
Comparator = Literal[">=", "<=", ">", "<", "=="]
# Run-failure classes; "none" means no failure was classified.
FailureClass = Literal["H2", "H3", "H4", "none"]
# How a verdict was established.
EvidenceLevel = Literal["deterministic", "exit_code", "llm"]
# Lifecycle states of a Hypothesis.
HypothesisStatus = Literal[
    "open", "testing", "supported", "refuted", "inconclusive", "superseded"
]
# Categories of Takeaway.
TakeawayKind = Literal[
    "success_pattern", "failure_root_cause", "constraint", "domain_fact", "tool_pattern"
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class MetricSpec:
    """The machine-readable verdict contract a hypothesis must carry.

    Read as "metric `metric_key` `comparator` `target`", e.g. recall >= 0.8.
    The dataclass itself performs no validation.
    """

    metric_key: str  # name of the metric to look up in a result's metrics
    comparator: Comparator
    target: float
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class ExperimentResult:
    """Frozen record of one experiment run.

    Only `metrics` and `exit_code` are required; the other fields have
    defaults. `metrics` is shallow-copied on construction, so later changes
    to the caller's mapping do not affect this instance.
    """

    metrics: dict[str, float]
    exit_code: int
    failure_class: FailureClass = "none"
    runner: str = "subprocess"
    duration_ms: int = 0
    stdout_excerpt: str = ""

    def __post_init__(self) -> None:
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is used to install the defensive copy.
        object.__setattr__(self, "metrics", dict(self.metrics))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True)
class VerdictRecord:
    """Frozen record of a verdict together with how it was reached."""

    verdict: Verdict
    strategy: str  # name of the strategy that produced the verdict
    evidence_level: EvidenceLevel
    detail: str  # human-readable explanation
    raw_evidence_ref: str | None = None  # optional; defaults to None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(frozen=True)
class Hypothesis:
    """Frozen record of a hypothesis.

    `status` defaults to "open"; `parent_id` and `verdict` default to None.
    """

    id: str
    iteration: int
    statement: str
    predicted_outcome: str
    status: HypothesisStatus = "open"
    parent_id: str | None = None
    verdict: Verdict | None = None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class Takeaway:
    """Frozen record of a takeaway; all fields are required."""

    kind: TakeawayKind
    content: str
    confidence: float  # NOTE(review): expected range is not defined here — confirm with callers
    evidence: str
    failure_class: FailureClass
    iteration: int
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True)
class GateState:
    """Frozen on/off switches for the two runtime gates; both default to on."""

    execute: bool = True
    kg_write: bool = True
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass(frozen=True)
class GateCheck:
    """Frozen result of a gate check."""

    proceed: bool
    pending_gate: str | None = None  # name of the gate being waited on, if any
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Deterministic verdict. Parity with GRD verdict.ts (compare + evaluateVerdict)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Protocol, runtime_checkable
|
|
5
|
+
|
|
6
|
+
from .types import Comparator, ExperimentResult, MetricSpec, VerdictRecord
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def compare(value: float, comparator: Comparator, target: float) -> bool:
|
|
10
|
+
if comparator == ">=":
|
|
11
|
+
return value >= target
|
|
12
|
+
if comparator == "<=":
|
|
13
|
+
return value <= target
|
|
14
|
+
if comparator == ">":
|
|
15
|
+
return value > target
|
|
16
|
+
if comparator == "<":
|
|
17
|
+
return value < target
|
|
18
|
+
if comparator == "==":
|
|
19
|
+
return value == target
|
|
20
|
+
return False
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@runtime_checkable
class VerdictStrategy(Protocol):
    """Structural type for a verdict strategy: a `name` attribute plus an
    `evaluate(spec, result)` method that returns a `VerdictRecord`."""

    name: str

    def evaluate(self, spec: MetricSpec, result: ExperimentResult) -> VerdictRecord: ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DeterministicVerdict:
    """Authoritative strategy: numeric metric vs target. Every record it
    produces carries evidence_level='deterministic'."""

    name = "deterministic"

    def evaluate(self, spec: MetricSpec, result: ExperimentResult) -> VerdictRecord:
        """Compare the reported metric against the spec's target.

        Returns "inconclusive" when the run exited non-zero or the metric was
        not reported; otherwise "supported" if the comparison holds and
        "refuted" if it does not.
        """
        if result.exit_code != 0:
            outcome = "inconclusive"
            detail = f"experiment run failed ({result.failure_class})"
        elif spec.metric_key not in result.metrics:
            outcome = "inconclusive"
            detail = f'metric "{spec.metric_key}" not reported'
        else:
            value = result.metrics[spec.metric_key]
            passed = compare(value, spec.comparator, spec.target)
            outcome = "supported" if passed else "refuted"
            label = "pass" if passed else "fail"
            detail = (
                f"{spec.metric_key}={_fmt(value)} {spec.comparator} {_fmt(spec.target)} "
                f"→ {label}"
            )
        return VerdictRecord(
            verdict=outcome,
            strategy=self.name,
            evidence_level="deterministic",
            detail=detail,
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _fmt(n: float) -> str:
|
|
62
|
+
"""Render 5 not 5.0; 0.9 stays 0.9. The `detail` string is human-readable and
|
|
63
|
+
NOT a byte-for-byte parity guarantee with GRD — only the verdict OUTCOME is."""
|
|
64
|
+
return str(int(n)) if isinstance(n, float) and n.is_integer() else str(n)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "autoresearch-core"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Decision-contracts library: deterministic verdict/gate/failure/promotion logic for autoresearch loops."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Cameleon X", email = "ca1773130n@gmail.com" }]
|
|
13
|
+
keywords = ["autoresearch", "decision-contracts", "verdict", "research-loop", "deterministic", "agent"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Scientific/Engineering",
|
|
24
|
+
"Typing :: Typed",
|
|
25
|
+
]
|
|
26
|
+
dependencies = []
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Homepage = "https://github.com/ca1773130n/autoresearch-core"
|
|
30
|
+
Repository = "https://github.com/ca1773130n/autoresearch-core"
|
|
31
|
+
Issues = "https://github.com/ca1773130n/autoresearch-core/issues"
|
|
32
|
+
Changelog = "https://github.com/ca1773130n/autoresearch-core/blob/main/CHANGELOG.md"
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
dev = ["pytest>=8.0", "pytest-cov>=5.0"]
|
|
36
|
+
|
|
37
|
+
[tool.hatch.build.targets.wheel]
|
|
38
|
+
packages = ["autoresearch_core"]
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from autoresearch_core.contract import parse_metrics_line, validate_metric_spec
|
|
3
|
+
from autoresearch_core.types import MetricSpec
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_parses_first_result_line_numeric_only():
    """The numeric field of the ``__RESULT__`` line is kept; the string-valued one is dropped."""
    stdout = 'noise\n__RESULT__ {"latency_ms": 180, "ok": "yes"}\nmore'
    parsed = parse_metrics_line(stdout)
    assert parsed == {"latency_ms": 180.0}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_excludes_bool_values():
    """JSON booleans must not be reported as metrics."""
    # bool subclasses int in Python, so ``true`` could otherwise sneak in as 1.0.
    parsed = parse_metrics_line('__RESULT__ {"passed": true, "n": 3}')
    assert parsed == {"n": 3.0}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_missing_marker_or_bad_json_returns_empty():
    """Output without the marker, or with an unparsable payload, yields no metrics."""
    for stdout in ("no marker here", "__RESULT__ {not json}"):
        assert parse_metrics_line(stdout) == {}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_rejects_nan_and_infinity_like_js():
    """``NaN`` / ``Infinity`` tokens make the whole payload invalid."""
    # JS JSON.parse refuses these tokens and GRD then returns {}; mirror that.
    for token in ("NaN", "Infinity"):
        assert parse_metrics_line('__RESULT__ {"x": ' + token + "}") == {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_first_of_multiple_result_lines():
    """When several ``__RESULT__`` lines are present, the first one is used."""
    stdout = '__RESULT__ {"a": 1}\n__RESULT__ {"a": 2}'
    assert parse_metrics_line(stdout) == {"a": 1.0}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_validate_metric_spec():
    """A well-formed spec passes; an empty key or an unknown comparator raises ValueError."""
    validate_metric_spec(MetricSpec("recall", ">=", 0.8))  # must not raise

    invalid_specs = (
        ("", ">=", 0.8),  # empty metric key
        ("x", "!=", 0.8),  # comparator outside the supported set
    )
    for args in invalid_specs:
        with pytest.raises(ValueError):
            validate_metric_spec(MetricSpec(*args))  # type: ignore[arg-type]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_greedy_regex_drops_line_with_trailing_junk_grd_parity():
    """A result line followed by more JSON-shaped text is dropped entirely."""
    # INTENTIONAL parity with GRD: the regex in its runner.ts is greedy (\{.*\}),
    # so trailing JSON-shaped junk is captured together with the payload,
    # JSON.parse fails on the whole span, and the result is discarded.
    # Do not switch to a non-greedy match — that would diverge from GRD.
    # The __RESULT__ contract is a single clean `{json}` per line.
    noisy_line = '__RESULT__ {"a": 1} junk {"b": 2}'
    assert parse_metrics_line(noisy_line) == {}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_validate_metric_spec_rejects_bool_target():
    """A boolean target is rejected with ValueError."""
    with pytest.raises(ValueError):
        validate_metric_spec(
            MetricSpec(metric_key="x", comparator=">=", target=True)  # type: ignore[arg-type]
        )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_parse_metrics_drops_overflow_to_inf():
    """A literal that overflows to +inf must not be reported."""
    # Python's JSON decoder turns 1e999 into float('inf'); the non-finite guard drops it.
    parsed = parse_metrics_line('__RESULT__ {"x": 1e999}')
    assert parsed == {}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_parse_metrics_drops_neg_inf():
    """A literal that overflows to -inf must not be reported either."""
    # -1e999 decodes to float('-inf').
    parsed = parse_metrics_line('__RESULT__ {"x": -1e999}')
    assert parsed == {}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_validate_metric_spec_rejects_inf_target():
    """An infinite target is rejected with ValueError."""
    with pytest.raises(ValueError):
        validate_metric_spec(MetricSpec(metric_key="x", comparator=">=", target=float("inf")))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_validate_metric_spec_rejects_nan_target():
    """A NaN target is rejected with ValueError."""
    with pytest.raises(ValueError):
        validate_metric_spec(MetricSpec(metric_key="x", comparator=">=", target=float("nan")))
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from autoresearch_core.failures import classify_run_failure
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_classify():
    """Table of (stderr text, timeout flag) inputs and the failure class each maps to."""
    cases = [
        ("", True, "H4"),  # timeout wins
        ("ModuleNotFoundError: x", False, "H2"),
        ("ImportError: bad", False, "H2"),
        ("bash: foo: command not found", False, "H2"),
        ("not found: foo", False, "H2"),
        ("ENOENT: no such file", False, "H3"),
        ("permission denied", False, "H3"),
        ("", False, "none"),  # empty stderr
        ("segfault boom", False, "H4"),  # other runtime
    ]
    for text, timed_out, expected in cases:
        assert classify_run_failure(text, timed_out) == expected
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_h2_takes_precedence_over_h3_when_both_present():
    """Text matching both an H2 and an H3 pattern is classified as H2."""
    # GRD evaluates the H2 patterns before the H3 ones.
    mixed = "ImportError and No such file or directory"
    assert classify_run_failure(mixed, False) == "H2"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from autoresearch_core.gates import resolve_gates, check_gate
|
|
2
|
+
from autoresearch_core.types import GateState
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_resolve_defaults_on():
    """An empty config leaves both gates on."""
    gates = resolve_gates({}, no_gates=False)
    assert gates.execute is True
    assert gates.kg_write is True
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_resolve_disable_execute_via_experiment_execution_key():
    """``research_gates.experiment_execution = False`` turns off only the execute gate."""
    config = {"research_gates": {"experiment_execution": False}}
    gates = resolve_gates(config, no_gates=False)
    assert gates.execute is False
    assert gates.kg_write is True
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_no_gates_disables_all():
    """``no_gates=True`` switches every gate off, even one the config enables."""
    config = {"research_gates": {"experiment_execution": True}}
    gates = resolve_gates(config, no_gates=True)
    assert gates.execute is False
    assert gates.kg_write is False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_check_gate_pause_vs_proceed():
    """An enabled gate pauses until approved; a disabled gate never pauses."""
    enabled = GateState(execute=True, kg_write=True)

    # Enabled and not approved: stop and report which gate is pending.
    paused = check_gate(enabled, "execute", approved=False)
    assert paused.proceed is False
    assert paused.pending_gate == "execute"

    # Enabled and approved: go ahead.
    approved = check_gate(enabled, "execute", approved=True)
    assert approved.proceed is True

    # Disabled gate: approval is irrelevant.
    disabled = check_gate(GateState(execute=False), "execute", approved=False)
    assert disabled.proceed is True
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_check_gate_unknown_gate_raises():
    """Asking about a gate name that does not exist raises ValueError."""
    import pytest

    default_gates = GateState()
    with pytest.raises(ValueError):
        check_gate(default_gates, "nope", approved=False)  # type: ignore[arg-type]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_resolve_gates_non_dict_research_gates_treated_as_empty():
    """A ``research_gates`` value that is not a dict falls back to the defaults (both on)."""
    for bogus in (True, 1, "yes"):
        gates = resolve_gates({"research_gates": bogus}, no_gates=False)
        assert gates.execute is True and gates.kg_write is True
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# tests/test_parity.py
|
|
2
|
+
"""Vectors transcribed from GRD lib/research/{verdict.ts,runner.ts,gates.ts}.
|
|
3
|
+
If GRD changes these behaviors, update here deliberately — do not loosen."""
|
|
4
|
+
import autoresearch_core as ac
|
|
5
|
+
from autoresearch_core.types import MetricSpec, ExperimentResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_public_surface_exports():
    """Every listed name must be an attribute of the top-level package."""
    expected_names = (
        # data types
        "MetricSpec", "ExperimentResult", "VerdictRecord", "GateState", "GateCheck",
        # contract / failures / verdict
        "parse_metrics_line", "classify_run_failure", "compare",
        "DeterministicVerdict",
        # gates
        "resolve_gates", "check_gate",
        # policy
        "decide_branch", "should_terminate", "detect_plateau",
        "should_promote_dead_end", "measure", "decide",
        # promotion
        "approach_hash", "build_dead_end_record",
    )
    for name in expected_names:
        assert hasattr(ac, name), f"missing public export: {name}"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_end_to_end_supported_path():
    """Parse stdout, evaluate, branch: a met target finalizes and is not promoted."""
    metrics = ac.parse_metrics_line('log line\n__RESULT__ {"recall_at_10": 0.83}\n')
    record = ac.DeterministicVerdict().evaluate(
        MetricSpec("recall_at_10", ">=", 0.8),
        ExperimentResult(metrics=metrics, exit_code=0),
    )
    assert record.verdict == "supported"
    assert ac.decide_branch(record.verdict) == "finalize"
    assert ac.should_promote_dead_end(record) is False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_end_to_end_refuted_then_promote():
    """A missed target is refuted, leads to a revise branch, and is promoted as a dead end."""
    record = ac.DeterministicVerdict().evaluate(
        MetricSpec("latency_ms", "<", 200),
        ExperimentResult(metrics={"latency_ms": 300.0}, exit_code=0),
    )
    assert record.verdict == "refuted"
    assert ac.decide_branch(record.verdict) == "revise"
    assert ac.should_promote_dead_end(record) is True
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_end_to_end_failed_run_is_inconclusive():
    """A run with a non-zero exit code is inconclusive and never promoted."""
    failed_run = ExperimentResult(metrics={}, exit_code=127, failure_class="H2")
    record = ac.DeterministicVerdict().evaluate(MetricSpec("x", ">=", 1), failed_run)
    assert record.verdict == "inconclusive"
    # inconclusive never promotes
    assert ac.should_promote_dead_end(record) is False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_invalid_comparator_returns_false():
    """An unsupported comparator string makes ``compare`` return False instead of raising."""
    outcome = ac.compare(1, "!=", 1)  # type: ignore[arg-type]
    assert outcome is False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_exit_code_precedence_over_metrics():
    """A non-zero exit code wins: the verdict is inconclusive even if the metric would pass."""
    crashed = ExperimentResult(metrics={"x": 5.0}, exit_code=1, failure_class="H4")
    record = ac.DeterministicVerdict().evaluate(MetricSpec("x", ">=", 1), crashed)
    assert record.verdict == "inconclusive"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from autoresearch_core.policy import (
|
|
2
|
+
decide_branch, should_terminate, detect_plateau, should_promote_dead_end,
|
|
3
|
+
measure, decide,
|
|
4
|
+
)
|
|
5
|
+
from autoresearch_core.types import MetricSpec, ExperimentResult, VerdictRecord
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_decide_branch():
    """Only ``supported`` finalizes; every other verdict asks for a revision."""
    expected_branch = {
        "supported": "finalize",
        "refuted": "revise",
        "inconclusive": "revise",
    }
    for verdict, branch in expected_branch.items():
        assert decide_branch(verdict) == branch
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_should_terminate():
    """Terminate on a supported verdict or when the iteration budget is used up."""
    cases = [
        ((2, 8, "supported"), (True, "supported")),
        ((8, 8, "refuted"), (True, "exhausted")),
        ((3, 8, "refuted"), (False, "active")),
    ]
    for args, expected in cases:
        assert should_terminate(*args) == expected
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_detect_plateau():
    """A plateau needs a full window of verdicts with no ``supported`` among them."""
    # Fewer verdicts than the window: too few to call a plateau.
    too_few = detect_plateau(["refuted", "refuted"], window=3)
    assert too_few is False

    no_success = detect_plateau(["refuted", "inconclusive", "refuted"])
    assert no_success is True

    with_success = detect_plateau(["refuted", "supported", "refuted"])
    assert with_success is False
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_promotion_authority_deterministic_only():
    """Only a refutation backed by deterministic evidence may be promoted to a dead end."""
    deterministic_refutation = VerdictRecord("refuted", "deterministic", "deterministic", "x<y")
    assert should_promote_dead_end(deterministic_refutation) is True

    # An LLM refutation is advisory only.
    llm_refutation = VerdictRecord("refuted", "reviewer", "llm", "looks wrong")
    assert should_promote_dead_end(llm_refutation) is False

    # A supported verdict never promotes.
    deterministic_support = VerdictRecord("supported", "deterministic", "deterministic", "x>=y")
    assert should_promote_dead_end(deterministic_support) is False
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_measure_and_decide_facades():
    """``measure`` yields a deterministic record; ``decide`` returns a 3-tuple decision."""
    passing_run = ExperimentResult(metrics={"recall": 0.9}, exit_code=0)
    record = measure(MetricSpec("recall", ">=", 0.8), passing_run)
    assert record.verdict == "supported"
    assert record.evidence_level == "deterministic"

    assert decide(2, 8, record.verdict) == ("finalize", True, "supported")
    assert decide(3, 8, "refuted") == ("revise", False, "active")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from autoresearch_core.ports import ExperimentRunner, Store
|
|
2
|
+
from autoresearch_core.types import ExperimentResult
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FakeRunner:
    """Stand-in runner that ignores its arguments and reports a fixed successful result."""

    def run(self, plan: dict, workdir: str) -> ExperimentResult:
        """Return a canned result: metric ``x`` = 1.0 with exit code 0."""
        canned_metrics = {"x": 1.0}
        return ExperimentResult(metrics=canned_metrics, exit_code=0)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FakeStore:
    """In-memory stand-in store: records dead ends per scope and ignores verdicts."""

    def __init__(self):
        # scope -> dead-end records saved under that scope, in insertion order
        self.dead_ends: dict[str, list] = {}

    def save_verdict(self, thread_id: str, record) -> None:
        """Accept a verdict and discard it."""
        return None

    def load_dead_end_hashes(self, scope: str) -> set[str]:
        """Always report that no dead-end hashes are known."""
        return set()

    def save_dead_end(self, scope: str, record) -> None:
        """Append ``record`` to the list kept for ``scope``, creating the list on first use."""
        if scope not in self.dead_ends:
            self.dead_ends[scope] = []
        self.dead_ends[scope].append(record)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_fakes_satisfy_protocols():
    """The fakes pass ``isinstance`` checks against the port protocols."""
    runner, store = FakeRunner(), FakeStore()
    assert isinstance(runner, ExperimentRunner)
    assert isinstance(store, Store)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from autoresearch_core.promote import (
|
|
3
|
+
approach_hash, build_dead_end_record, should_skip, DeadEndRecord,
|
|
4
|
+
)
|
|
5
|
+
from autoresearch_core.types import Hypothesis, VerdictRecord
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_approach_hash_is_normalized_and_stable():
    """Case and surrounding whitespace do not affect the hash, which is 16 characters long."""
    messy = approach_hash(" Memoize The Tokenizer ")
    clean = approach_hash("memoize the tokenizer")
    assert messy == clean
    assert len(messy) == 16
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_build_dead_end_record():
    """A deterministic refutation becomes a DeadEndRecord carrying the hypothesis details."""
    hypothesis = Hypothesis(
        id="h1",
        iteration=2,
        statement="cache embeddings",
        predicted_outcome="faster",
    )
    refutation = VerdictRecord(
        "refuted", "deterministic", "deterministic", "latency=300 < 200 -> fail"
    )

    dead_end = build_dead_end_record(hypothesis, refutation)

    assert isinstance(dead_end, DeadEndRecord)
    assert dead_end.statement == "cache embeddings"
    assert dead_end.iteration == 2
    assert dead_end.evidence_level == "deterministic"
    assert dead_end.reason.endswith("fail")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_should_skip_against_known_hashes():
    """A statement is skipped only when its hash is already in the known set."""
    known = {approach_hash("cache embeddings")}
    # Same approach with different casing: already known.
    assert should_skip("Cache Embeddings", known) is True
    # A different approach: not known.
    assert should_skip("use a bloom filter", known) is False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_approach_hash_whitespace_normalization():
    """Internal runs of whitespace must collapse to a single space before hashing."""
    canonical = approach_hash("cache embeddings")
    variants = (
        # Run of plain spaces. Comparing "cache embeddings" with itself, as the
        # first assertion used to, is a tautology that passes even if
        # whitespace collapsing is broken.
        "cache   embeddings",
        "cache\t\tembeddings",  # run of tabs
        "cache\n embeddings",  # newline followed by a space
    )
    for variant in variants:
        assert approach_hash(variant) == canonical
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_build_dead_end_record_raises_on_non_deterministic_refutation():
    """Anything other than a deterministic refutation is refused with ValueError."""
    hypothesis = Hypothesis(
        id="h2",
        iteration=1,
        statement="use llm scoring",
        predicted_outcome="better",
    )
    not_promotable = (
        # Deterministic evidence, but the verdict is "supported" — not a refutation.
        VerdictRecord("supported", "deterministic", "deterministic", "ok"),
        # A refutation, but evidence_level is "llm" — not deterministic.
        VerdictRecord("refuted", "deterministic", "llm", "llm said no"),
    )
    for record in not_promotable:
        with pytest.raises(ValueError, match="deterministic refutation"):
            build_dead_end_record(hypothesis, record)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import pytest
|
|
3
|
+
from autoresearch_core.types import (
|
|
4
|
+
MetricSpec, ExperimentResult, VerdictRecord, Hypothesis, Takeaway, GateState,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
def test_dataclasses_construct_and_are_frozen():
    """The core dataclasses build with the expected defaults and reject attribute assignment."""
    spec = MetricSpec(metric_key="recall", comparator=">=", target=0.8)
    assert (spec.metric_key, spec.comparator, spec.target) == ("recall", ">=", 0.8)

    result = ExperimentResult(metrics={"recall": 0.9}, exit_code=0)
    assert (result.failure_class, result.runner) == ("none", "subprocess")

    record = VerdictRecord(
        verdict="supported",
        strategy="deterministic",
        evidence_level="deterministic",
        detail="ok",
    )
    assert record.raw_evidence_ref is None

    gates = GateState()
    assert gates.execute is True
    assert gates.kg_write is True

    # Frozen dataclass: assignment must fail.
    with pytest.raises(dataclasses.FrozenInstanceError):
        spec.target = 0.5  # type: ignore[misc]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_experiment_result_defensive_copy():
    """Changing the caller's dict after construction must not leak into ``.metrics``."""
    source = {"recall": 0.9, "latency_ms": 120.0}
    result = ExperimentResult(metrics=source, exit_code=0)

    # Mutate the original mapping: overwrite one key, add another.
    source.update({"recall": 0.0, "new_key": 999.0})

    assert result.metrics["recall"] == 0.9
    assert "new_key" not in result.metrics
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_experiment_result_metrics_is_still_frozen():
    """Rebinding ``metrics`` on an existing result is rejected."""
    # The defensive copy must not open a way to reassign the attribute.
    result = ExperimentResult(metrics={"x": 1.0}, exit_code=0)
    with pytest.raises(dataclasses.FrozenInstanceError):
        result.metrics = {}  # type: ignore[misc]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from autoresearch_core.verdict import compare, DeterministicVerdict
|
|
2
|
+
from autoresearch_core.types import MetricSpec, ExperimentResult
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def test_compare_all_operators():
    """One check per supported comparator."""
    cases = [
        (0.8, ">=", 0.8, True),
        (199, "<", 200, True),
        (5, "==", 5, True),
        (5, ">", 5, False),
        (5, "<=", 4, False),
    ]
    for value, comparator, target, expected in cases:
        assert compare(value, comparator, target) is expected
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_deterministic_supported_and_refuted():
    """A metric meeting the target is supported; one missing it is refuted."""
    strategy = DeterministicVerdict()
    spec = MetricSpec("recall", ">=", 0.8)

    passing = strategy.evaluate(spec, ExperimentResult(metrics={"recall": 0.9}, exit_code=0))
    assert passing.verdict == "supported"
    assert passing.evidence_level == "deterministic"
    assert passing.detail == "recall=0.9 >= 0.8 → pass"

    failing = strategy.evaluate(spec, ExperimentResult(metrics={"recall": 0.5}, exit_code=0))
    assert failing.verdict == "refuted"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_deterministic_inconclusive_paths():
    """A failed run and a missing metric both yield ``inconclusive`` with a telling detail."""
    strategy = DeterministicVerdict()
    spec = MetricSpec("recall", ">=", 0.8)

    # Non-zero exit code: the failure class shows up in the detail.
    failed_run = ExperimentResult(metrics={}, exit_code=1, failure_class="H2")
    failed = strategy.evaluate(spec, failed_run)
    assert failed.verdict == "inconclusive"
    assert "H2" in failed.detail

    # Clean exit, but the metric named by the spec was not reported.
    unrelated_run = ExperimentResult(metrics={"other": 1.0}, exit_code=0)
    missing = strategy.evaluate(spec, unrelated_run)
    assert missing.verdict == "inconclusive"
    assert "not reported" in missing.detail
|