autoresearch-core 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
# Continuous integration: run the test suite on every supported Python
# version, then check that the sdist and wheel build and pass `twine check`.
name: CI

on:
  push:
    branches: [main]
  pull_request:

# Least privilege: both jobs only need to read the repository contents.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining interpreter versions finish when one of them fails.
      fail-fast: false
      matrix:
        # Quoted so YAML keeps the versions as strings rather than floats.
        python-version: ["3.11", "3.12", "3.13"]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - run: python -m pip install --upgrade pip
      - run: pip install -e ".[dev]"
      # The job fails when total coverage drops below 95%.
      - run: pytest -q --cov=autoresearch_core --cov-report=term-missing --cov-fail-under=95

  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install build twine
      - run: python -m build
      - run: twine check dist/*
@@ -0,0 +1,43 @@
1
name: Publish to PyPI

# Publishes on a version tag (e.g. v0.1.1) via PyPI Trusted Publishing (OIDC).
# One-time setup on PyPI: project → Publishing → add a Trusted Publisher with
#   owner: ca1773130n       repo: autoresearch-core
#   workflow: publish.yml   environment: pypi
# No API token is stored anywhere.

on:
  push:
    tags: ["v*"]

# Read-only by default; the publish job below opts in to the single extra
# scope it needs.
permissions:
  contents: read

jobs:
  build:
    name: Build distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: python -m pip install --upgrade build
      - run: python -m build
      - uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
          # Fail in this job, not in the publish job, if nothing was built.
          if-no-files-found: error

  publish:
    name: Publish to PyPI
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/autoresearch-core
    permissions:
      id-token: write  # OIDC for Trusted Publishing
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,7 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .pytest_cache/
5
+ *.egg-info/
6
+ .coverage
7
+ dist/
@@ -0,0 +1,23 @@
1
+ # Changelog
2
+
3
+ All notable changes to `autoresearch-core` are documented here. The format
4
+ follows [Keep a Changelog](https://keepachangelog.com/), and the project
5
+ adheres to [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.1] - 2026-06-03
8
+ ### Fixed
9
+ - Reject non-finite metric values (`1e999` → `inf`, `nan`) in `parse_metrics_line`
10
+ and `validate_metric_spec`, matching JS `JSON.parse` semantics.
11
+ - `approach_hash` collapses internal whitespace before hashing.
12
+ - `build_dead_end_record` raises unless the verdict is a deterministic refutation.
13
+ - `resolve_gates` tolerates a non-mapping `research_gates` config value.
14
+ - `ExperimentResult` defensively copies `metrics`, so later mutation of the caller's dict cannot alter the frozen instance.
15
+
16
+ ## [0.1.0] - 2026-06-03
17
+ ### Added
18
+ - Initial release: `MetricSpec`, deterministic `Verdict` (`measure`), failure
19
+ classifier (H2/H3/H4), gate model, decision policy (`decide`,
20
+ `detect_plateau`, `should_promote_dead_end`), promotion record shapes, and the
21
+ adapter `Protocol`s (`Spawn`, `Retriever`, `KnowledgeGraph`, `ExperimentRunner`,
22
+ `Store`). Pure-Python, zero runtime dependencies; behaviour parity-tested
23
+ against the GRD autoresearch loop.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Cameleon X
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: autoresearch-core
3
+ Version: 0.1.1
4
+ Summary: Decision-contracts library: deterministic verdict/gate/failure/promotion logic for autoresearch loops.
5
+ Project-URL: Homepage, https://github.com/ca1773130n/autoresearch-core
6
+ Project-URL: Repository, https://github.com/ca1773130n/autoresearch-core
7
+ Project-URL: Issues, https://github.com/ca1773130n/autoresearch-core/issues
8
+ Project-URL: Changelog, https://github.com/ca1773130n/autoresearch-core/blob/main/CHANGELOG.md
9
+ Author-email: Cameleon X <ca1773130n@gmail.com>
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: agent,autoresearch,decision-contracts,deterministic,research-loop,verdict
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.11
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
26
+ Requires-Dist: pytest>=8.0; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # autoresearch-core
30
+
31
+ [![CI](https://github.com/ca1773130n/autoresearch-core/actions/workflows/ci.yml/badge.svg)](https://github.com/ca1773130n/autoresearch-core/actions/workflows/ci.yml)
32
+ [![PyPI](https://img.shields.io/pypi/v/autoresearch-core.svg)](https://pypi.org/project/autoresearch-core/)
33
+ [![Python](https://img.shields.io/pypi/pyversions/autoresearch-core.svg)](https://pypi.org/project/autoresearch-core/)
34
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
35
+
36
+ A tiny, **pure-Python decision-contracts library** for autoresearch / agentic
37
+ loops: a *deterministic* verdict (metric / comparator / target), failure
38
+ classification, gates, and promotion record shapes — the disciplined decision
39
+ core, with **zero runtime dependencies** and **no I/O**.
40
+
41
+ You bring the loop, the retrieval, the runner, and the storage; you bind them to
42
+ the library's `Protocol`s and call `measure` / `decide` / `should_promote_dead_end`
43
+ at your decision points. The verdict logic is parity-tested against the GRD
44
+ autoresearch loop.
45
+
46
+ ## Install
47
+
48
+ ```bash
49
+ pip install autoresearch-core
50
+ ```
51
+
52
+ Requires Python 3.11+. No runtime dependencies.
53
+
54
+ ## Quickstart
55
+
56
+ ```python
57
+ from autoresearch_core import (
58
+ MetricSpec, ExperimentResult, measure, parse_metrics_line, should_promote_dead_end,
59
+ )
60
+
61
+ spec = MetricSpec(metric_key="recall_at_10", comparator=">=", target=0.8)
62
+
63
+ # An experiment prints `__RESULT__ {"recall_at_10": 0.83}` on stdout:
64
+ metrics = parse_metrics_line(stdout) # -> {"recall_at_10": 0.83}
65
+ verdict = measure(spec, ExperimentResult(metrics=metrics, exit_code=0))
66
+
67
+ verdict.verdict # "supported" | "refuted" | "inconclusive" (deterministic)
68
+ verdict.evidence_level # "deterministic"
69
+ should_promote_dead_end(verdict) # True only for a deterministic refutation
70
+ ```
71
+
72
+ ## What it owns (and what it doesn't)
73
+
74
+ **Owns — the decision discipline:**
75
+ - `MetricSpec` + the `__RESULT__` result contract (`parse_metrics_line`, `validate_metric_spec`)
76
+ - `DeterministicVerdict` / `measure` (metric vs target → supported / refuted / inconclusive)
77
+ - failure classification (`classify_run_failure` → `H2` / `H3` / `H4`)
78
+ - gates (`resolve_gates`, `check_gate`)
79
+ - policy (`decide`, `detect_plateau`, `should_promote_dead_end`)
80
+ - promotion record shapes (`DeadEndRecord`, `KnowhowRecord`, `approach_hash`, `should_skip`)
81
+
82
+ **Doesn't own — bind these via `ports.py` `Protocol`s to your own infra:**
83
+ `Spawn`, `Retriever`, `KnowledgeGraph`, `ExperimentRunner`, `Store`.
84
+
85
+ ## Verdict authority
86
+
87
+ `DeterministicVerdict` is the default and the reason this package exists. Other
88
+ strategies (an LLM judge, an exit-code check) can be plugged in via the
89
+ `VerdictStrategy` protocol, but **only a deterministic refutation auto-promotes a
90
+ dead-end** — non-deterministic verdicts are advisory. Every verdict records its
91
+ `strategy` and `evidence_level`, so the decision trail stays auditable.
92
+
93
+ ## Development
94
+
95
+ ```bash
96
+ pip install -e ".[dev]"
97
+ pytest -q --cov=autoresearch_core
98
+ ```
99
+
100
+ ## License
101
+
102
+ MIT © Cameleon X — see [LICENSE](LICENSE).
@@ -0,0 +1,74 @@
1
+ # autoresearch-core
2
+
3
+ [![CI](https://github.com/ca1773130n/autoresearch-core/actions/workflows/ci.yml/badge.svg)](https://github.com/ca1773130n/autoresearch-core/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/autoresearch-core.svg)](https://pypi.org/project/autoresearch-core/)
5
+ [![Python](https://img.shields.io/pypi/pyversions/autoresearch-core.svg)](https://pypi.org/project/autoresearch-core/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
7
+
8
+ A tiny, **pure-Python decision-contracts library** for autoresearch / agentic
9
+ loops: a *deterministic* verdict (metric / comparator / target), failure
10
+ classification, gates, and promotion record shapes — the disciplined decision
11
+ core, with **zero runtime dependencies** and **no I/O**.
12
+
13
+ You bring the loop, the retrieval, the runner, and the storage; you bind them to
14
+ the library's `Protocol`s and call `measure` / `decide` / `should_promote_dead_end`
15
+ at your decision points. The verdict logic is parity-tested against the GRD
16
+ autoresearch loop.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ pip install autoresearch-core
22
+ ```
23
+
24
+ Requires Python 3.11+. No runtime dependencies.
25
+
26
+ ## Quickstart
27
+
28
+ ```python
29
+ from autoresearch_core import (
30
+ MetricSpec, ExperimentResult, measure, parse_metrics_line, should_promote_dead_end,
31
+ )
32
+
33
+ spec = MetricSpec(metric_key="recall_at_10", comparator=">=", target=0.8)
34
+
35
+ # An experiment prints `__RESULT__ {"recall_at_10": 0.83}` on stdout:
36
+ metrics = parse_metrics_line(stdout) # -> {"recall_at_10": 0.83}
37
+ verdict = measure(spec, ExperimentResult(metrics=metrics, exit_code=0))
38
+
39
+ verdict.verdict # "supported" | "refuted" | "inconclusive" (deterministic)
40
+ verdict.evidence_level # "deterministic"
41
+ should_promote_dead_end(verdict) # True only for a deterministic refutation
42
+ ```
43
+
44
+ ## What it owns (and what it doesn't)
45
+
46
+ **Owns — the decision discipline:**
47
+ - `MetricSpec` + the `__RESULT__` result contract (`parse_metrics_line`, `validate_metric_spec`)
48
+ - `DeterministicVerdict` / `measure` (metric vs target → supported / refuted / inconclusive)
49
+ - failure classification (`classify_run_failure` → `H2` / `H3` / `H4`)
50
+ - gates (`resolve_gates`, `check_gate`)
51
+ - policy (`decide`, `detect_plateau`, `should_promote_dead_end`)
52
+ - promotion record shapes (`DeadEndRecord`, `KnowhowRecord`, `approach_hash`, `should_skip`)
53
+
54
+ **Doesn't own — bind these via `ports.py` `Protocol`s to your own infra:**
55
+ `Spawn`, `Retriever`, `KnowledgeGraph`, `ExperimentRunner`, `Store`.
56
+
57
+ ## Verdict authority
58
+
59
+ `DeterministicVerdict` is the default and the reason this package exists. Other
60
+ strategies (an LLM judge, an exit-code check) can be plugged in via the
61
+ `VerdictStrategy` protocol, but **only a deterministic refutation auto-promotes a
62
+ dead-end** — non-deterministic verdicts are advisory. Every verdict records its
63
+ `strategy` and `evidence_level`, so the decision trail stays auditable.
64
+
65
+ ## Development
66
+
67
+ ```bash
68
+ pip install -e ".[dev]"
69
+ pytest -q --cov=autoresearch_core
70
+ ```
71
+
72
+ ## License
73
+
74
+ MIT © Cameleon X — see [LICENSE](LICENSE).
@@ -0,0 +1,31 @@
1
+ """autoresearch-core: pure-Python decision contracts for autoresearch loops."""
2
+
3
+ __version__ = "0.1.1"
4
+
5
+ from .types import (
6
+ Comparator, EvidenceLevel, ExperimentResult, FailureClass, GateCheck, GateState,
7
+ Hypothesis, MetricSpec, Takeaway, Verdict, VerdictRecord,
8
+ )
9
+ from .contract import parse_metrics_line, validate_metric_spec
10
+ from .failures import classify_run_failure
11
+ from .verdict import compare, DeterministicVerdict, VerdictStrategy
12
+ from .gates import resolve_gates, check_gate
13
+ from .policy import (
14
+ decide_branch, should_terminate, detect_plateau, should_promote_dead_end,
15
+ measure, decide,
16
+ )
17
+ from .promote import (
18
+ DeadEndRecord, KnowhowRecord, approach_hash, build_dead_end_record, should_skip,
19
+ )
20
+ from .ports import Spawn, Retriever, KnowledgeGraph, ExperimentRunner, Store
21
+
22
+ __all__ = [
23
+ "Comparator", "EvidenceLevel", "ExperimentResult", "FailureClass", "GateCheck", "GateState",
24
+ "Hypothesis", "MetricSpec", "Takeaway", "Verdict", "VerdictRecord",
25
+ "parse_metrics_line", "validate_metric_spec", "classify_run_failure",
26
+ "compare", "DeterministicVerdict", "VerdictStrategy", "resolve_gates", "check_gate",
27
+ "decide_branch", "should_terminate", "detect_plateau", "should_promote_dead_end",
28
+ "measure", "decide",
29
+ "DeadEndRecord", "KnowhowRecord", "approach_hash", "build_dead_end_record", "should_skip",
30
+ "Spawn", "Retriever", "KnowledgeGraph", "ExperimentRunner", "Store",
31
+ ]
@@ -0,0 +1,52 @@
1
+ """The machine-readable experiment-result contract. Parity with GRD runner.ts."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import math
6
+ import re
7
+
8
+ from .types import Comparator, MetricSpec
9
+
10
# Greedy on purpose (GRD parity): `.` does not cross newlines, so the capture
# spans from the first `{` to the last `}` on the marker's line.
_RESULT_RE = re.compile(r"__RESULT__\s*(\{.*\})")
_COMPARATORS: tuple[Comparator, ...] = (">=", "<=", ">", "<", "==")


def _reject_constant(token: str) -> float:
    """`json.loads` hook that refuses the NaN / Infinity / -Infinity tokens."""
    # GRD parity: JS JSON.parse rejects NaN/Infinity/-Infinity. Mirror that.
    raise ValueError(f"non-JSON constant: {token}")


def _is_finite_number(value: object) -> bool:
    """True for a non-bool int/float that is finite and fits in a float."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    try:
        return math.isfinite(value)
    except OverflowError:
        # An arbitrary-precision int beyond float range makes math.isfinite
        # raise instead of returning False; treat it as non-finite.
        return False


def parse_metrics_line(stdout: str) -> dict[str, float]:
    """Extract {metric: number} from the first `__RESULT__ {json}` occurrence.

    Mirrors GRD: non-numeric values are dropped. Python `bool` is an `int`
    subclass, so booleans are excluded explicitly. Non-finite values, and
    integers too large to be represented as a float, are dropped as well.

    Returns an empty dict when there is no marker, the payload is not valid
    JSON, or the payload is not a JSON object.
    """
    match = _RESULT_RE.search(stdout)
    if not match:
        return {}
    try:
        obj = json.loads(match.group(1), parse_constant=_reject_constant)
    except (ValueError, TypeError):
        return {}
    if not isinstance(obj, dict):
        return {}
    out: dict[str, float] = {}
    for key, value in obj.items():
        if _is_finite_number(value):
            out[str(key)] = float(value)
    return out


def validate_metric_spec(spec: MetricSpec) -> None:
    """Raise ValueError if the spec cannot drive a deterministic verdict.

    Checks, in order: `metric_key` is a non-empty string, `comparator` is one
    of the supported operators, and `target` is a finite non-bool number (an
    int too large to convert to a float counts as non-finite).
    """
    if not isinstance(spec.metric_key, str) or not spec.metric_key:
        raise ValueError("MetricSpec.metric_key must be a non-empty string")
    if spec.comparator not in _COMPARATORS:
        raise ValueError(f"MetricSpec.comparator must be one of {_COMPARATORS}")
    if isinstance(spec.target, bool) or not isinstance(spec.target, (int, float)):
        raise ValueError("MetricSpec.target must be numeric")
    if not _is_finite_number(spec.target):
        raise ValueError("MetricSpec.target must be finite")
@@ -0,0 +1,26 @@
1
+ """Failure classification. Parity with GRD runner.ts classifyRunFailure."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+
6
+ from .types import FailureClass
7
+
8
# stderr fragments that indicate a missing command or Python dependency.
_H2_RE = re.compile(
    r"command not found|not found:|ModuleNotFoundError|ImportError", re.IGNORECASE
)
# stderr fragments that indicate a missing file or a permission problem.
_H3_RE = re.compile(
    r"No such file or directory|ENOENT|permission denied", re.IGNORECASE
)


def classify_run_failure(stderr: str, timed_out: bool) -> FailureClass:
    """Map a run's stderr and timeout flag to a failure class.

    A timeout is always "H4". Otherwise the H2 pattern is tried before the H3
    pattern, empty stderr yields "none", and any other text falls back to "H4".
    """
    if not timed_out:
        for label, pattern in (("H2", _H2_RE), ("H3", _H3_RE)):
            if pattern.search(stderr):
                return label
        if not stderr:
            return "none"
    return "H4"
@@ -0,0 +1,33 @@
1
+ """Gate model. Parity with GRD gates.ts.
2
+
3
+ The config sub-key is `experiment_execution` (NOT `execute`); it controls the
4
+ runtime gate named `execute`. Any value other than literal False leaves it on.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Literal, Mapping
9
+
10
+ from .types import GateCheck, GateState
11
+
12
+
13
def resolve_gates(config: Mapping[str, Any], no_gates: bool) -> GateState:
    """Build the runtime GateState from a project config.

    `no_gates=True` switches both gates off regardless of the config.
    Otherwise each gate stays on unless its key under `research_gates` is the
    literal `False`: `experiment_execution` controls `execute`, and `kg_write`
    controls `kg_write`. A `research_gates` value that is not a mapping is
    ignored, which leaves both gates on.
    """
    if no_gates:
        return GateState(execute=False, kg_write=False)
    section = config.get("research_gates")
    # Accept any Mapping, not only dict, so read-only or custom mapping
    # configs are honoured instead of being silently treated as empty.
    rg: Mapping[str, object] = section if isinstance(section, Mapping) else {}
    return GateState(
        execute=rg.get("experiment_execution") is not False,
        kg_write=rg.get("kg_write") is not False,
    )
22
+
23
+
24
def check_gate(gates: GateState, gate: Literal["execute", "kg_write"], approved: bool) -> GateCheck:
    """Decide whether to proceed or pause at `gate`. Parity with GRD checkGate
    (which also sets thread.status='paused'/pendingGate — the caller does that).

    Proceeds when the gate is switched off or the caller has approved it;
    otherwise returns a pause that names the pending gate.

    Unknown gate names raise ValueError (fail-fast; GRD would silently proceed).
    """
    # Function-scope import keeps this block self-contained.
    from dataclasses import fields, is_dataclass

    # getattr alone would accept any attribute (e.g. "__class__" or "__doc__")
    # as a gate, so for dataclass gate states only declared fields count.
    if is_dataclass(gates) and gate not in {f.name for f in fields(gates)}:
        raise ValueError(f"unknown gate: {gate!r}")
    current = getattr(gates, gate, None)
    if current is None:
        raise ValueError(f"unknown gate: {gate!r}")
    if (not current) or approved:
        return GateCheck(proceed=True, pending_gate=None)
    return GateCheck(proceed=False, pending_gate=gate)
@@ -0,0 +1,45 @@
1
+ """Pure decision policy + facades. Parity with GRD verdict.ts + promotion-authority rule."""
2
+ from __future__ import annotations
3
+
4
+ from .types import ExperimentResult, MetricSpec, Verdict, VerdictRecord
5
+ from .verdict import DeterministicVerdict, VerdictStrategy
6
+
7
+
8
def decide_branch(verdict: Verdict) -> str:
    """Pick the next branch: "finalize" for a supported verdict, otherwise "revise"."""
    if verdict == "supported":
        return "finalize"
    return "revise"
11
+
12
+
13
def should_terminate(iteration: int, max_iterations: int, last_verdict: Verdict) -> tuple[bool, str]:
    """Return `(done, status)` for the loop.

    A supported verdict ends the loop with status "supported"; otherwise
    reaching the iteration budget ends it with "exhausted"; otherwise the loop
    continues with "active".
    """
    supported = last_verdict == "supported"
    if not supported and not (iteration >= max_iterations):
        return False, "active"
    return True, ("supported" if supported else "exhausted")
20
+
21
+
22
def detect_plateau(verdicts: list[Verdict], window: int = 3) -> bool:
    """True when the last `window` verdicts are all non-supported.

    Returns False while fewer than `window` verdicts exist.

    Raises:
        ValueError: if `window` is less than 1. A zero window would report a
            plateau vacuously (even with no verdicts at all), and a negative
            one would slice from the wrong end of the list.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if len(verdicts) < window:
        return False
    return all(v != "supported" for v in verdicts[-window:])
27
+
28
+
29
def should_promote_dead_end(record: VerdictRecord) -> bool:
    """Codex rule: a dead-end may be auto-promoted only when the verdict is
    "refuted" AND its evidence level is "deterministic"."""
    is_refutation = record.verdict == "refuted"
    return is_refutation and record.evidence_level == "deterministic"
32
+
33
+
34
def measure(
    spec: MetricSpec, result: ExperimentResult, strategy: VerdictStrategy | None = None
) -> VerdictRecord:
    """Facade: evaluate a result under a verdict strategy (deterministic by default).

    `DeterministicVerdict` is used only when `strategy` is None. The check is
    an identity test rather than a truthiness test, so a caller-supplied
    strategy object that happens to be falsy (for example one defining
    `__len__` or `__bool__`) is still honoured.
    """
    chosen = DeterministicVerdict() if strategy is None else strategy
    return chosen.evaluate(spec, result)
39
+
40
+
41
def decide(iteration: int, max_iterations: int, verdict: Verdict) -> tuple[str, bool, str]:
    """Facade returning `(branch, done, status)` for a verdict.

    `branch` comes from `decide_branch` (finalize or revise); `done` and
    `status` come from `should_terminate`.
    """
    done, status = should_terminate(iteration, max_iterations, verdict)
    return decide_branch(verdict), done, status
@@ -0,0 +1,34 @@
1
+ """Adapter protocols. Each project binds these to its own infra (no impl here)."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Protocol, Sequence, runtime_checkable
5
+
6
+ from .types import ExperimentResult
7
+
8
+
9
@runtime_checkable
class Spawn(Protocol):
    """Port: an async callable that takes a prompt string and returns a string."""

    async def __call__(self, prompt: str) -> str:
        ...
12
+
13
+
14
@runtime_checkable
class Retriever(Protocol):
    """Port: async retrieval of a sequence of dict records for a query.

    `k` defaults to 8. NOTE(review): presumably the maximum number of results;
    confirm against the binding project.
    """

    async def retrieve(self, query: str, k: int = 8) -> Sequence[dict[str, Any]]:
        ...
17
+
18
+
19
@runtime_checkable
class KnowledgeGraph(Protocol):
    """Port: async read and write access to findings stored as dicts."""

    async def prior_findings(self, query: str) -> Sequence[dict[str, Any]]:
        ...

    async def write_finding(self, finding: dict[str, Any]) -> None:
        ...
23
+
24
+
25
@runtime_checkable
class ExperimentRunner(Protocol):
    """Port: synchronously run a plan dict in `workdir`, returning an ExperimentResult."""

    def run(self, plan: dict[str, Any], workdir: str) -> ExperimentResult:
        ...
28
+
29
+
30
@runtime_checkable
class Store(Protocol):
    """Port: synchronous persistence for verdicts and dead-end records."""

    def save_verdict(self, thread_id: str, record: Any) -> None:
        ...

    def load_dead_end_hashes(self, scope: str) -> set[str]:
        ...

    def save_dead_end(self, scope: str, record: Any) -> None:
        ...
@@ -0,0 +1,48 @@
1
+ """Promotion record shapes (KNOWHOW / DEAD-ENDS). Shape only; projects persist."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import re
6
+ from dataclasses import dataclass
7
+
8
+ from .policy import should_promote_dead_end
9
+ from .types import Hypothesis, VerdictRecord
10
+
11
+
12
@dataclass(frozen=True)
class DeadEndRecord:
    """Immutable shape of a promoted dead-end (see `build_dead_end_record`)."""

    # Dedupe key computed by approach_hash() from the statement.
    approach_hash: str
    # The hypothesis statement that was refuted.
    statement: str
    # Taken from the verdict record's `detail`.
    reason: str
    # Iteration of the refuted hypothesis.
    iteration: int
    # Evidence level copied from the verdict record.
    evidence_level: str
19
+
20
+
21
@dataclass(frozen=True)
class KnowhowRecord:
    """Immutable shape of a KNOWHOW promotion record; persistence is left to the project."""

    statement: str
    content: str
    iteration: int
26
+
27
+
28
def approach_hash(statement: str) -> str:
    """Return a stable 16-hex-character fingerprint used to dedupe approaches.

    The statement is stripped, lower-cased, and has each whitespace run
    collapsed to a single space before its UTF-8 bytes are hashed with SHA-256.
    """
    trimmed = statement.strip().lower()
    normalized = re.sub(r"\s+", " ", trimmed)
    digest = hashlib.sha256(normalized.encode("utf-8"))
    return digest.hexdigest()[:16]
32
+
33
+
34
def build_dead_end_record(hypothesis: Hypothesis, record: VerdictRecord) -> DeadEndRecord:
    """Build the DeadEndRecord for a refuted hypothesis.

    Raises:
        ValueError: if `should_promote_dead_end(record)` is false, i.e. the
            verdict is not a deterministic refutation.
    """
    if should_promote_dead_end(record):
        statement = hypothesis.statement
        return DeadEndRecord(
            approach_hash=approach_hash(statement),
            statement=statement,
            reason=record.detail,
            iteration=hypothesis.iteration,
            evidence_level=record.evidence_level,
        )
    raise ValueError("build_dead_end_record requires a deterministic refutation")
44
+
45
+
46
def should_skip(statement: str, dead_end_hashes: set[str]) -> bool:
    """True when the statement's approach hash is already in the dead-ends set,
    so the approach should not be proposed again."""
    fingerprint = approach_hash(statement)
    return fingerprint in dead_end_hashes
@@ -0,0 +1 @@
1
+ # PEP 561 marker: autoresearch-core ships inline type annotations.
@@ -0,0 +1,79 @@
1
+ """Core data definitions for autoresearch-core. Pure data, no logic, no I/O."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Literal
6
+
7
# Outcome of evaluating one experiment result.
Verdict = Literal["supported", "refuted", "inconclusive"]
# Operators understood by `compare`.
Comparator = Literal[">=", "<=", ">", "<", "=="]
# Labels produced by `classify_run_failure`.
FailureClass = Literal["H2", "H3", "H4", "none"]
# How a verdict was established.
EvidenceLevel = Literal["deterministic", "exit_code", "llm"]
HypothesisStatus = Literal[
    "open",
    "testing",
    "supported",
    "refuted",
    "inconclusive",
    "superseded",
]
TakeawayKind = Literal[
    "success_pattern",
    "failure_root_cause",
    "constraint",
    "domain_fact",
    "tool_pattern",
]
17
+
18
+
19
@dataclass(frozen=True)
class MetricSpec:
    """The machine-readable verdict contract a hypothesis must carry."""

    # Name of the metric looked up in ExperimentResult.metrics.
    metric_key: str
    # Operator applied as `value <comparator> target`.
    comparator: Comparator
    # Threshold the reported metric value is compared against.
    target: float
25
+
26
+
27
@dataclass(frozen=True)
class ExperimentResult:
    """Outcome of one experiment run: reported metrics plus run metadata."""

    metrics: dict[str, float]
    exit_code: int
    failure_class: FailureClass = "none"
    runner: str = "subprocess"
    duration_ms: int = 0
    stdout_excerpt: str = ""

    def __post_init__(self) -> None:
        # Store a private copy so later changes to the caller's dict do not
        # show up here. The dataclass is frozen, so normal attribute
        # assignment is blocked and object.__setattr__ is used instead.
        snapshot = dict(self.metrics)
        object.__setattr__(self, "metrics", snapshot)
38
+
39
+
40
@dataclass(frozen=True)
class VerdictRecord:
    """A verdict together with the strategy and evidence level that produced it."""

    verdict: Verdict
    # Name of the strategy that produced the verdict.
    strategy: str
    evidence_level: EvidenceLevel
    # Human-readable explanation of the outcome.
    detail: str
    raw_evidence_ref: str | None = None
47
+
48
+
49
@dataclass(frozen=True)
class Hypothesis:
    """Immutable hypothesis record; new hypotheses default to status "open"."""

    id: str
    iteration: int
    # Text that approach_hash() fingerprints when a dead-end is recorded.
    statement: str
    predicted_outcome: str
    status: HypothesisStatus = "open"
    parent_id: str | None = None
    verdict: Verdict | None = None
58
+
59
+
60
@dataclass(frozen=True)
class Takeaway:
    """Immutable takeaway record; every field is required."""

    kind: TakeawayKind
    content: str
    confidence: float
    evidence: str
    failure_class: FailureClass
    iteration: int
68
+
69
+
70
@dataclass(frozen=True)
class GateState:
    """Which runtime gates are switched on; both default to on."""

    execute: bool = True
    kg_write: bool = True
74
+
75
+
76
@dataclass(frozen=True)
class GateCheck:
    """Result of a gate check: whether to proceed, and which gate is pending if not."""

    proceed: bool
    pending_gate: str | None = None
@@ -0,0 +1,64 @@
1
+ """Deterministic verdict. Parity with GRD verdict.ts (compare + evaluateVerdict)."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Protocol, runtime_checkable
5
+
6
+ from .types import Comparator, ExperimentResult, MetricSpec, VerdictRecord
7
+
8
+
9
+ def compare(value: float, comparator: Comparator, target: float) -> bool:
10
+ if comparator == ">=":
11
+ return value >= target
12
+ if comparator == "<=":
13
+ return value <= target
14
+ if comparator == ">":
15
+ return value > target
16
+ if comparator == "<":
17
+ return value < target
18
+ if comparator == "==":
19
+ return value == target
20
+ return False
21
+
22
+
23
@runtime_checkable
class VerdictStrategy(Protocol):
    """Pluggable verdict strategy: a `name` plus an `evaluate` method that turns
    a spec and a result into a VerdictRecord."""

    name: str

    def evaluate(self, spec: MetricSpec, result: ExperimentResult) -> VerdictRecord:
        ...
28
+
29
+
30
class DeterministicVerdict:
    """Authoritative strategy: numeric metric vs target. evidence_level='deterministic'."""

    name = "deterministic"

    def evaluate(self, spec: MetricSpec, result: ExperimentResult) -> VerdictRecord:
        """Compare the reported metric with the spec's target.

        A non-zero exit code, or a result that does not report the metric,
        yields "inconclusive". Otherwise the verdict is "supported" when the
        comparison holds and "refuted" when it does not.
        """
        if result.exit_code != 0:
            verdict = "inconclusive"
            detail = f"experiment run failed ({result.failure_class})"
        elif spec.metric_key not in result.metrics:
            verdict = "inconclusive"
            detail = f'metric "{spec.metric_key}" not reported'
        else:
            value = result.metrics[spec.metric_key]
            passed = compare(value, spec.comparator, spec.target)
            verdict = "supported" if passed else "refuted"
            detail = (
                f"{spec.metric_key}={_fmt(value)} {spec.comparator} {_fmt(spec.target)} "
                f"→ {'pass' if passed else 'fail'}"
            )
        return VerdictRecord(
            verdict=verdict,
            strategy=self.name,
            evidence_level="deterministic",
            detail=detail,
        )
59
+
60
+
61
+ def _fmt(n: float) -> str:
62
+ """Render 5 not 5.0; 0.9 stays 0.9. The `detail` string is human-readable and
63
+ NOT a byte-for-byte parity guarantee with GRD — only the verdict OUTCOME is."""
64
+ return str(int(n)) if isinstance(n, float) and n.is_integer() else str(n)
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "autoresearch-core"
7
+ version = "0.1.1"
8
+ description = "Decision-contracts library: deterministic verdict/gate/failure/promotion logic for autoresearch loops."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Cameleon X", email = "ca1773130n@gmail.com" }]
13
+ keywords = ["autoresearch", "decision-contracts", "verdict", "research-loop", "deterministic", "agent"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Scientific/Engineering",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = []
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/ca1773130n/autoresearch-core"
30
+ Repository = "https://github.com/ca1773130n/autoresearch-core"
31
+ Issues = "https://github.com/ca1773130n/autoresearch-core/issues"
32
+ Changelog = "https://github.com/ca1773130n/autoresearch-core/blob/main/CHANGELOG.md"
33
+
34
+ [project.optional-dependencies]
35
+ dev = ["pytest>=8.0", "pytest-cov>=5.0"]
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["autoresearch_core"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
@@ -0,0 +1,71 @@
1
+ import pytest
2
+ from autoresearch_core.contract import parse_metrics_line, validate_metric_spec
3
+ from autoresearch_core.types import MetricSpec
4
+
5
+
6
def test_parses_first_result_line_numeric_only():
    # Only the numeric entry survives; the string-valued "ok" is dropped.
    stdout = 'noise\n__RESULT__ {"latency_ms": 180, "ok": "yes"}\nmore'
    assert parse_metrics_line(stdout) == {"latency_ms": 180.0}
10
+
11
+
12
def test_excludes_bool_values():
    # bool is a subclass of int in Python; must not leak in as 1.0/0.0
    parsed = parse_metrics_line('__RESULT__ {"passed": true, "n": 3}')
    assert parsed == {"n": 3.0}
15
+
16
+
17
def test_missing_marker_or_bad_json_returns_empty():
    # No marker at all, then a marker followed by invalid JSON.
    for stdout in ("no marker here", "__RESULT__ {not json}"):
        assert parse_metrics_line(stdout) == {}
20
+
21
+
22
def test_rejects_nan_and_infinity_like_js():
    # JS JSON.parse rejects these tokens; GRD returns {} — match that.
    for token in ("NaN", "Infinity"):
        assert parse_metrics_line('__RESULT__ {"x": ' + token + "}") == {}
26
+
27
+
28
def test_first_of_multiple_result_lines():
    # Only the first marker line is used; the later one is ignored.
    stdout = '__RESULT__ {"a": 1}\n__RESULT__ {"a": 2}'
    assert parse_metrics_line(stdout) == {"a": 1.0}
30
+
31
+
32
def test_validate_metric_spec():
    validate_metric_spec(MetricSpec("recall", ">=", 0.8))  # no raise
    invalid_specs = (
        MetricSpec("", ">=", 0.8),
        MetricSpec("x", "!=", 0.8),  # type: ignore[arg-type]
    )
    for spec in invalid_specs:
        with pytest.raises(ValueError):
            validate_metric_spec(spec)
38
+
39
+
40
def test_greedy_regex_drops_line_with_trailing_junk_grd_parity():
    # INTENTIONAL: GRD's runner.ts regex is greedy (\{.*\}), so a line with
    # trailing JSON-shaped junk captures the whole span, JSON.parse fails, and
    # the result is dropped. We MATCH that behavior on purpose. Do not "fix" to
    # non-greedy — that would diverge from GRD. The __RESULT__ contract is one
    # clean `{json}` per line.
    stdout = '__RESULT__ {"a": 1} junk {"b": 2}'
    assert parse_metrics_line(stdout) == {}
47
+
48
+
49
def test_validate_metric_spec_rejects_bool_target():
    # A bool target must be rejected even though bool is an int subclass.
    spec = MetricSpec("x", ">=", True)  # type: ignore[arg-type]
    with pytest.raises(ValueError):
        validate_metric_spec(spec)
52
+
53
+
54
def test_parse_metrics_drops_overflow_to_inf():
    # 1e999 overflows to float('inf') in Python JSON; must be dropped (non-finite guard).
    parsed = parse_metrics_line('__RESULT__ {"x": 1e999}')
    assert parsed == {}
57
+
58
+
59
def test_parse_metrics_drops_neg_inf():
    # -1e999 overflows to float('-inf'); must also be dropped.
    parsed = parse_metrics_line('__RESULT__ {"x": -1e999}')
    assert parsed == {}
62
+
63
+
64
def test_validate_metric_spec_rejects_inf_target():
    # An infinite target must be rejected.
    spec = MetricSpec("x", ">=", float("inf"))
    with pytest.raises(ValueError):
        validate_metric_spec(spec)
67
+
68
+
69
def test_validate_metric_spec_rejects_nan_target():
    # A NaN target must be rejected.
    spec = MetricSpec("x", ">=", float("nan"))
    with pytest.raises(ValueError):
        validate_metric_spec(spec)
@@ -0,0 +1,18 @@
1
+ from autoresearch_core.failures import classify_run_failure
2
+
3
+
4
def test_classify():
    """Table of (stderr text, second positional flag, expected failure class)."""
    expectations = [
        ("", True, "H4"),  # timeout wins
        ("ModuleNotFoundError: x", False, "H2"),
        ("ImportError: bad", False, "H2"),
        ("bash: foo: command not found", False, "H2"),
        ("not found: foo", False, "H2"),
        ("ENOENT: no such file", False, "H3"),
        ("permission denied", False, "H3"),
        ("", False, "none"),  # empty stderr
        ("segfault boom", False, "H4"),  # other runtime
    ]
    for stderr_text, flag, expected_class in expectations:
        assert classify_run_failure(stderr_text, flag) == expected_class
14
+
15
+
16
def test_h2_takes_precedence_over_h3_when_both_present():
    # GRD runs its H2 check ahead of H3, so stderr matching both reports H2.
    stderr_matching_both = "ImportError and No such file or directory"
    failure_class = classify_run_failure(stderr_matching_both, False)
    assert failure_class == "H2"
@@ -0,0 +1,43 @@
1
+ from autoresearch_core.gates import resolve_gates, check_gate
2
+ from autoresearch_core.types import GateState
3
+
4
+
5
def test_resolve_defaults_on():
    """An empty config with ``no_gates=False`` resolves to both gates on."""
    resolved = resolve_gates({}, no_gates=False)
    assert resolved.execute is True
    assert resolved.kg_write is True
8
+
9
+
10
def test_resolve_disable_execute_via_experiment_execution_key():
    """``experiment_execution: False`` turns off only the execute gate."""
    config = {"research_gates": {"experiment_execution": False}}
    resolved = resolve_gates(config, no_gates=False)
    assert resolved.execute is False
    assert resolved.kg_write is True
13
+
14
+
15
def test_no_gates_disables_all():
    """``no_gates=True`` switches both gates off even if the config enables one."""
    config = {"research_gates": {"experiment_execution": True}}
    resolved = resolve_gates(config, no_gates=True)
    assert resolved.execute is False
    assert resolved.kg_write is False
18
+
19
+
20
def test_check_gate_pause_vs_proceed():
    """An enabled gate pauses until approved; a disabled gate proceeds unapproved."""
    all_on = GateState(execute=True, kg_write=True)

    # Enabled and not approved: paused, with the pending gate reported.
    unapproved = check_gate(all_on, "execute", approved=False)
    assert unapproved.proceed is False
    assert unapproved.pending_gate == "execute"

    # Enabled and approved: proceeds.
    approved = check_gate(all_on, "execute", approved=True)
    assert approved.proceed is True

    # Disabled gate: proceeds without approval.
    execute_off = GateState(execute=False)
    assert check_gate(execute_off, "execute", approved=False).proceed is True
26
+
27
+
28
def test_check_gate_unknown_gate_raises():
    """Passing a gate name of ``"nope"`` raises ValueError."""
    import pytest  # kept function-local, as in the original test

    with pytest.raises(ValueError):
        default_gates = GateState()
        check_gate(default_gates, "nope", approved=False)  # type: ignore[arg-type]
32
+
33
+
34
def test_resolve_gates_non_dict_research_gates_treated_as_empty():
    # A research_gates value that is not a dict (a bool, an int, a string)
    # falls back to the defaults, i.e. both gates on.
    for non_dict_value in (True, 1, "yes"):
        resolved = resolve_gates({"research_gates": non_dict_value}, no_gates=False)
        assert resolved.execute is True
        assert resolved.kg_write is True
@@ -0,0 +1,58 @@
1
+ # tests/test_parity.py
2
+ """Vectors transcribed from GRD lib/research/{verdict.ts,runner.ts,gates.ts}.
3
+ If GRD changes these behaviors, update here deliberately — do not loosen."""
4
+ import autoresearch_core as ac
5
+ from autoresearch_core.types import MetricSpec, ExperimentResult
6
+
7
+
8
def test_public_surface_exports():
    """Every expected public name is an attribute of the top-level package."""
    type_names = ("MetricSpec", "ExperimentResult", "VerdictRecord", "GateState", "GateCheck")
    function_names = (
        "parse_metrics_line", "classify_run_failure", "compare",
        "DeterministicVerdict", "resolve_gates", "check_gate",
        "decide_branch", "should_terminate", "detect_plateau",
        "should_promote_dead_end", "measure", "decide",
        "approach_hash", "build_dead_end_record",
    )
    for name in type_names + function_names:
        assert hasattr(ac, name), f"missing public export: {name}"
18
+
19
+
20
def test_end_to_end_supported_path():
    """Parse stdout, evaluate, then branch, for a run that meets its target."""
    target = MetricSpec("recall_at_10", ">=", 0.8)
    raw_output = 'log line\n__RESULT__ {"recall_at_10": 0.83}\n'
    run = ExperimentResult(metrics=ac.parse_metrics_line(raw_output), exit_code=0)

    record = ac.DeterministicVerdict().evaluate(target, run)

    assert record.verdict == "supported"
    assert ac.decide_branch(record.verdict) == "finalize"
    assert ac.should_promote_dead_end(record) is False
29
+
30
+
31
def test_end_to_end_refuted_then_promote():
    """A run that misses its target is refuted, revised, and promoted as a dead end."""
    target = MetricSpec("latency_ms", "<", 200)
    run = ExperimentResult(metrics={"latency_ms": 300.0}, exit_code=0)

    record = ac.DeterministicVerdict().evaluate(target, run)

    assert record.verdict == "refuted"
    assert ac.decide_branch(record.verdict) == "revise"
    assert ac.should_promote_dead_end(record) is True
38
+
39
+
40
def test_end_to_end_failed_run_is_inconclusive():
    """A run with exit code 127 and no metrics is inconclusive and not promoted."""
    target = MetricSpec("x", ">=", 1)
    failed_run = ExperimentResult(metrics={}, exit_code=127, failure_class="H2")

    record = ac.DeterministicVerdict().evaluate(target, failed_run)

    assert record.verdict == "inconclusive"
    # inconclusive never promotes
    assert ac.should_promote_dead_end(record) is False
46
+
47
+
48
def test_invalid_comparator_returns_false():
    """``compare`` returns False for the ``!=`` comparator."""
    outcome = ac.compare(1, "!=", 1)  # type: ignore[arg-type]
    assert outcome is False
50
+
51
+
52
def test_exit_code_precedence_over_metrics():
    # A non-zero exit code yields "inconclusive" even though the metric is
    # present in the result.
    target = MetricSpec("x", ">=", 1)
    failed_with_metric = ExperimentResult(
        metrics={"x": 5.0}, exit_code=1, failure_class="H4"
    )
    record = ac.DeterministicVerdict().evaluate(target, failed_with_metric)
    assert record.verdict == "inconclusive"
@@ -0,0 +1,40 @@
1
+ from autoresearch_core.policy import (
2
+ decide_branch, should_terminate, detect_plateau, should_promote_dead_end,
3
+ measure, decide,
4
+ )
5
+ from autoresearch_core.types import MetricSpec, ExperimentResult, VerdictRecord
6
+
7
+
8
def test_decide_branch():
    """``supported`` finalizes; ``refuted`` and ``inconclusive`` both revise."""
    expected_branch = {
        "supported": "finalize",
        "refuted": "revise",
        "inconclusive": "revise",
    }
    for verdict, branch in expected_branch.items():
        assert decide_branch(verdict) == branch
12
+
13
+
14
def test_should_terminate():
    """Covers the supported, exhausted, and still-active outcomes."""
    cases = [
        ((2, 8, "supported"), (True, "supported")),
        ((8, 8, "refuted"), (True, "exhausted")),
        ((3, 8, "refuted"), (False, "active")),
    ]
    for call_args, expected in cases:
        assert should_terminate(*call_args) == expected
18
+
19
+
20
def test_detect_plateau():
    """Plateau needs enough verdicts and no ``supported`` among them."""
    # Two verdicts against a window of three: too few.
    too_few = ["refuted", "refuted"]
    assert detect_plateau(too_few, window=3) is False

    no_support = ["refuted", "inconclusive", "refuted"]
    assert detect_plateau(no_support) is True

    with_support = ["refuted", "supported", "refuted"]
    assert detect_plateau(with_support) is False
24
+
25
+
26
def test_promotion_authority_deterministic_only():
    """Only a refuted verdict with deterministic evidence promotes a dead end."""
    deterministic_refuted = VerdictRecord("refuted", "deterministic", "deterministic", "x<y")
    llm_refuted = VerdictRecord("refuted", "reviewer", "llm", "looks wrong")
    deterministic_supported = VerdictRecord("supported", "deterministic", "deterministic", "x>=y")

    assert should_promote_dead_end(deterministic_refuted) is True
    # advisory only
    assert should_promote_dead_end(llm_refuted) is False
    # supported never promotes
    assert should_promote_dead_end(deterministic_supported) is False
33
+
34
+
35
def test_measure_and_decide_facades():
    """``measure`` returns a verdict record; ``decide`` returns a 3-tuple."""
    target = MetricSpec("recall", ">=", 0.8)
    passing_run = ExperimentResult(metrics={"recall": 0.9}, exit_code=0)

    record = measure(target, passing_run)
    assert record.verdict == "supported"
    assert record.evidence_level == "deterministic"

    assert decide(2, 8, record.verdict) == ("finalize", True, "supported")
    assert decide(3, 8, "refuted") == ("revise", False, "active")
@@ -0,0 +1,26 @@
1
+ from autoresearch_core.ports import ExperimentRunner, Store
2
+ from autoresearch_core.types import ExperimentResult
3
+
4
+
5
class FakeRunner:
    """Stand-in runner whose ``run`` always returns the same fixed result."""

    def run(self, plan: dict, workdir: str) -> ExperimentResult:
        # Both arguments are ignored; the result is metric x=1.0 with exit code 0.
        canned_result = ExperimentResult(metrics={"x": 1.0}, exit_code=0)
        return canned_result
8
+
9
+
10
class FakeStore:
    """In-memory stand-in store that records dead ends and nothing else."""

    def __init__(self):
        # Maps scope -> dead-end records saved under that scope, in call order.
        self.dead_ends: dict[str, list] = {}

    def save_verdict(self, thread_id: str, record) -> None:
        # Verdicts are discarded.
        return None

    def load_dead_end_hashes(self, scope: str) -> set[str]:
        # Always empty, whatever has been saved.
        return set()

    def save_dead_end(self, scope: str, record) -> None:
        if scope not in self.dead_ends:
            self.dead_ends[scope] = []
        self.dead_ends[scope].append(record)
22
+
23
+
24
def test_fakes_satisfy_protocols():
    """Both fakes pass ``isinstance`` checks against the port types."""
    runner, store = FakeRunner(), FakeStore()
    assert isinstance(runner, ExperimentRunner)
    assert isinstance(store, Store)
@@ -0,0 +1,47 @@
1
+ import pytest
2
+ from autoresearch_core.promote import (
3
+ approach_hash, build_dead_end_record, should_skip, DeadEndRecord,
4
+ )
5
+ from autoresearch_core.types import Hypothesis, VerdictRecord
6
+
7
+
8
def test_approach_hash_is_normalized_and_stable():
    """Case and surrounding whitespace do not change the hash, which has length 16."""
    padded_mixed_case = approach_hash(" Memoize The Tokenizer ")
    canonical = approach_hash("memoize the tokenizer")
    assert padded_mixed_case == canonical
    assert len(padded_mixed_case) == 16
12
+
13
+
14
def test_build_dead_end_record():
    """The record carries the hypothesis statement/iteration and the verdict's evidence."""
    hypothesis = Hypothesis(
        id="h1",
        iteration=2,
        statement="cache embeddings",
        predicted_outcome="faster",
    )
    refutation = VerdictRecord(
        "refuted", "deterministic", "deterministic", "latency=300 < 200 -> fail"
    )

    dead_end = build_dead_end_record(hypothesis, refutation)

    assert isinstance(dead_end, DeadEndRecord)
    assert dead_end.statement == "cache embeddings"
    assert dead_end.iteration == 2
    assert dead_end.evidence_level == "deterministic"
    assert dead_end.reason.endswith("fail")
22
+
23
+
24
def test_should_skip_against_known_hashes():
    """A statement is skipped when its hash is already in the known set."""
    known_hashes = {approach_hash("cache embeddings")}
    # Differs from the hashed statement only by letter case.
    assert should_skip("Cache Embeddings", known_hashes) is True
    assert should_skip("use a bloom filter", known_hashes) is False
28
+
29
+
30
def test_approach_hash_whitespace_normalization():
    """Internal whitespace runs must hash the same as a single space.

    Covers a run of spaces, a run of tabs, and a newline followed by a space.
    """
    canonical = approach_hash("cache embeddings")
    # Each variant separates the two words with a different whitespace run.
    # The first one must contain more than one space: with a single space it
    # would compare the canonical input against itself and check nothing.
    variants = (
        "cache   embeddings",
        "cache\t\tembeddings",
        "cache\n embeddings",
    )
    for variant in variants:
        assert approach_hash(variant) == canonical
35
+
36
+
37
def test_build_dead_end_record_raises_on_non_deterministic_refutation():
    """Building a dead-end record raises unless the verdict is a deterministic refutation."""
    hypothesis = Hypothesis(
        id="h2",
        iteration=1,
        statement="use llm scoring",
        predicted_outcome="better",
    )

    # Case 1: deterministic evidence, but the verdict is "supported" rather
    # than "refuted".
    supported_record = VerdictRecord("supported", "deterministic", "deterministic", "ok")
    with pytest.raises(ValueError, match="deterministic refutation"):
        build_dead_end_record(hypothesis, supported_record)

    # Case 2: verdict is "refuted", but evidence_level is "llm" rather than
    # "deterministic".
    llm_record = VerdictRecord("refuted", "deterministic", "llm", "llm said no")
    with pytest.raises(ValueError, match="deterministic refutation"):
        build_dead_end_record(hypothesis, llm_record)
@@ -0,0 +1,35 @@
1
+ import dataclasses
2
+ import pytest
3
+ from autoresearch_core.types import (
4
+ MetricSpec, ExperimentResult, VerdictRecord, Hypothesis, Takeaway, GateState,
5
+ )
6
+
7
def test_dataclasses_construct_and_are_frozen():
    """Keyword construction works, defaults are as expected, and assignment is rejected."""
    metric_spec = MetricSpec(metric_key="recall", comparator=">=", target=0.8)
    assert metric_spec.metric_key == "recall"
    assert metric_spec.comparator == ">="
    assert metric_spec.target == 0.8

    # Fields left unset take their defaults.
    result = ExperimentResult(metrics={"recall": 0.9}, exit_code=0)
    assert result.failure_class == "none"
    assert result.runner == "subprocess"

    record = VerdictRecord(
        verdict="supported",
        strategy="deterministic",
        evidence_level="deterministic",
        detail="ok",
    )
    assert record.raw_evidence_ref is None

    default_gates = GateState()
    assert default_gates.execute is True
    assert default_gates.kg_write is True

    # Assigning to a field must raise FrozenInstanceError.
    with pytest.raises(dataclasses.FrozenInstanceError):
        metric_spec.target = 0.5  # type: ignore[misc]
19
+
20
+
21
def test_experiment_result_defensive_copy():
    """Changes to the caller's dict after construction do not reach ``.metrics``."""
    caller_dict = {"recall": 0.9, "latency_ms": 120.0}
    result = ExperimentResult(metrics=caller_dict, exit_code=0)

    # Overwrite an existing key and add a new one on the caller's side.
    caller_dict["recall"] = 0.0
    caller_dict["new_key"] = 999.0

    assert result.metrics["recall"] == 0.9
    assert "new_key" not in result.metrics
29
+
30
+
31
def test_experiment_result_metrics_is_still_frozen():
    # Even with the defensive copy in place, rebinding the ``metrics``
    # attribute must still raise FrozenInstanceError.
    result = ExperimentResult(metrics={"x": 1.0}, exit_code=0)
    with pytest.raises(dataclasses.FrozenInstanceError):
        result.metrics = {}  # type: ignore[misc]
@@ -0,0 +1,29 @@
1
+ from autoresearch_core.verdict import compare, DeterministicVerdict
2
+ from autoresearch_core.types import MetricSpec, ExperimentResult
3
+
4
+
5
def test_compare_all_operators():
    """One case per comparator: ``>=``, ``<``, ``==``, ``>``, ``<=``."""
    cases = [
        (0.8, ">=", 0.8, True),
        (199, "<", 200, True),
        (5, "==", 5, True),
        (5, ">", 5, False),
        (5, "<=", 4, False),
    ]
    for left, comparator, right, expected in cases:
        assert compare(left, comparator, right) is expected
11
+
12
+
13
def test_deterministic_supported_and_refuted():
    """A metric at or above target is supported; one below target is refuted."""
    strategy = DeterministicVerdict()
    target = MetricSpec("recall", ">=", 0.8)

    passing_run = ExperimentResult(metrics={"recall": 0.9}, exit_code=0)
    passed = strategy.evaluate(target, passing_run)
    assert passed.verdict == "supported"
    assert passed.evidence_level == "deterministic"
    # The detail string is pinned exactly, including the arrow character.
    assert passed.detail == "recall=0.9 >= 0.8 → pass"

    failing_run = ExperimentResult(metrics={"recall": 0.5}, exit_code=0)
    failed = strategy.evaluate(target, failing_run)
    assert failed.verdict == "refuted"
21
+
22
+
23
def test_deterministic_inconclusive_paths():
    """A failed run and a missing metric are both inconclusive."""
    strategy = DeterministicVerdict()
    target = MetricSpec("recall", ">=", 0.8)

    # Non-zero exit code: the detail mentions the failure class.
    crashed_run = ExperimentResult(metrics={}, exit_code=1, failure_class="H2")
    crashed = strategy.evaluate(target, crashed_run)
    assert crashed.verdict == "inconclusive"
    assert "H2" in crashed.detail

    # Exit code 0, but the target metric is absent from the result.
    other_metric_run = ExperimentResult(metrics={"other": 1.0}, exit_code=0)
    absent = strategy.evaluate(target, other_metric_run)
    assert absent.verdict == "inconclusive"
    assert "not reported" in absent.detail