falsifyai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- falsifyai/__init__.py +3 -0
- falsifyai/cli/__init__.py +8 -0
- falsifyai/cli/diff.py +237 -0
- falsifyai/cli/errors.py +30 -0
- falsifyai/cli/main.py +105 -0
- falsifyai/cli/render.py +196 -0
- falsifyai/cli/replay.py +76 -0
- falsifyai/cli/run.py +191 -0
- falsifyai/differential/__init__.py +0 -0
- falsifyai/execution/__init__.py +19 -0
- falsifyai/execution/adapter.py +16 -0
- falsifyai/execution/cache.py +38 -0
- falsifyai/execution/engine.py +35 -0
- falsifyai/execution/errors.py +10 -0
- falsifyai/execution/litellm_adapter.py +57 -0
- falsifyai/execution/models.py +52 -0
- falsifyai/falsifiability/__init__.py +9 -0
- falsifyai/falsifiability/score.py +49 -0
- falsifyai/invariants/__init__.py +35 -0
- falsifyai/invariants/base.py +99 -0
- falsifyai/invariants/contains.py +65 -0
- falsifyai/invariants/registry.py +35 -0
- falsifyai/invariants/semantic.py +110 -0
- falsifyai/oracles/__init__.py +0 -0
- falsifyai/perturbation/__init__.py +25 -0
- falsifyai/perturbation/base.py +91 -0
- falsifyai/perturbation/casing_variant.py +79 -0
- falsifyai/perturbation/registry.py +28 -0
- falsifyai/perturbation/typo_noise.py +158 -0
- falsifyai/replay/__init__.py +38 -0
- falsifyai/replay/in_memory_store.py +78 -0
- falsifyai/replay/models.py +114 -0
- falsifyai/replay/protocol.py +52 -0
- falsifyai/replay/serialize.py +218 -0
- falsifyai/replay/sqlite_store.py +191 -0
- falsifyai/reporting/__init__.py +0 -0
- falsifyai/session/__init__.py +0 -0
- falsifyai/spec/__init__.py +19 -0
- falsifyai/spec/errors.py +27 -0
- falsifyai/spec/loader.py +40 -0
- falsifyai/spec/materializer.py +172 -0
- falsifyai/spec/models.py +135 -0
- falsifyai/statistical/__init__.py +0 -0
- falsifyai/verdict/__init__.py +8 -0
- falsifyai/verdict/consistency.py +62 -0
- falsifyai/verdict/models.py +36 -0
- falsifyai/verdict/resolver.py +174 -0
- falsifyai/verdict/stratify.py +93 -0
- falsifyai-0.1.0.dist-info/METADATA +398 -0
- falsifyai-0.1.0.dist-info/RECORD +53 -0
- falsifyai-0.1.0.dist-info/WHEEL +4 -0
- falsifyai-0.1.0.dist-info/entry_points.txt +2 -0
- falsifyai-0.1.0.dist-info/licenses/LICENSE +201 -0
falsifyai/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""``falsifyai`` command-line interface.
|
|
2
|
+
|
|
3
|
+
The console script entry point ``falsifyai = "falsifyai.cli.main:main"`` is
|
|
4
|
+
wired in ``pyproject.toml``. Import the submodule directly
|
|
5
|
+
(``from falsifyai.cli.main import main``) — no re-export here, because doing
|
|
6
|
+
so would shadow the ``falsifyai.cli.main`` submodule with its ``main``
|
|
7
|
+
function and break ``import falsifyai.cli.main``.
|
|
8
|
+
"""
|
falsifyai/cli/diff.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""``falsifyai diff <baseline> <candidate>`` -- the launch wedge.
|
|
2
|
+
|
|
3
|
+
Loads two stored ``ReplayArtifact``s, compares them case-by-case, surfaces
|
|
4
|
+
verdict transitions in a compressed table, and exits **code 5 (REGRESSION)**
|
|
5
|
+
if any case regressed.
|
|
6
|
+
|
|
7
|
+
This is the differentiator per [plan.md §22.1](../../plan.md). Competitors
|
|
8
|
+
match an engine that runs perturbations; they do not flag model-migration
|
|
9
|
+
regressions with a single command.
|
|
10
|
+
|
|
11
|
+
**Invariants:**
|
|
12
|
+
|
|
13
|
+
- ``cmd_diff`` is strictly read-only. Never modifies either artifact.
|
|
14
|
+
- The diff does NOT re-resolve verdicts under the current resolver. The
|
|
15
|
+
verdicts compared are the ones assigned at each ``run`` time. Diff is a
|
|
16
|
+
consumer of already-resolved artifacts; the resolver stays untouched.
|
|
17
|
+
- Regression criterion is **verdict-class downgrade only**: STABLE -> FRAGILE,
|
|
18
|
+
STABLE -> CONSISTENTLY_WRONG, FRAGILE -> CONSISTENTLY_WRONG. No thresholds,
|
|
19
|
+
no per-stability deltas as regression signals. Predictable by design.
|
|
20
|
+
- Cases present in only one side are surfaced as ADDED / REMOVED but do NOT
|
|
21
|
+
trigger exit 5. Specs evolve legitimately.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import argparse
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from enum import Enum
|
|
27
|
+
|
|
28
|
+
from falsifyai.cli import render
|
|
29
|
+
from falsifyai.cli.errors import InfrastructureError
|
|
30
|
+
from falsifyai.replay.in_memory_store import InMemoryStore
|
|
31
|
+
from falsifyai.replay.models import ReplayArtifact
|
|
32
|
+
from falsifyai.replay.protocol import ReplayStore, SessionNotFoundError
|
|
33
|
+
from falsifyai.replay.sqlite_store import SQLiteStore
|
|
34
|
+
from falsifyai.verdict.models import Verdict
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TransitionKind(Enum):
    """Classification of one case's verdict change between two sessions.

    Each ``CaseTransition`` carries exactly one of these values in its
    ``transition_kind`` field.
    """

    UNCHANGED = "unchanged"
    IMPROVED = "improved"
    REGRESSED = "regressed"
    # The verdict changed, but the change is informational only and is not
    # treated as a regression.
    OTHER_CHANGE = "other_change"
    # The case is present in the candidate only.
    ADDED = "added"
    # The case is present in the baseline only.
    REMOVED = "removed"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass(frozen=True)
class CaseTransition:
    """A single diff row describing what happened to one ``case_id``.

    Instances are immutable. A verdict of ``None`` on one side means the case
    does not exist in that artifact.
    """

    case_id: str
    # ``None`` when the case is ADDED (absent from the baseline).
    baseline_verdict: Verdict | None
    # ``None`` when the case is REMOVED (absent from the candidate).
    candidate_verdict: Verdict | None
    baseline_stability_ci_low: float
    candidate_stability_ci_low: float
    transition_kind: TransitionKind
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class DiffReport:
    """Immutable result of comparing a baseline and a candidate artifact.

    Holds the two session ids, a flag telling whether the artifacts'
    ``materialized_hash`` values differ, the per-case transitions, and one
    counter per ``TransitionKind``.
    """

    baseline_session_id: str
    candidate_session_id: str
    # True when the two artifacts report different ``materialized_hash`` values.
    materialized_hash_mismatch: bool
    transitions: list[CaseTransition]
    # One tally per transition kind.
    regressed_count: int
    improved_count: int
    unchanged_count: int
    other_change_count: int
    added_count: int
    removed_count: int
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# Transition classification
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Regression table, keyed by the BASELINE verdict. The value is the set of
# candidate verdicts that count as a downgrade from that baseline. A baseline
# verdict with no entry here can never produce REGRESSED.
_REGRESSION_DOWNGRADES: dict[Verdict, frozenset[Verdict]] = {
    Verdict.STABLE: frozenset((Verdict.FRAGILE, Verdict.CONSISTENTLY_WRONG)),
    Verdict.FRAGILE: frozenset((Verdict.CONSISTENTLY_WRONG,)),
}

# Improvement table, keyed by the CANDIDATE verdict. The value is the set of
# baseline verdicts from which reaching that candidate counts as IMPROVED.
# STABLE is the only improvement target: moving to it from any of the listed
# worse verdicts is an improvement.
_IMPROVEMENT_TARGETS: dict[Verdict, frozenset[Verdict]] = {
    Verdict.STABLE: frozenset(
        (Verdict.FRAGILE, Verdict.CONSISTENTLY_WRONG, Verdict.INSUFFICIENT)
    ),
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _classify_transition(baseline: Verdict, candidate: Verdict) -> TransitionKind:
    """Map a (baseline, candidate) verdict pair to a ``TransitionKind``.

    Checks are applied in this order:

    1. identical verdicts -> UNCHANGED
    2. pair listed in ``_REGRESSION_DOWNGRADES`` -> REGRESSED
    3. pair listed in ``_IMPROVEMENT_TARGETS`` -> IMPROVED
    4. everything else -> OTHER_CHANGE (informational, not a regression)
    """
    if baseline is candidate:
        return TransitionKind.UNCHANGED

    nothing = frozenset()

    # Candidate verdicts that are a downgrade from this baseline.
    downgrades = _REGRESSION_DOWNGRADES.get(baseline, nothing)
    if candidate in downgrades:
        return TransitionKind.REGRESSED

    # Baseline verdicts from which this candidate counts as an improvement.
    worse_baselines = _IMPROVEMENT_TARGETS.get(candidate, nothing)
    if baseline in worse_baselines:
        return TransitionKind.IMPROVED

    return TransitionKind.OTHER_CHANGE
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def compute_diff(baseline: ReplayArtifact, candidate: ReplayArtifact) -> DiffReport:
    """Compare two artifacts case-by-case and summarise the result.

    Cases are paired by ``case_id`` and visited in sorted id order. If an id
    occurs more than once inside one artifact, the last occurrence wins. A
    case found on one side only becomes ADDED or REMOVED, with ``None`` and
    ``0.0`` filling in the verdict and CI-low of the missing side. Cases found
    on both sides are classified by ``_classify_transition``.

    Neither input is modified.
    """
    before = {case.case_id: case for case in baseline.case_results}
    after = {case.case_id: case for case in candidate.case_results}

    tally = dict.fromkeys(TransitionKind, 0)
    rows: list[CaseTransition] = []

    for case_id in sorted(before.keys() | after.keys()):
        old = before.get(case_id)
        new = after.get(case_id)

        if old is None:
            kind = TransitionKind.ADDED
        elif new is None:
            kind = TransitionKind.REMOVED
        else:
            kind = _classify_transition(old.verdict, new.verdict)

        rows.append(
            CaseTransition(
                case_id=case_id,
                baseline_verdict=None if old is None else old.verdict,
                candidate_verdict=None if new is None else new.verdict,
                baseline_stability_ci_low=0.0 if old is None else old.stability_ci_low,
                candidate_stability_ci_low=0.0 if new is None else new.stability_ci_low,
                transition_kind=kind,
            )
        )
        tally[kind] += 1

    hashes_differ = baseline.materialized_hash != candidate.materialized_hash
    return DiffReport(
        baseline_session_id=baseline.session_id,
        candidate_session_id=candidate.session_id,
        materialized_hash_mismatch=hashes_differ,
        transitions=rows,
        regressed_count=tally[TransitionKind.REGRESSED],
        improved_count=tally[TransitionKind.IMPROVED],
        unchanged_count=tally[TransitionKind.UNCHANGED],
        other_change_count=tally[TransitionKind.OTHER_CHANGE],
        added_count=tally[TransitionKind.ADDED],
        removed_count=tally[TransitionKind.REMOVED],
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
# CLI orchestration
|
|
192
|
+
# ---------------------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _build_store(store_path: str) -> ReplayStore:
    """Return the store for ``store_path``.

    The literal ``":memory:"`` selects an ``InMemoryStore``; any other value
    is handed to ``SQLiteStore``. Same selection rule as cli/run.py and
    cli/replay.py.
    """
    return InMemoryStore() if store_path == ":memory:" else SQLiteStore(store_path)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _load_artifact(store: ReplayStore, session_id: str, *, role: str) -> ReplayArtifact:
    """Fetch ``session_id`` from ``store``.

    ``role`` (e.g. ``"baseline"``) only labels the error message.

    Raises:
        InfrastructureError: if the store raises ``SessionNotFoundError``;
            the original exception is chained as the cause.
    """
    try:
        artifact = store.load_session(session_id)
    except SessionNotFoundError as exc:
        raise InfrastructureError(f"{role} session not found: {session_id}") from exc
    return artifact
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _diff_exit_code(report: DiffReport) -> int:
    """Return 5 (REGRESSION) when the report has a regressed case, else 0."""
    if report.regressed_count > 0:
        return 5
    return 0
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def cmd_diff(args: argparse.Namespace) -> int:
    """Run the ``diff`` subcommand and return its exit code.

    Reads ``args.store_path``, ``args.baseline_session_id`` and
    ``args.candidate_session_id``. Both artifacts are loaded from the same
    store, the store is closed, and only then is the diff computed and
    rendered.

    Returns:
        5 if any case regressed, otherwise 0.

    Raises:
        InfrastructureError: if either session is missing from the store.
    """
    store = _build_store(args.store_path)
    try:
        baseline_artifact = _load_artifact(store, args.baseline_session_id, role="baseline")
        candidate_artifact = _load_artifact(store, args.candidate_session_id, role="candidate")
    finally:
        # Not every store exposes close(); call it only when it exists.
        closer = getattr(store, "close", None)
        if callable(closer):
            closer()

    diff_report = compute_diff(baseline_artifact, candidate_artifact)
    render.render_diff(diff_report, store_path=args.store_path)
    return _diff_exit_code(diff_report)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
__all__ = [
|
|
232
|
+
"CaseTransition",
|
|
233
|
+
"DiffReport",
|
|
234
|
+
"TransitionKind",
|
|
235
|
+
"cmd_diff",
|
|
236
|
+
"compute_diff",
|
|
237
|
+
]
|
falsifyai/cli/errors.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""CLI-layer exception hierarchy.
|
|
2
|
+
|
|
3
|
+
The CLI catches these and maps them to exit codes per
|
|
4
|
+
[plan.md section 16.1](../../plan.md). Code 3 (ERROR) is reserved for
|
|
5
|
+
infrastructure-class failures raised by the CLI layer *before* a verdict
|
|
6
|
+
exists — bad spec, missing API key, network unreachable, etc.
|
|
7
|
+
|
|
8
|
+
Verdict-derived exit codes (0, 1, 2, 4) come from
|
|
9
|
+
``falsifyai.cli.render.exit_code_for`` and never raise.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CLIError(Exception):
    """Root of the CLI-layer exception hierarchy.

    Every instance records the process exit code it should map to in
    ``exit_code``; the default is 3.
    """

    def __init__(self, message: str, *, exit_code: int = 3) -> None:
        """Store ``message`` as the exception text and remember ``exit_code``."""
        super().__init__(message)
        self.exit_code = exit_code
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SpecError(CLIError):
    """Raised when the spec file cannot be loaded or parsed."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ConfigError(CLIError):
    """Raised when a required configuration, dependency, or credential is missing."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class InfrastructureError(CLIError):
    """Raised for network, model-call, or store failures during a run."""
|
falsifyai/cli/main.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""``falsifyai`` CLI entry point.
|
|
2
|
+
|
|
3
|
+
Argparse-based dispatch. Three subcommands: ``run`` (execute a spec),
|
|
4
|
+
``replay`` (re-render a stored session), and ``diff`` (compare two
|
|
5
|
+
stored sessions and exit 5 on regression).
|
|
6
|
+
|
|
7
|
+
Exit codes (per [plan.md section 16.1](../../plan.md)):
|
|
8
|
+
|
|
9
|
+
- 0 SUCCESS — session verdict STABLE
|
|
10
|
+
- 1 DEGRADED — session verdict FRAGILE
|
|
11
|
+
- 2 FAILURE — session verdict CONSISTENTLY_WRONG / INVALID_EVAL
|
|
12
|
+
- 3 ERROR — infrastructure failure (bad spec, missing credential, model call)
|
|
13
|
+
- 4 INSUFFICIENT — not enough evidence to discriminate
|
|
14
|
+
|
|
15
|
+
Code 5 (REGRESSION) is returned by ``diff`` when any case regressed; code 6 (LOW_FALSIFIABILITY) ships with Week 2 features.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import sys
|
|
20
|
+
from collections.abc import Sequence
|
|
21
|
+
|
|
22
|
+
from falsifyai.cli import diff as diff_cmd
|
|
23
|
+
from falsifyai.cli import replay as replay_cmd
|
|
24
|
+
from falsifyai.cli import run as run_cmd
|
|
25
|
+
from falsifyai.cli.errors import CLIError
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``falsifyai`` argument parser.

    The chosen subcommand is stored in ``args.command`` (``None`` when no
    subcommand is given). Subcommands and their arguments:

    - ``run``: ``spec_path``, ``--store-path``
    - ``replay``: optional ``session_id``, ``--latest``, ``--store-path``
    - ``diff``: ``baseline_session_id``, ``candidate_session_id``,
      ``--store-path``
    """
    default_store = ".falsifyai/replays.db"

    root = argparse.ArgumentParser(
        prog="falsifyai",
        description="Falsification-first reliability testing for AI systems.",
    )
    commands = root.add_subparsers(dest="command", metavar="<command>")

    # --- run ---------------------------------------------------------------
    run_sub = commands.add_parser("run", help="Run a falsification eval against a spec.")
    run_sub.add_argument("spec_path", help="Path to the YAML spec file.")
    run_sub.add_argument(
        "--store-path",
        default=default_store,
        help=(
            "ReplayStore path. Use ':memory:' for an ephemeral run. "
            "Default: .falsifyai/replays.db"
        ),
    )

    # --- replay ------------------------------------------------------------
    replay_sub = commands.add_parser(
        "replay",
        help="Load and re-render a previously stored session.",
    )
    replay_sub.add_argument(
        "session_id",
        nargs="?",
        default=None,
        help="Session id to load. Omit if using --latest.",
    )
    replay_sub.add_argument(
        "--latest",
        action="store_true",
        help="Load the most recent session in the store. Mutually exclusive with session_id.",
    )
    replay_sub.add_argument(
        "--store-path",
        default=default_store,
        help="ReplayStore path. Default: .falsifyai/replays.db",
    )

    # --- diff --------------------------------------------------------------
    diff_sub = commands.add_parser(
        "diff",
        help="Compare two stored sessions case-by-case. Exit 5 if any case regressed.",
    )
    diff_sub.add_argument("baseline_session_id", help="Baseline session id.")
    diff_sub.add_argument("candidate_session_id", help="Candidate session id.")
    diff_sub.add_argument(
        "--store-path",
        default=default_store,
        help=(
            "ReplayStore path (both artifacts assumed in same store). "
            "Default: .falsifyai/replays.db"
        ),
    )

    return root
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the selected subcommand.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the process
            arguments.

    Returns:
        0 after printing help when no subcommand is given; otherwise the
        subcommand's own exit code, or the ``exit_code`` carried by a
        ``CLIError`` raised while the subcommand ran (the error text is
        printed to stderr first).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        return 0

    handlers = {
        "run": run_cmd.cmd_run,
        "replay": replay_cmd.cmd_replay,
        "diff": diff_cmd.cmd_diff,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # Unknown subcommand (argparse would normally have caught this).
        parser.error(f"unknown command: {args.command}")
        return 2  # pragma: no cover - parser.error raises SystemExit

    try:
        return handler(args)
    except CLIError as exc:
        print(f"falsifyai: error: {exc}", file=sys.stderr)
        return exc.exit_code
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__": # pragma: no cover
|
|
105
|
+
sys.exit(main())
|
falsifyai/cli/render.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Plain-text terminal output for ``falsifyai run``, ``replay``, and ``diff``.
|
|
2
|
+
|
|
3
|
+
MVP scope: one row per case + a summary footer + the session id and store
|
|
4
|
+
path so the user can find their saved artifact. No colors, no boxes, no
|
|
5
|
+
JSON. Rich/colored output and ``--json`` land in Week 3 per
|
|
6
|
+
[plan.md section 22.1](../../plan.md).
|
|
7
|
+
|
|
8
|
+
The ``loaded_from`` parameter on ``render_session`` is what distinguishes
|
|
9
|
+
the replay path: when set, an extra header line indicates the user is
|
|
10
|
+
looking at a stored session rather than a fresh run. The detection of
|
|
11
|
+
legacy artifacts (pre-PR-11, no CI evidence) lives in this module too --
|
|
12
|
+
the artifact shape, not the consumer, determines what's renderable.
|
|
13
|
+
|
|
14
|
+
``render_diff`` is the diff CLI's render path (PR #14). It consumes a
|
|
15
|
+
``DiffReport`` (consumer-side dataclass from cli/diff.py) and prints a
|
|
16
|
+
compressed transition table: only rows where something changed are shown.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
from datetime import datetime
|
|
21
|
+
from typing import TYPE_CHECKING, TextIO
|
|
22
|
+
|
|
23
|
+
from falsifyai.replay.models import CaseResult, ReplayArtifact
|
|
24
|
+
from falsifyai.verdict.models import Verdict
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
# Type-only import to avoid a circular import at runtime: cli/diff.py
|
|
28
|
+
# imports render. The DiffReport dataclass lives in diff.py because it
|
|
29
|
+
# is a consumer-side structure, not part of the persisted artifact schema.
|
|
30
|
+
from falsifyai.cli.diff import CaseTransition, DiffReport
|
|
31
|
+
|
|
32
|
+
# Session verdict -> CI exit code (plan.md section 16.1):
#
#   0  SUCCESS       STABLE
#   1  DEGRADED      FRAGILE
#   2  FAILURE       CONSISTENTLY_WRONG, INVALID_EVAL
#   4  INSUFFICIENT  INSUFFICIENT
#
# Codes 3 (ERROR), 5 (REGRESSION) and 6 (LOW_FALSIFIABILITY) are deliberately
# absent from this table. Code 3 is reserved for infrastructure failures that
# the CLI layer raises before a verdict exists.
_EXIT_CODES: dict[Verdict, int] = {
    Verdict.STABLE: 0,
    Verdict.FRAGILE: 1,
    Verdict.CONSISTENTLY_WRONG: 2,
    Verdict.INVALID_EVAL: 2,
    Verdict.INSUFFICIENT: 4,
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def exit_code_for(verdict: Verdict) -> int:
    """Look up the CI exit code for a session-level ``verdict``.

    The value comes straight from ``_EXIT_CODES``; a verdict missing from that
    table raises ``KeyError``.
    """
    return _EXIT_CODES[verdict]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _is_legacy_case(case: CaseResult) -> bool:
    """Heuristically detect a case stored before CI evidence existed (pre-PR-11).

    A case is flagged as legacy when its ``verdict_confidence`` is positive
    while ``stability``, ``stability_ci_high`` and ``stability_ci_low`` are
    all exactly zero. The positive-confidence requirement keeps an all-zero
    INSUFFICIENT case from being flagged.
    """
    if not case.verdict_confidence > 0.0:
        return False
    ci_evidence_absent = (
        case.stability == 0.0
        and case.stability_ci_high == 0.0
        and case.stability_ci_low == 0.0
    )
    return ci_evidence_absent
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def render_session(
    artifact: ReplayArtifact,
    *,
    store_path: str,
    stream: TextIO | None = None,
    loaded_from: datetime | None = None,
) -> None:
    """Write a plain-text report of ``artifact``: one line per case plus a footer.

    Args:
        artifact: The session to render.
        store_path: Store location, echoed in the output.
        stream: Destination; ``sys.stdout`` when ``None``.
        loaded_from: When given (replay path), a leading "Loaded session"
            line is written containing this timestamp in ISO format.

    Each case line shows the id, the upper-cased verdict and the confidence.
    It ends with ``(CI: lo-hi)``, or with ``(legacy)`` when
    ``_is_legacy_case`` flags the case. FRAGILE cases with a non-empty
    ``worst_case_family`` also get a ``worst:`` suffix.
    """
    sink = sys.stdout if stream is None else stream

    if loaded_from is not None:
        sink.write(
            f"Loaded session {artifact.session_id} · "
            f"created_at {loaded_from.isoformat()} from {store_path}\n"
        )

    for case in artifact.case_results:
        legacy = _is_legacy_case(case)
        # Both row variants share everything up to and including the confidence.
        row = (
            f"case: {case.case_id} verdict: {case.verdict.value.upper()} "
            f"confidence: {case.verdict_confidence:.2f} "
        )
        if legacy:
            row += "(legacy)"
        else:
            row += f"(CI: {case.stability_ci_low:.2f}-{case.stability_ci_high:.2f})"
        if case.verdict is Verdict.FRAGILE and case.worst_case_family:
            row += f" worst: {case.worst_case_family}"
        sink.write(row + "\n")

    sink.write("=" * 65 + "\n")
    sink.write(f"Session {artifact.session_id} -> {store_path}\n")

    summary = artifact.session_verdict
    plural = "s" if summary.case_count != 1 else ""
    sink.write(
        f"{summary.case_count} case{plural}, "
        f"verdict {summary.session_verdict.value.upper()}, "
        f"{summary.fragile_count} FRAGILE, "
        f"{summary.consistently_wrong_count} CONSISTENTLY_WRONG, "
        f"falsifiability {summary.falsifyai_falsifiability_score:.2f}\n"
    )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ---------------------------------------------------------------------------
|
|
126
|
+
# Diff rendering (PR #14)
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _format_verdict_with_stability(verdict: Verdict | None, ci_low: float) -> str:
    """Render one side of a diff row.

    Returns ``-`` when ``verdict`` is ``None`` (case absent on that side),
    otherwise the upper-cased verdict followed by ``ci_low`` to two decimals
    in parentheses, e.g. ``STABLE (0.92)``.
    """
    return "-" if verdict is None else f"{verdict.value.upper()} ({ci_low:.2f})"
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _format_transition_row(t: "CaseTransition") -> str:
    """Build the text of one diff-table row (no trailing newline).

    Layout: ``case: <id> baseline: <V> (n.nn) candidate: <V> (n.nn) <KIND>``,
    where a side on which the case is absent is shown as ``-``.
    """
    baseline_cell = _format_verdict_with_stability(
        t.baseline_verdict, t.baseline_stability_ci_low
    )
    candidate_cell = _format_verdict_with_stability(
        t.candidate_verdict, t.candidate_stability_ci_low
    )
    cells = [
        f"case: {t.case_id}",
        f"baseline: {baseline_cell}",
        f"candidate: {candidate_cell}",
        t.transition_kind.value.upper(),
    ]
    return " ".join(cells)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def render_diff(
    report: "DiffReport",
    *,
    store_path: str,
    stream: TextIO | None = None,
) -> None:
    """Write a compressed transition table for ``report``.

    Args:
        report: The diff to render.
        store_path: Store location, echoed in the header.
        stream: Destination; ``sys.stdout`` when ``None``.

    Only transitions whose kind is not UNCHANGED get a row. The footer always
    lists every counter, so unchanged cases are reported by count only. A
    note is added to the header when the artifacts' materialized hashes differ.
    """
    # Imported here, not at module scope: cli/diff.py imports this module, so
    # a top-level import would be circular.
    from falsifyai.cli.diff import TransitionKind

    sink = sys.stdout if stream is None else stream
    rule = "=" * 65 + "\n"

    sink.write(
        f"Diff: baseline {report.baseline_session_id} -> candidate {report.candidate_session_id}\n"
    )
    sink.write(f"Store: {store_path}\n")
    if report.materialized_hash_mismatch:
        sink.write(
            "note: materialized_hash differs between baseline and candidate; "
            "comparisons may not be apples-to-apples.\n"
        )
    sink.write(rule)

    changed = [
        row for row in report.transitions if row.transition_kind is not TransitionKind.UNCHANGED
    ]
    if changed:
        for row in changed:
            sink.write(_format_transition_row(row) + "\n")
    else:
        sink.write("(no transitions; all cases unchanged)\n")

    sink.write(rule)
    footer = ", ".join(
        [
            f"{report.regressed_count} regressed",
            f"{report.improved_count} improved",
            f"{report.unchanged_count} unchanged",
            f"{report.other_change_count} other",
            f"{report.added_count} added",
            f"{report.removed_count} removed",
        ]
    )
    sink.write(footer + "\n")
|
falsifyai/cli/replay.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""``falsifyai replay <session_id>`` -- read-only consumer surface.
|
|
2
|
+
|
|
3
|
+
Loads a stored ``ReplayArtifact`` from the configured ``ReplayStore`` and
|
|
4
|
+
re-renders it via the existing ``cli/render.render_session``. No model
|
|
5
|
+
calls, no resolver invocation, no perturbations -- pure inspection of a
|
|
6
|
+
past run.
|
|
7
|
+
|
|
8
|
+
Invariants:
|
|
9
|
+
|
|
10
|
+
- ``cmd_replay`` is strictly read-only. It never modifies the stored
|
|
11
|
+
artifact. The verdict displayed is the verdict the resolver assigned at
|
|
12
|
+
``run`` time -- not a re-resolution under the current resolver.
|
|
13
|
+
- Exit codes mirror ``run``: the verdict-derived exit code lets CI use
|
|
14
|
+
``falsifyai replay <known-good-id>`` as a regression gate.
|
|
15
|
+
- ``--store-path :memory:`` is supported for symmetry with ``run``, even
|
|
16
|
+
though ``InMemoryStore`` is empty on every fresh process. Failure mode
|
|
17
|
+
is loud: ``SessionNotFoundError`` -> exit 3.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
|
|
22
|
+
from falsifyai.cli import render
|
|
23
|
+
from falsifyai.cli.errors import InfrastructureError
|
|
24
|
+
from falsifyai.replay.in_memory_store import InMemoryStore
|
|
25
|
+
from falsifyai.replay.protocol import ReplayStore, SessionNotFoundError
|
|
26
|
+
from falsifyai.replay.sqlite_store import SQLiteStore
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _build_store(store_path: str) -> ReplayStore:
    """Return the store for ``store_path``.

    The literal ``":memory:"`` selects an ``InMemoryStore``; any other value
    is handed to ``SQLiteStore``. Same selection rule as cli/run.py.
    """
    return InMemoryStore() if store_path == ":memory:" else SQLiteStore(store_path)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _resolve_target_session_id(store: ReplayStore, args: argparse.Namespace) -> str:
    """Decide which session id ``replay`` should load.

    Without ``--latest`` the positional ``args.session_id`` is returned. With
    ``--latest`` the id of the first session yielded by
    ``store.query_sessions(limit=1)`` is returned.

    Raises:
        InfrastructureError: if neither a session id nor ``--latest`` was
            given, if both were given, or if ``--latest`` was requested on a
            store that yields no sessions.
    """
    if not args.latest:
        if args.session_id is None:
            raise InfrastructureError("session_id is required (or pass --latest)")
        return args.session_id

    # Defensive guard: refuse a positional id combined with --latest.
    if args.session_id is not None:
        raise InfrastructureError("--latest is mutually exclusive with a positional session_id")

    newest = next(iter(store.query_sessions(limit=1)), None)
    if newest is None:
        raise InfrastructureError("no sessions in store; cannot resolve --latest")
    return newest.session_id
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def cmd_replay(args: argparse.Namespace) -> int:
    """Run the ``replay`` subcommand and return its exit code.

    Resolves the target session id, loads the artifact, closes the store, and
    then re-renders the artifact via ``render.render_session`` with
    ``loaded_from`` set to the artifact's ``created_at``.

    Returns:
        The exit code ``render.exit_code_for`` assigns to the stored session
        verdict.

    Raises:
        InfrastructureError: if the target id cannot be resolved or the
            session is missing from the store.
    """
    store = _build_store(args.store_path)
    try:
        target_id = _resolve_target_session_id(store, args)
        try:
            artifact = store.load_session(target_id)
        except SessionNotFoundError as exc:
            raise InfrastructureError(f"session not found: {target_id}") from exc
    finally:
        # InMemoryStore has no close(); SQLiteStore does.
        closer = getattr(store, "close", None)
        if callable(closer):
            closer()

    render.render_session(
        artifact,
        store_path=args.store_path,
        loaded_from=artifact.created_at,
    )
    stored_verdict = artifact.session_verdict.session_verdict
    return render.exit_code_for(stored_verdict)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
__all__ = ["cmd_replay"]
|