falsifyai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. falsifyai/__init__.py +3 -0
  2. falsifyai/cli/__init__.py +8 -0
  3. falsifyai/cli/diff.py +237 -0
  4. falsifyai/cli/errors.py +30 -0
  5. falsifyai/cli/main.py +105 -0
  6. falsifyai/cli/render.py +196 -0
  7. falsifyai/cli/replay.py +76 -0
  8. falsifyai/cli/run.py +191 -0
  9. falsifyai/differential/__init__.py +0 -0
  10. falsifyai/execution/__init__.py +19 -0
  11. falsifyai/execution/adapter.py +16 -0
  12. falsifyai/execution/cache.py +38 -0
  13. falsifyai/execution/engine.py +35 -0
  14. falsifyai/execution/errors.py +10 -0
  15. falsifyai/execution/litellm_adapter.py +57 -0
  16. falsifyai/execution/models.py +52 -0
  17. falsifyai/falsifiability/__init__.py +9 -0
  18. falsifyai/falsifiability/score.py +49 -0
  19. falsifyai/invariants/__init__.py +35 -0
  20. falsifyai/invariants/base.py +99 -0
  21. falsifyai/invariants/contains.py +65 -0
  22. falsifyai/invariants/registry.py +35 -0
  23. falsifyai/invariants/semantic.py +110 -0
  24. falsifyai/oracles/__init__.py +0 -0
  25. falsifyai/perturbation/__init__.py +25 -0
  26. falsifyai/perturbation/base.py +91 -0
  27. falsifyai/perturbation/casing_variant.py +79 -0
  28. falsifyai/perturbation/registry.py +28 -0
  29. falsifyai/perturbation/typo_noise.py +158 -0
  30. falsifyai/replay/__init__.py +38 -0
  31. falsifyai/replay/in_memory_store.py +78 -0
  32. falsifyai/replay/models.py +114 -0
  33. falsifyai/replay/protocol.py +52 -0
  34. falsifyai/replay/serialize.py +218 -0
  35. falsifyai/replay/sqlite_store.py +191 -0
  36. falsifyai/reporting/__init__.py +0 -0
  37. falsifyai/session/__init__.py +0 -0
  38. falsifyai/spec/__init__.py +19 -0
  39. falsifyai/spec/errors.py +27 -0
  40. falsifyai/spec/loader.py +40 -0
  41. falsifyai/spec/materializer.py +172 -0
  42. falsifyai/spec/models.py +135 -0
  43. falsifyai/statistical/__init__.py +0 -0
  44. falsifyai/verdict/__init__.py +8 -0
  45. falsifyai/verdict/consistency.py +62 -0
  46. falsifyai/verdict/models.py +36 -0
  47. falsifyai/verdict/resolver.py +174 -0
  48. falsifyai/verdict/stratify.py +93 -0
  49. falsifyai-0.1.0.dist-info/METADATA +398 -0
  50. falsifyai-0.1.0.dist-info/RECORD +53 -0
  51. falsifyai-0.1.0.dist-info/WHEEL +4 -0
  52. falsifyai-0.1.0.dist-info/entry_points.txt +2 -0
  53. falsifyai-0.1.0.dist-info/licenses/LICENSE +201 -0
falsifyai/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """FalsifyAI — falsification-first reliability testing for AI systems."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,8 @@
1
+ """``falsifyai`` command-line interface.
2
+
3
+ The console script entry point ``falsifyai = "falsifyai.cli.main:main"`` is
4
+ wired in ``pyproject.toml``. Import the submodule directly
5
+ (``from falsifyai.cli.main import main``) — no re-export here, because doing
6
+ so would shadow the ``falsifyai.cli.main`` submodule with its ``main``
7
+ function and break ``import falsifyai.cli.main``.
8
+ """
falsifyai/cli/diff.py ADDED
@@ -0,0 +1,237 @@
1
+ """``falsifyai diff <baseline> <candidate>`` -- the launch wedge.
2
+
3
+ Loads two stored ``ReplayArtifact``s, compares them case-by-case, surfaces
4
+ verdict transitions in a compressed table, and exits **code 5 (REGRESSION)**
5
+ if any case regressed.
6
+
7
+ This is the differentiator per [plan.md §22.1](../../plan.md). Competitors
8
+ match an engine that runs perturbations; they do not flag model-migration
9
+ regressions with a single command.
10
+
11
+ **Invariants:**
12
+
13
+ - ``cmd_diff`` is strictly read-only. Never modifies either artifact.
14
+ - The diff does NOT re-resolve verdicts under the current resolver. The
15
+ verdicts compared are the ones assigned at each ``run`` time. Diff is a
16
+ consumer of already-resolved artifacts; the resolver stays untouched.
17
+ - Regression criterion is **verdict-class downgrade only**: STABLE -> FRAGILE,
18
+ STABLE -> CONSISTENTLY_WRONG, FRAGILE -> CONSISTENTLY_WRONG. No thresholds,
19
+ no per-stability deltas as regression signals. Predictable by design.
20
+ - Cases present in only one side are surfaced as ADDED / REMOVED but do NOT
21
+ trigger exit 5. Specs evolve legitimately.
22
+ """
23
+
24
+ import argparse
25
+ from dataclasses import dataclass
26
+ from enum import Enum
27
+
28
+ from falsifyai.cli import render
29
+ from falsifyai.cli.errors import InfrastructureError
30
+ from falsifyai.replay.in_memory_store import InMemoryStore
31
+ from falsifyai.replay.models import ReplayArtifact
32
+ from falsifyai.replay.protocol import ReplayStore, SessionNotFoundError
33
+ from falsifyai.replay.sqlite_store import SQLiteStore
34
+ from falsifyai.verdict.models import Verdict
35
+
36
+
37
class TransitionKind(Enum):
    """Classification of one case's verdict movement from baseline to candidate."""

    # Case present on both sides.
    UNCHANGED = "unchanged"
    IMPROVED = "improved"
    REGRESSED = "regressed"
    OTHER_CHANGE = "other_change"  # verdict differs; informational, not a regression

    # Case present on one side only.
    ADDED = "added"  # candidate only
    REMOVED = "removed"  # baseline only
46
+
47
+
48
@dataclass(frozen=True)
class CaseTransition:
    """A single diff row: the verdict movement of the case identified by ``case_id``.

    Immutable. When the case exists on only one side, the verdict of the
    missing side is ``None``.
    """

    case_id: str
    # Verdict recorded in the baseline artifact; None when the case is ADDED.
    baseline_verdict: Verdict | None
    # Verdict recorded in the candidate artifact; None when the case is REMOVED.
    candidate_verdict: Verdict | None
    # Lower bound of the stability confidence interval on each side.
    baseline_stability_ci_low: float
    candidate_stability_ci_low: float
    transition_kind: TransitionKind
58
+
59
+
60
@dataclass(frozen=True)
class DiffReport:
    """Result of comparing a baseline ``ReplayArtifact`` against a candidate one.

    Holds every per-case transition plus one counter per ``TransitionKind``.
    """

    baseline_session_id: str
    candidate_session_id: str
    # True when the two artifacts carry different ``materialized_hash`` values.
    materialized_hash_mismatch: bool
    # One entry per case_id seen on either side.
    transitions: list[CaseTransition]
    # Number of transitions of each kind.
    regressed_count: int
    improved_count: int
    unchanged_count: int
    other_change_count: int
    added_count: int
    removed_count: int
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Transition classification
78
+ # ---------------------------------------------------------------------------
79
+
80
+
81
# Keyed by BASELINE verdict; the value is the set of candidate verdicts that
# make the pair a regression. Only these verdict-class downgrades are REGRESSED.
_REGRESSION_DOWNGRADES: dict[Verdict, frozenset[Verdict]] = {
    Verdict.STABLE: frozenset((Verdict.FRAGILE, Verdict.CONSISTENTLY_WRONG)),
    Verdict.FRAGILE: frozenset((Verdict.CONSISTENTLY_WRONG,)),
}

# Keyed by CANDIDATE verdict; the value is the set of baseline verdicts it
# improves upon. STABLE is the "good" pole and the only improvement target.
_IMPROVEMENT_TARGETS: dict[Verdict, frozenset[Verdict]] = {
    Verdict.STABLE: frozenset(
        (Verdict.FRAGILE, Verdict.CONSISTENTLY_WRONG, Verdict.INSUFFICIENT)
    ),
}
93
+
94
+
95
def _classify_transition(baseline: Verdict, candidate: Verdict) -> TransitionKind:
    """Map a (baseline, candidate) verdict pair to its ``TransitionKind``.

    Checks run in this order:

    1. Same verdict on both sides -> UNCHANGED.
    2. Pair listed in ``_REGRESSION_DOWNGRADES`` -> REGRESSED.
    3. Pair listed in ``_IMPROVEMENT_TARGETS`` -> IMPROVED.
    4. Everything else -> OTHER_CHANGE. This covers, for example,
       STABLE -> INSUFFICIENT and any change involving INVALID_EVAL. It is
       informational and not a regression.
    """
    if baseline is candidate:
        return TransitionKind.UNCHANGED

    regressions_from_baseline = _REGRESSION_DOWNGRADES.get(baseline, frozenset())
    if candidate in regressions_from_baseline:
        return TransitionKind.REGRESSED

    # _IMPROVEMENT_TARGETS is keyed by the candidate verdict.
    baselines_improved_upon = _IMPROVEMENT_TARGETS.get(candidate, frozenset())
    if baseline in baselines_improved_upon:
        return TransitionKind.IMPROVED

    return TransitionKind.OTHER_CHANGE
111
+
112
+
113
def compute_diff(baseline: ReplayArtifact, candidate: ReplayArtifact) -> DiffReport:
    """Build a ``DiffReport`` by pairing the two artifacts' cases on ``case_id``.

    Neither artifact is modified. Transitions are emitted in sorted
    ``case_id`` order. A case found only in the candidate is ADDED and one
    found only in the baseline is REMOVED. Cases on both sides are classified
    by ``_classify_transition``.
    """
    before_by_id = {case.case_id: case for case in baseline.case_results}
    after_by_id = {case.case_id: case for case in candidate.case_results}

    tally = dict.fromkeys(TransitionKind, 0)
    rows: list[CaseTransition] = []

    for case_id in sorted(before_by_id.keys() | after_by_id.keys()):
        before = before_by_id.get(case_id)
        after = after_by_id.get(case_id)

        if before is None:
            kind = TransitionKind.ADDED
        elif after is None:
            kind = TransitionKind.REMOVED
        else:
            kind = _classify_transition(before.verdict, after.verdict)

        rows.append(
            CaseTransition(
                case_id=case_id,
                baseline_verdict=None if before is None else before.verdict,
                candidate_verdict=None if after is None else after.verdict,
                # The absent side has no stability value; 0.0 is a placeholder.
                baseline_stability_ci_low=0.0 if before is None else before.stability_ci_low,
                candidate_stability_ci_low=0.0 if after is None else after.stability_ci_low,
                transition_kind=kind,
            )
        )
        tally[kind] += 1

    return DiffReport(
        baseline_session_id=baseline.session_id,
        candidate_session_id=candidate.session_id,
        materialized_hash_mismatch=baseline.materialized_hash != candidate.materialized_hash,
        transitions=rows,
        regressed_count=tally[TransitionKind.REGRESSED],
        improved_count=tally[TransitionKind.IMPROVED],
        unchanged_count=tally[TransitionKind.UNCHANGED],
        other_change_count=tally[TransitionKind.OTHER_CHANGE],
        added_count=tally[TransitionKind.ADDED],
        removed_count=tally[TransitionKind.REMOVED],
    )
188
+
189
+
190
+ # ---------------------------------------------------------------------------
191
+ # CLI orchestration
192
+ # ---------------------------------------------------------------------------
193
+
194
+
195
def _build_store(store_path: str) -> ReplayStore:
    """Pick the store backend: ``:memory:`` -> ``InMemoryStore``, else ``SQLiteStore``.

    Same selection rule as the ``_build_store`` helper in cli/replay.py.
    """
    return InMemoryStore() if store_path == ":memory:" else SQLiteStore(store_path)
200
+
201
+
202
+ def _load_artifact(store: ReplayStore, session_id: str, *, role: str) -> ReplayArtifact:
203
+ """Load an artifact, converting SessionNotFoundError to a user-facing CLIError."""
204
+ try:
205
+ return store.load_session(session_id)
206
+ except SessionNotFoundError as exc:
207
+ raise InfrastructureError(f"{role} session not found: {session_id}") from exc
208
+
209
+
210
+ def _diff_exit_code(report: DiffReport) -> int:
211
+ """Exit code 5 (REGRESSION) if any case regressed, else 0."""
212
+ return 5 if report.regressed_count > 0 else 0
213
+
214
+
215
def cmd_diff(args: argparse.Namespace) -> int:
    """Run the ``diff`` subcommand and return its exit code.

    Reads ``store_path``, ``baseline_session_id`` and ``candidate_session_id``
    from ``args``. Both artifacts are loaded from the same store, compared,
    and rendered. The exit code comes from ``_diff_exit_code``.
    """
    store_path = args.store_path
    store = _build_store(store_path)
    try:
        baseline = _load_artifact(store, args.baseline_session_id, role="baseline")
        candidate = _load_artifact(store, args.candidate_session_id, role="candidate")
    finally:
        # Not every store exposes close(); call it only when it is present.
        closer = getattr(store, "close", None)
        if callable(closer):
            closer()

    report = compute_diff(baseline, candidate)
    render.render_diff(report, store_path=store_path)
    return _diff_exit_code(report)
229
+
230
+
231
# Public surface of this module; underscore-prefixed helpers stay private.
__all__ = [
    "CaseTransition",
    "DiffReport",
    "TransitionKind",
    "cmd_diff",
    "compute_diff",
]
@@ -0,0 +1,30 @@
1
+ """CLI-layer exception hierarchy.
2
+
3
+ The CLI catches these and maps them to exit codes per
4
+ [plan.md section 16.1](../../plan.md). Code 3 (ERROR) is reserved for
5
+ infrastructure-class failures raised by the CLI layer *before* a verdict
6
+ exists — bad spec, missing API key, network unreachable, etc.
7
+
8
+ Verdict-derived exit codes (0, 1, 2, 4) come from
9
+ ``falsifyai.cli.render.exit_code_for`` and never raise.
10
+ """
11
+
12
+
13
class CLIError(Exception):
    """Root of the CLI-layer exception hierarchy.

    Each instance carries the process exit code to use for the failure in
    ``exit_code``. The default is 3 (ERROR).
    """

    def __init__(self, message: str, *, exit_code: int = 3) -> None:
        self.exit_code = exit_code
        super().__init__(message)
19
+
20
+
21
class SpecError(CLIError):
    """Raised when the spec file cannot be loaded or parsed."""
23
+
24
+
25
class ConfigError(CLIError):
    """Raised when a required configuration, dependency, or credential is missing."""
27
+
28
+
29
class InfrastructureError(CLIError):
    """Raised for a network, model-call, or store failure during a run."""
falsifyai/cli/main.py ADDED
@@ -0,0 +1,105 @@
1
+ """``falsifyai`` CLI entry point.
2
+
3
+ Argparse-based dispatch. Three subcommands: ``run`` (execute a spec),
4
+ ``replay`` (re-render a stored session), and ``diff`` (compare two
5
+ stored sessions and exit 5 on regression).
6
+
7
+ Exit codes (per [plan.md section 16.1](../../plan.md)):
8
+
9
+ - 0 SUCCESS — session verdict STABLE
10
+ - 1 DEGRADED — session verdict FRAGILE
11
+ - 2 FAILURE — session verdict CONSISTENTLY_WRONG / INVALID_EVAL
12
+ - 3 ERROR — infrastructure failure (bad spec, missing credential, model call)
13
+ - 4 INSUFFICIENT — not enough evidence to discriminate
14
+
15
+ Code 5 (REGRESSION) is returned by ``diff`` when any case regressed; code 6 (LOW_FALSIFIABILITY) ships with Week 2 features.
16
+ """
17
+
18
+ import argparse
19
+ import sys
20
+ from collections.abc import Sequence
21
+
22
+ from falsifyai.cli import diff as diff_cmd
23
+ from falsifyai.cli import replay as replay_cmd
24
+ from falsifyai.cli import run as run_cmd
25
+ from falsifyai.cli.errors import CLIError
26
+
27
+
28
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``falsifyai`` argument parser with its three subcommands.

    The chosen subcommand is stored in ``args.command``. It is ``None`` when
    no subcommand is given.

    - ``run <spec_path> [--store-path PATH]``
    - ``replay [session_id] [--latest] [--store-path PATH]``
    - ``diff <baseline_session_id> <candidate_session_id> [--store-path PATH]``
    """
    default_store = ".falsifyai/replays.db"

    root = argparse.ArgumentParser(
        prog="falsifyai",
        description="Falsification-first reliability testing for AI systems.",
    )
    commands = root.add_subparsers(dest="command", metavar="<command>")

    # --- run ---------------------------------------------------------------
    run = commands.add_parser("run", help="Run a falsification eval against a spec.")
    run.add_argument("spec_path", help="Path to the YAML spec file.")
    run.add_argument(
        "--store-path",
        default=default_store,
        help=(
            "ReplayStore path. Use ':memory:' for an ephemeral run. "
            "Default: .falsifyai/replays.db"
        ),
    )

    # --- replay ------------------------------------------------------------
    replay = commands.add_parser(
        "replay",
        help="Load and re-render a previously stored session.",
    )
    # session_id is optional at parse time because --latest can replace it.
    replay.add_argument(
        "session_id",
        nargs="?",
        default=None,
        help="Session id to load. Omit if using --latest.",
    )
    replay.add_argument(
        "--latest",
        action="store_true",
        help="Load the most recent session in the store. Mutually exclusive with session_id.",
    )
    replay.add_argument(
        "--store-path",
        default=default_store,
        help="ReplayStore path. Default: .falsifyai/replays.db",
    )

    # --- diff --------------------------------------------------------------
    diff = commands.add_parser(
        "diff",
        help="Compare two stored sessions case-by-case. Exit 5 if any case regressed.",
    )
    diff.add_argument("baseline_session_id", help="Baseline session id.")
    diff.add_argument("candidate_session_id", help="Candidate session id.")
    diff.add_argument(
        "--store-path",
        default=default_store,
        help=(
            "ReplayStore path (both artifacts assumed in same store). "
            "Default: .falsifyai/replays.db"
        ),
    )

    return root
78
+
79
+
80
def main(argv: Sequence[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the selected subcommand.

    Returns the subcommand's exit code. With no subcommand, prints the help
    text and returns 0. A ``CLIError`` raised by a subcommand is reported on
    stderr and turned into its ``exit_code``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    command = args.command

    if command is None:
        parser.print_help()
        return 0

    handlers = {
        "run": run_cmd.cmd_run,
        "replay": replay_cmd.cmd_replay,
        "diff": diff_cmd.cmd_diff,
    }
    handler = handlers.get(command)
    if handler is None:
        # Unknown subcommand (argparse would normally have caught this).
        parser.error(f"unknown command: {command}")
        return 2  # pragma: no cover - parser.error raises SystemExit

    try:
        return handler(args)
    except CLIError as exc:
        print(f"falsifyai: error: {exc}", file=sys.stderr)
        return exc.exit_code
102
+
103
+
104
# Script entry point: exit the process with main()'s return value.
if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
@@ -0,0 +1,196 @@
1
+ """Plain-text terminal output for ``falsifyai run``, ``replay``, and ``diff``.
2
+
3
+ MVP scope: one row per case + a summary footer + the session id and store
4
+ path so the user can find their saved artifact. No colors, no boxes, no
5
+ JSON. Rich/colored output and ``--json`` land in Week 3 per
6
+ [plan.md section 22.1](../../plan.md).
7
+
8
+ The ``loaded_from`` parameter on ``render_session`` is what distinguishes
9
+ the replay path: when set, an extra header line indicates the user is
10
+ looking at a stored session rather than a fresh run. The detection of
11
+ legacy artifacts (pre-PR-11, no CI evidence) lives in this module too --
12
+ the artifact shape, not the consumer, determines what's renderable.
13
+
14
+ ``render_diff`` is the diff CLI's render path (PR #14). It consumes a
15
+ ``DiffReport`` (consumer-side dataclass from cli/diff.py) and prints a
16
+ compressed transition table: only rows where something changed are shown.
17
+ """
18
+
19
+ import sys
20
+ from datetime import datetime
21
+ from typing import TYPE_CHECKING, TextIO
22
+
23
+ from falsifyai.replay.models import CaseResult, ReplayArtifact
24
+ from falsifyai.verdict.models import Verdict
25
+
26
+ if TYPE_CHECKING:
27
+ # Type-only import to avoid a circular import at runtime: cli/diff.py
28
+ # imports render. The DiffReport dataclass lives in diff.py because it
29
+ # is a consumer-side structure, not part of the persisted artifact schema.
30
+ from falsifyai.cli.diff import CaseTransition, DiffReport
31
+
32
# Session verdict -> CI exit code (plan.md section 16.1):
#
#   STABLE              0  SUCCESS
#   FRAGILE             1  DEGRADED
#   CONSISTENTLY_WRONG  2  FAILURE
#   INVALID_EVAL        2  FAILURE
#   INSUFFICIENT        4  INSUFFICIENT
#
# Codes that are NOT verdict-derived and so do not appear in this table:
# 3 (ERROR) is reserved for infrastructure failures raised by the CLI layer
# before a verdict exists, 5 (REGRESSION) is returned by the ``diff`` command,
# and 6 (LOW_FALSIFIABILITY) lands with the Week 2 features.
_EXIT_CODES: dict[Verdict, int] = {
    Verdict.STABLE: 0,
    Verdict.FRAGILE: 1,
    Verdict.CONSISTENTLY_WRONG: 2,
    Verdict.INVALID_EVAL: 2,
    Verdict.INSUFFICIENT: 4,
}
48
+
49
+
50
def exit_code_for(verdict: Verdict) -> int:
    """Translate a session-level verdict into its CI exit code.

    Raises:
        KeyError: if ``verdict`` has no entry in ``_EXIT_CODES``.
    """
    code = _EXIT_CODES[verdict]
    return code
53
+
54
+
55
+ def _is_legacy_case(case: CaseResult) -> bool:
56
+ """Pre-PR-11 artifact heuristic: nonzero verdict_confidence but no CI evidence.
57
+
58
+ The defaults from the dataclass extension (zero CI fields) trigger this
59
+ only when the case was constructed without PR #11's resolver -- i.e., it
60
+ was loaded from a pre-PR-11 replay store row. We require
61
+ ``verdict_confidence > 0`` so an INSUFFICIENT case (all zeros, legitimately)
62
+ doesn't get the legacy marker.
63
+ """
64
+ return (
65
+ case.verdict_confidence > 0.0
66
+ and case.stability == 0.0
67
+ and case.stability_ci_high == 0.0
68
+ and case.stability_ci_low == 0.0
69
+ )
70
+
71
+
72
def render_session(
    artifact: ReplayArtifact,
    *,
    store_path: str,
    stream: TextIO | None = None,
    loaded_from: datetime | None = None,
) -> None:
    """Write one line per case followed by a summary footer.

    Output goes to ``stream``, or to ``sys.stdout`` when ``stream`` is None.

    Row layout:
        case: <id> verdict: <V> confidence: <p> (CI: <lo>-<hi>) worst: <family>?

    - ``loaded_from`` (replay path): when given, a header line naming the
      session, its creation time, and the store is written first.
    - Legacy cases (see ``_is_legacy_case``) show ``(legacy)`` instead of a
      misleading ``(CI: 0.00-0.00)``.
    - ``worst: <family>`` is appended only for FRAGILE cases that have a
      non-empty ``worst_case_family``.
    """
    out = sys.stdout if stream is None else stream

    if loaded_from is not None:
        out.write(
            f"Loaded session {artifact.session_id} · "
            f"created_at {loaded_from.isoformat()} from {store_path}\n"
        )

    for case in artifact.case_results:
        if _is_legacy_case(case):
            evidence = "(legacy)"
        else:
            evidence = f"(CI: {case.stability_ci_low:.2f}-{case.stability_ci_high:.2f})"

        row = (
            f"case: {case.case_id} verdict: {case.verdict.value.upper()} "
            f"confidence: {case.verdict_confidence:.2f} {evidence}"
        )
        if case.verdict is Verdict.FRAGILE and case.worst_case_family:
            row += f" worst: {case.worst_case_family}"
        out.write(row + "\n")

    out.write("=" * 65 + "\n")
    out.write(f"Session {artifact.session_id} -> {store_path}\n")

    summary = artifact.session_verdict
    plural = "" if summary.case_count == 1 else "s"
    out.write(
        f"{summary.case_count} case{plural}, "
        f"verdict {summary.session_verdict.value.upper()}, "
        f"{summary.fragile_count} FRAGILE, "
        f"{summary.consistently_wrong_count} CONSISTENTLY_WRONG, "
        f"falsifiability {summary.falsifyai_falsifiability_score:.2f}\n"
    )
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # Diff rendering (PR #14)
127
+ # ---------------------------------------------------------------------------
128
+
129
+
130
+ def _format_verdict_with_stability(verdict: Verdict | None, ci_low: float) -> str:
131
+ """Format ``STABLE (0.92)`` or ``-`` if the case is absent on one side."""
132
+ if verdict is None:
133
+ return "-"
134
+ return f"{verdict.value.upper()} ({ci_low:.2f})"
135
+
136
+
137
def _format_transition_row(t: "CaseTransition") -> str:
    """Render a single row of the diff transition table.

    Layout: ``case: <id> baseline: <V> (n.nn) candidate: <V> (n.nn) <KIND>``.
    A side on which the case is absent is shown as ``-``.
    """
    baseline_side = _format_verdict_with_stability(
        t.baseline_verdict, t.baseline_stability_ci_low
    )
    candidate_side = _format_verdict_with_stability(
        t.candidate_verdict, t.candidate_stability_ci_low
    )
    columns = (
        f"case: {t.case_id}",
        f"baseline: {baseline_side}",
        f"candidate: {candidate_side}",
        t.transition_kind.value.upper(),
    )
    return " ".join(columns)
152
+
153
+
154
def render_diff(
    report: "DiffReport",
    *,
    store_path: str,
    stream: TextIO | None = None,
) -> None:
    """Write the compressed transition table for a ``DiffReport``.

    Output goes to ``stream``, or to ``sys.stdout`` when ``stream`` is None.
    Only transitions other than UNCHANGED get a row. The footer always lists
    the count of every transition kind, so unchanged cases are reported by
    number only. A note is written when the two sessions' materialized
    hashes differ.
    """
    # Imported here rather than at module level: cli/diff.py imports this
    # module, so a top-level import would be circular.
    from falsifyai.cli.diff import TransitionKind

    out = sys.stdout if stream is None else stream
    rule = "=" * 65 + "\n"

    out.write(
        f"Diff: baseline {report.baseline_session_id} -> candidate {report.candidate_session_id}\n"
    )
    out.write(f"Store: {store_path}\n")
    if report.materialized_hash_mismatch:
        out.write(
            "note: materialized_hash differs between baseline and candidate; "
            "comparisons may not be apples-to-apples.\n"
        )
    out.write(rule)

    changed_rows = [
        row for row in report.transitions if row.transition_kind is not TransitionKind.UNCHANGED
    ]
    if changed_rows:
        for row in changed_rows:
            out.write(_format_transition_row(row) + "\n")
    else:
        out.write("(no transitions; all cases unchanged)\n")

    out.write(rule)
    footer = ", ".join(
        (
            f"{report.regressed_count} regressed",
            f"{report.improved_count} improved",
            f"{report.unchanged_count} unchanged",
            f"{report.other_change_count} other",
            f"{report.added_count} added",
            f"{report.removed_count} removed",
        )
    )
    out.write(footer + "\n")
@@ -0,0 +1,76 @@
1
+ """``falsifyai replay <session_id>`` -- read-only consumer surface.
2
+
3
+ Loads a stored ``ReplayArtifact`` from the configured ``ReplayStore`` and
4
+ re-renders it via the existing ``cli/render.render_session``. No model
5
+ calls, no resolver invocation, no perturbations -- pure inspection of a
6
+ past run.
7
+
8
+ Invariants:
9
+
10
+ - ``cmd_replay`` is strictly read-only. It never modifies the stored
11
+ artifact. The verdict displayed is the verdict the resolver assigned at
12
+ ``run`` time -- not a re-resolution under the current resolver.
13
+ - Exit codes mirror ``run``: the verdict-derived exit code lets CI use
14
+ ``falsifyai replay <known-good-id>`` as a regression gate.
15
+ - ``--store-path :memory:`` is supported for symmetry with ``run``, even
16
+ though ``InMemoryStore`` is empty on every fresh process. Failure mode
17
+ is loud: ``SessionNotFoundError`` -> exit 3.
18
+ """
19
+
20
+ import argparse
21
+
22
+ from falsifyai.cli import render
23
+ from falsifyai.cli.errors import InfrastructureError
24
+ from falsifyai.replay.in_memory_store import InMemoryStore
25
+ from falsifyai.replay.protocol import ReplayStore, SessionNotFoundError
26
+ from falsifyai.replay.sqlite_store import SQLiteStore
27
+
28
+
29
def _build_store(store_path: str) -> ReplayStore:
    """Pick the store backend: ``:memory:`` -> ``InMemoryStore``, else ``SQLiteStore``.

    Mirrors the store selection used by cli/run.py.
    """
    return InMemoryStore() if store_path == ":memory:" else SQLiteStore(store_path)
34
+
35
+
36
+ def _resolve_target_session_id(store: ReplayStore, args: argparse.Namespace) -> str:
37
+ """Pick the session_id to load: explicit, or newest if ``--latest``."""
38
+ if args.latest:
39
+ if args.session_id is not None:
40
+ # argparse should already reject this; defensive guard.
41
+ raise InfrastructureError("--latest is mutually exclusive with a positional session_id")
42
+ newest = next(iter(store.query_sessions(limit=1)), None)
43
+ if newest is None:
44
+ raise InfrastructureError("no sessions in store; cannot resolve --latest")
45
+ return newest.session_id
46
+
47
+ if args.session_id is None:
48
+ raise InfrastructureError("session_id is required (or pass --latest)")
49
+ return args.session_id
50
+
51
+
52
def cmd_replay(args: argparse.Namespace) -> int:
    """Run the ``replay`` subcommand and return its exit code.

    Loads the requested (or latest) session from the store, renders it with
    ``render.render_session``, and returns the exit code derived from the
    stored session verdict.

    Raises:
        InfrastructureError: if the target session cannot be resolved or is
            not present in the store.
    """
    store_path = args.store_path
    store = _build_store(store_path)
    try:
        target_id = _resolve_target_session_id(store, args)
        try:
            artifact = store.load_session(target_id)
        except SessionNotFoundError as exc:
            raise InfrastructureError(f"session not found: {target_id}") from exc
    finally:
        # InMemoryStore has no close(); SQLiteStore does.
        closer = getattr(store, "close", None)
        if callable(closer):
            closer()

    render.render_session(
        artifact,
        store_path=store_path,
        loaded_from=artifact.created_at,
    )

    stored_verdict = artifact.session_verdict.session_verdict
    return render.exit_code_for(stored_verdict)
74
+
75
+
76
# Only the subcommand entry point is public; helpers stay module-private.
__all__ = [
    "cmd_replay",
]