selfevals 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
- selfevals/__init__.py +19 -0
- selfevals/_errors.py +44 -0
- selfevals/_internal/__init__.py +0 -0
- selfevals/_internal/hashing.py +23 -0
- selfevals/_internal/ids.py +65 -0
- selfevals/_internal/time.py +17 -0
- selfevals/analysis/__init__.py +23 -0
- selfevals/analysis/bundle.py +162 -0
- selfevals/analysis/hypothesis.py +26 -0
- selfevals/analysis/ingest.py +185 -0
- selfevals/analysis/schemas.py +119 -0
- selfevals/analysis/staging.py +34 -0
- selfevals/api/__init__.py +24 -0
- selfevals/api/__main__.py +47 -0
- selfevals/api/app.py +351 -0
- selfevals/api/broker.py +210 -0
- selfevals/api/broker_bridge.py +29 -0
- selfevals/api/queries.py +447 -0
- selfevals/api/schemas.py +151 -0
- selfevals/api/sse.py +114 -0
- selfevals/cli/__init__.py +15 -0
- selfevals/cli/_friendly.py +180 -0
- selfevals/cli/_help.py +55 -0
- selfevals/cli/analyze_commands.py +169 -0
- selfevals/cli/commands.py +615 -0
- selfevals/cli/main.py +409 -0
- selfevals/decision/__init__.py +34 -0
- selfevals/decision/matrix.py +185 -0
- selfevals/examples/__init__.py +8 -0
- selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
- selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
- selfevals/examples/pingpong.py +21 -0
- selfevals/graders/__init__.py +46 -0
- selfevals/graders/base.py +54 -0
- selfevals/graders/calibration.py +145 -0
- selfevals/graders/deterministic.py +143 -0
- selfevals/graders/llm_judge.py +187 -0
- selfevals/graders/registry.py +66 -0
- selfevals/optimization/__init__.py +47 -0
- selfevals/optimization/aggregator.py +246 -0
- selfevals/optimization/loop.py +432 -0
- selfevals/optimization/proposers.py +202 -0
- selfevals/py.typed +0 -0
- selfevals/repo/__init__.py +28 -0
- selfevals/repo/loader.py +276 -0
- selfevals/reporter/__init__.py +21 -0
- selfevals/reporter/_metrics.py +114 -0
- selfevals/reporter/compare.py +221 -0
- selfevals/reporter/json_report.py +105 -0
- selfevals/reporter/markdown.py +232 -0
- selfevals/runner/__init__.py +42 -0
- selfevals/runner/adapters.py +268 -0
- selfevals/runner/executor.py +234 -0
- selfevals/runner/otlp_receiver.py +343 -0
- selfevals/runner/otlp_to_recorder.py +180 -0
- selfevals/runner/sandbox.py +46 -0
- selfevals/schemas/__init__.py +213 -0
- selfevals/schemas/_base.py +82 -0
- selfevals/schemas/annotation.py +55 -0
- selfevals/schemas/dataset.py +111 -0
- selfevals/schemas/enums.py +324 -0
- selfevals/schemas/eval_case.py +189 -0
- selfevals/schemas/experiment.py +367 -0
- selfevals/schemas/failure_mode.py +76 -0
- selfevals/schemas/fleet.py +111 -0
- selfevals/schemas/grader_card.py +112 -0
- selfevals/schemas/iteration.py +219 -0
- selfevals/schemas/registry.py +125 -0
- selfevals/schemas/tool.py +43 -0
- selfevals/schemas/trace.py +384 -0
- selfevals/schemas/workspace.py +69 -0
- selfevals/sdk/__init__.py +24 -0
- selfevals/sdk/auto_instrument.py +165 -0
- selfevals/sdk/context.py +45 -0
- selfevals/sdk/exporter.py +50 -0
- selfevals/sdk/facade.py +203 -0
- selfevals/skills/__init__.py +61 -0
- selfevals/storage/__init__.py +53 -0
- selfevals/storage/errors.py +66 -0
- selfevals/storage/filesystem.py +137 -0
- selfevals/storage/interface.py +135 -0
- selfevals/storage/migrations/__init__.py +80 -0
- selfevals/storage/migrations/m0001_initial.py +57 -0
- selfevals/storage/seed.py +199 -0
- selfevals/storage/sqlite.py +232 -0
- selfevals/trace/__init__.py +31 -0
- selfevals/trace/otel_importer.py +455 -0
- selfevals/trace/payload_router.py +106 -0
- selfevals/trace/recorder.py +540 -0
- selfevals/version.py +1 -0
- selfevals-0.2.2.dist-info/METADATA +283 -0
- selfevals-0.2.2.dist-info/RECORD +96 -0
- selfevals-0.2.2.dist-info/WHEEL +4 -0
- selfevals-0.2.2.dist-info/entry_points.txt +2 -0
- selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
selfevals/cli/main.py
ADDED
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
"""CLI entry point.
|
|
2
|
+
|
|
3
|
+
`app()` is what `selfevals` resolves to via the project script entry.
|
|
4
|
+
It dispatches to subcommand handlers in `selfevals.cli.commands`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import sys
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
|
|
13
|
+
from selfevals._errors import SelfEvalsUserError
|
|
14
|
+
from selfevals.cli import analyze_commands, commands
|
|
15
|
+
from selfevals.cli._help import make_subparser
|
|
16
|
+
from selfevals.version import __version__
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the complete `selfevals` argument parser.

    Registers the global options (`--version`, `--db`) and one subparser per
    command group. Every leaf subcommand stores its handler under the `func`
    default, which `app()` calls with the parsed namespace.

    Returns:
        The fully configured top-level parser.
    """
    parser = argparse.ArgumentParser(
        prog="selfevals",
        description="Self-improving evals framework for AI agents.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Example:\n"
            " selfevals init my-team\n"
            " selfevals run evals/experiments/example_pingpong.yaml --no-persist"
        ),
    )
    parser.add_argument("--version", action="version", version=f"selfevals {__version__}")
    parser.add_argument(
        "--db",
        default="./selfevals.sqlite",
        help="Path to SQLite database file (default: ./selfevals.sqlite).",
    )

    sub = parser.add_subparsers(dest="command", required=True, metavar="<command>")

    # --- init ---------------------------------------------------------------
    p_init = make_subparser(
        sub,
        "init",
        help_text="Create a new workspace and seed default roles.",
        description=(
            "Create (or re-open, idempotently) a workspace identified by SLUG. "
            "Seeds default member roles for the owner."
        ),
        examples=[
            "selfevals init my-team",
            "selfevals init my-team --name 'My Team' --user alice",
        ],
    )
    p_init.add_argument("slug", help="Workspace slug (kebab-case).")
    p_init.add_argument("--name", help="Display name (default: slug).")
    p_init.add_argument("--user", default="local", help="Owner user id.")
    p_init.set_defaults(func=commands.cmd_init)

    # --- workspace ----------------------------------------------------------
    p_ws = make_subparser(
        sub,
        "workspace",
        help_text="Inspect workspaces (show their metadata and counts).",
        examples=["selfevals workspace show ws_01HZZZZZZZZZZZZZZZZZZZZZZZ"],
    )
    ws_sub = p_ws.add_subparsers(dest="ws_command", required=True)
    p_ws_show = ws_sub.add_parser(
        "show",
        help="Show a workspace by id.",
        description="Print a workspace's metadata and experiment count.",
        epilog="Example:\n selfevals workspace show ws_01HZZZZZZZZZZZZZZZZZZZZZZZ",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_ws_show.add_argument("workspace_id")
    p_ws_show.set_defaults(func=commands.cmd_workspace_show)

    # --- experiment ---------------------------------------------------------
    p_exp = make_subparser(
        sub,
        "experiment",
        help_text="List and inspect experiments inside a workspace.",
        examples=[
            "selfevals experiment list ws_01HZZZZZZZZZZZZZZZZZZZZZZZ",
            "selfevals experiment show ws_01HZZZ... exp_01HXXX...",
        ],
    )
    exp_sub = p_exp.add_subparsers(dest="exp_command", required=True)
    p_exp_list = exp_sub.add_parser(
        "list",
        help="List experiments in a workspace.",
        description="List every experiment stored in the given workspace.",
        epilog="Example:\n selfevals experiment list ws_01HZZZZZZZZZZZZZZZZZZZZZZZ",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_exp_list.add_argument("workspace_id")
    p_exp_list.set_defaults(func=commands.cmd_experiment_list)
    p_exp_show = exp_sub.add_parser(
        "show",
        help="Show one experiment.",
        description="Show one experiment's spec, target, and iteration count.",
        epilog="Example:\n selfevals experiment show ws_01HZZZ... exp_01HXXX...",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_exp_show.add_argument("workspace_id")
    p_exp_show.add_argument("experiment_id")
    p_exp_show.set_defaults(func=commands.cmd_experiment_show)

    # --- iteration ----------------------------------------------------------
    p_iter = make_subparser(
        sub,
        "iteration",
        help_text="List iterations recorded for an experiment.",
        examples=["selfevals iteration list ws_01HZZZ... exp_01HXXX..."],
    )
    iter_sub = p_iter.add_subparsers(dest="iter_command", required=True)
    p_iter_list = iter_sub.add_parser(
        "list",
        help="List iterations for an experiment.",
        description=(
            "List the iterations stored for an experiment, "
            "with their primary metric and decision outcome."
        ),
        epilog="Example:\n selfevals iteration list ws_01HZZZ... exp_01HXXX...",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_iter_list.add_argument("workspace_id")
    p_iter_list.add_argument("experiment_id")
    p_iter_list.set_defaults(func=commands.cmd_iteration_list)

    # --- report -------------------------------------------------------------
    p_report = make_subparser(
        sub,
        "report",
        help_text="Render a markdown or JSON report from stored iterations.",
        description=(
            "Render a report for an experiment using already-persisted "
            "iterations. Markdown by default; JSON via --format."
        ),
        examples=[
            "selfevals report ws_01HZZZ... exp_01HXXX...",
            "selfevals report ws_01HZZZ... exp_01HXXX... --format json",
        ],
    )
    p_report.add_argument("workspace_id")
    p_report.add_argument("experiment_id")
    p_report.add_argument("--format", choices=["markdown", "json"], default="markdown")
    p_report.set_defaults(func=commands.cmd_report)

    # --- run ----------------------------------------------------------------
    p_run = make_subparser(
        sub,
        "run",
        help_text="Run an experiment spec end-to-end (YAML).",
        description=(
            "Load a YAML experiment spec, resolve its agent entrypoint, "
            "run every case through the configured proposer/grader, "
            "persist iterations to SQLite (unless --no-persist), and "
            "print a report."
        ),
        examples=[
            "selfevals run evals/experiments/example_pingpong.yaml --no-persist",
            "selfevals run evals/experiments/example_pingpong.yaml --reps 3 --format json",
        ],
    )
    p_run.add_argument("spec", help="Path to evals/experiments/<name>.yaml")
    p_run.add_argument(
        "--workspace",
        help="Workspace id override (otherwise read from the spec's `workspace:` key).",
    )
    p_run.add_argument(
        "--max-iterations",
        type=int,
        default=None,
        help="Override experiment.run.max_iterations for this run.",
    )
    p_run.add_argument(
        "--reps",
        type=int,
        default=1,
        help="Repetitions per case (default 1).",
    )
    p_run.add_argument(
        "--format",
        choices=["markdown", "json"],
        default="markdown",
        help="Report format printed at the end of the run.",
    )
    p_run.add_argument(
        "--no-persist",
        action="store_true",
        help="Do not write iterations/decisions to the SQLite db.",
    )
    p_run.add_argument(
        "--persist-traces",
        choices=["none", "all", "failed"],
        default=None,
        help=(
            "Override run.persist_traces: which traces to store — none, all, or "
            "failed (default in the spec). Failed traces feed `analyze pull`."
        ),
    )
    p_run.set_defaults(func=commands.cmd_run)

    # --- compare ------------------------------------------------------------
    p_compare = make_subparser(
        sub,
        "compare",
        help_text="Diff two iterations side-by-side (by primary metric).",
        description=(
            "Print the primary metric for two iterations of the same "
            "experiment, plus their delta and decision outcomes."
        ),
        examples=["selfevals compare ws_01HZZZ... iter_01HAAA... iter_01HBBB..."],
    )
    p_compare.add_argument("workspace_id")
    p_compare.add_argument("iter_a_id")
    p_compare.add_argument("iter_b_id")
    p_compare.set_defaults(func=commands.cmd_compare)

    # --- estimate -----------------------------------------------------------
    p_estimate = make_subparser(
        sub,
        "estimate",
        help_text="Dry-run cost estimate for a search space x cases x reps.",
        description=(
            "Compute upper-bound agent calls and USD cost for a "
            "hypothetical run, without touching the db or any agent."
        ),
        examples=[
            "selfevals estimate --cases 50 --space-size 8 --reps 3 --cost-per-call 0.01",
        ],
    )
    p_estimate.add_argument("--cases", type=int, required=True, help="Number of evaluation cases.")
    p_estimate.add_argument(
        "--space-size", type=int, required=True, help="Number of proposals in the search space."
    )
    p_estimate.add_argument("--reps", type=int, default=1, help="Repetitions per case (default 1).")
    p_estimate.add_argument(
        "--cost-per-call", type=float, required=True, help="Estimated USD per agent call."
    )
    p_estimate.set_defaults(func=commands.cmd_estimate)

    # --- analyze ------------------------------------------------------------
    p_analyze = make_subparser(
        sub,
        "analyze",
        help_text="Error-analysis handshake: emit a bundle / ingest a result.",
        examples=[
            "selfevals analyze pull ws_01HZZZ... exp_01HXXX... > bundle.json",
            "selfevals analyze push ws_01HZZZ... exp_01HXXX... < result.json",
        ],
    )
    analyze_sub = p_analyze.add_subparsers(dest="analyze_command", required=True)
    p_an_pull = analyze_sub.add_parser(
        "pull",
        help="Emit an AnalysisBundle (failed traces + live taxonomy) as JSON.",
        description=(
            "Gather an experiment's failed traces and the live failure-mode "
            "taxonomy into a JSON bundle on stdout, for an external coding "
            "agent to do open/axial coding against."
        ),
        epilog="Example:\n selfevals analyze pull ws_01HZZZ... exp_01HXXX... > bundle.json",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_an_pull.add_argument("workspace_id")
    p_an_pull.add_argument("experiment_id")
    p_an_pull.add_argument("--iteration", type=int, default=None, help="Restrict to one iteration.")
    p_an_pull.add_argument(
        "--all", action="store_true", help="Include passing traces, not just failures."
    )
    p_an_pull.set_defaults(func=analyze_commands.cmd_analyze_pull)
    p_an_push = analyze_sub.add_parser(
        "push",
        help="Ingest an AnalysisResult (assignments + candidates + hypotheses) from stdin.",
        description=(
            "Read an AnalysisResult JSON on stdin and apply it: stamp failure "
            "modes on traces, create candidate modes, record hypotheses. "
            "Enforces the assignment XOR and classify-don't-rename invariants."
        ),
        epilog="Example:\n selfevals analyze push ws_01HZZZ... exp_01HXXX... < result.json",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_an_push.add_argument("workspace_id")
    p_an_push.add_argument("experiment_id")
    p_an_push.add_argument(
        "--by", default="agent:unknown", help="Provenance stamped on new candidates."
    )
    p_an_push.set_defaults(func=analyze_commands.cmd_analyze_push)

    # --- failuremode --------------------------------------------------------
    p_fm = make_subparser(
        sub,
        "failuremode",
        help_text="Manage the workspace failure-mode taxonomy.",
        examples=[
            "selfevals failuremode list ws_01HZZZ... --status candidate",
            # `promote` takes workspace_id AND failure_mode_id, so the example
            # must show both or the documented command fails to parse.
            "selfevals failuremode promote ws_01HZZZ... fm_01HAAA...",
        ],
    )
    fm_sub = p_fm.add_subparsers(dest="failuremode_command", required=True)
    p_fm_list = fm_sub.add_parser(
        "list",
        help="List failure modes in a workspace.",
        description="List the workspace taxonomy; filter by --status.",
        epilog="Example:\n selfevals failuremode list ws_01HZZZ... --status official",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_fm_list.add_argument("workspace_id")
    p_fm_list.add_argument("--status", choices=["candidate", "official", "retired"], default=None)
    p_fm_list.set_defaults(func=analyze_commands.cmd_failuremode_list)
    p_fm_promote = fm_sub.add_parser(
        "promote",
        help="Promote a candidate mode to official (the human gate).",
        description="Promote a CANDIDATE failure mode to OFFICIAL so it counts.",
        epilog="Example:\n selfevals failuremode promote ws_01HZZZ... fm_01HAAA...",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_fm_promote.add_argument("workspace_id")
    p_fm_promote.add_argument("failure_mode_id")
    p_fm_promote.set_defaults(func=analyze_commands.cmd_failuremode_promote)
    p_fm_retire = fm_sub.add_parser(
        "retire",
        help="Retire a failure mode (kept for history).",
        description="Mark a failure mode RETIRED; it stays for history.",
        epilog="Example:\n selfevals failuremode retire ws_01HZZZ... fm_01HAAA...",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_fm_retire.add_argument("workspace_id")
    p_fm_retire.add_argument("failure_mode_id")
    p_fm_retire.set_defaults(func=analyze_commands.cmd_failuremode_retire)
    p_fm_merge = fm_sub.add_parser(
        "merge",
        help="Merge one mode into another (sets superseded_by).",
        description="Move a mode's examples into another and retire the source.",
        epilog="Example:\n selfevals failuremode merge ws_01HZZZ... fm_dup... --into fm_keep...",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_fm_merge.add_argument("workspace_id")
    p_fm_merge.add_argument("failure_mode_id")
    p_fm_merge.add_argument("--into", required=True, help="Destination mode id.")
    p_fm_merge.set_defaults(func=analyze_commands.cmd_failuremode_merge)
    p_fm_edit = fm_sub.add_parser(
        "edit",
        help="Edit a mode's title and/or definition (human rename action).",
        description="Edit a failure mode's title/definition — the only place a mode is renamed.",
        epilog='Example:\n selfevals failuremode edit ws_01HZZZ... fm_01HAAA... --title "New title"',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_fm_edit.add_argument("workspace_id")
    p_fm_edit.add_argument("failure_mode_id")
    p_fm_edit.add_argument("--title", default=None)
    p_fm_edit.add_argument("--definition", default=None)
    p_fm_edit.set_defaults(func=analyze_commands.cmd_failuremode_edit)

    # --- skills -------------------------------------------------------------
    p_skills = make_subparser(
        sub,
        "skills",
        help_text="List the agent skills bundled with this install, or print one's path.",
        description=(
            "selfevals ships agent skills (e.g. error-analysis) inside the "
            "package. `list` shows them; `path` prints a skill's directory so "
            "an agent or onboarding flow can read or install it."
        ),
        examples=[
            "selfevals skills list",
            "selfevals skills path error-analysis",
        ],
    )
    skills_sub = p_skills.add_subparsers(dest="skills_command", required=True)
    p_skills_list = skills_sub.add_parser(
        "list",
        help="List bundled skills.",
        description="List every agent skill shipped with this selfevals install.",
        epilog="Example:\n selfevals skills list",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_skills_list.set_defaults(func=commands.cmd_skills_list)
    p_skills_path = skills_sub.add_parser(
        "path",
        help="Print the directory of a bundled skill.",
        description="Print the on-disk directory of the named bundled skill.",
        epilog="Example:\n selfevals skills path error-analysis",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_skills_path.add_argument("name", help="Skill name, e.g. error-analysis.")
    p_skills_path.set_defaults(func=commands.cmd_skills_path)

    # --- examples -----------------------------------------------------------
    p_examples = make_subparser(
        sub,
        "examples",
        help_text="Copy runnable example specs into the current project.",
        examples=[
            "selfevals examples copy pingpong",
            "selfevals run evals/experiments/example_pingpong.yaml --no-persist",
        ],
    )
    examples_sub = p_examples.add_subparsers(dest="examples_command", required=True)
    p_examples_copy = examples_sub.add_parser(
        "copy",
        help="Copy a runnable example by name.",
        description="Copy a packaged example spec and dataset into --to (default: cwd).",
        epilog="Example:\n selfevals examples copy pingpong",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p_examples_copy.add_argument("name", choices=["pingpong"])
    p_examples_copy.add_argument("--to", default=".", help="Destination directory (default: cwd).")
    p_examples_copy.set_defaults(func=commands.cmd_examples_copy)

    return parser
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def app(argv: Sequence[str] | None = None) -> int:
    """Parse `argv`, run the selected subcommand, and return its exit code.

    Args:
        argv: Argument list to parse; `None` is passed straight to
            `ArgumentParser.parse_args`.

    Returns:
        The handler's result coerced to `int`, or 2 when the handler raises
        `SelfEvalsUserError`.
    """
    namespace = _build_parser().parse_args(argv)
    try:
        exit_code = int(namespace.func(namespace))
    except SelfEvalsUserError as exc:
        # A user-correctable problem: report it as a single line on stderr
        # without a traceback and signal "bad input" with exit code 2. Any
        # other exception is left to propagate with its traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 2
    return exit_code
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def main() -> None:  # pragma: no cover - thin wrapper for the console script.
    """Run `app()` and exit the process with the code it returns."""
    sys.exit(app())
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
if __name__ == "__main__": # pragma: no cover
|
|
409
|
+
main()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Decision matrix: §10 canonical subset -> DecisionOutcome.
|
|
2
|
+
|
|
3
|
+
`DecisionMatrixEvaluator` plugs into `OptimizationLoop` as a
|
|
4
|
+
`DecisionEvaluatorProtocol`. It receives the current iteration's
|
|
5
|
+
`IterationAggregate` plus the baseline aggregate (the best previous
|
|
6
|
+
iteration, or None for the first one) and returns a
|
|
7
|
+
`(DecisionOutcome, rationale: str)` tuple.
|
|
8
|
+
|
|
9
|
+
The decision tree, in order:
|
|
10
|
+
|
|
11
|
+
1. If any **guardrail** declared on `Experiment.target.guardrails` is
|
|
12
|
+
violated → `REQUIRE_TRADEOFF_REVIEW` (or `REJECT` if policy says so).
|
|
13
|
+
2. If there is **no baseline** (first iteration) → `KEEP_CANDIDATE` when the
   primary metric meets its absolute target, otherwise `INVESTIGATE`.
3. If the primary metric **did not improve** vs baseline but still meets the
   target → `REJECT`.
4. If the primary metric **did not improve** and is below the target → outcome
   per `experiment.decision.if_regression_fails`.
5. Otherwise → `KEEP_CANDIDATE`.
|
|
18
|
+
|
|
19
|
+
This is a deliberately small slice of the canonical matrix — enough to
|
|
20
|
+
power MVP optimization runs without baking in policy that should belong
|
|
21
|
+
to the user. Each branch records why it fired in the rationale string.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from selfevals.decision.matrix import (
|
|
25
|
+
DecisionEvaluation,
|
|
26
|
+
DecisionMatrixEvaluator,
|
|
27
|
+
evaluate_iteration,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"DecisionEvaluation",
|
|
32
|
+
"DecisionMatrixEvaluator",
|
|
33
|
+
"evaluate_iteration",
|
|
34
|
+
]
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Decision matrix evaluator.
|
|
2
|
+
|
|
3
|
+
Pure function in spirit: given the iteration aggregate + baseline +
|
|
4
|
+
the experiment's target/decision policy, produce a `DecisionOutcome`
|
|
5
|
+
plus a human-readable rationale. No I/O — the OptimizationLoop owns
|
|
6
|
+
persistence.
|
|
7
|
+
|
|
8
|
+
The evaluator implements `DecisionEvaluatorProtocol` from
`selfevals.optimization.loop` so it can be passed in at construction
time. The protocol itself is imported at runtime (the evaluator subclasses
it); `IterationAggregate` and `Experiment` are imported only under
`TYPE_CHECKING`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
from selfevals.optimization.loop import DecisionEvaluatorProtocol
|
|
20
|
+
from selfevals.schemas.enums import DecisionOutcome
|
|
21
|
+
from selfevals.schemas.experiment import MetricTarget
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from selfevals.optimization.aggregator import IterationAggregate
|
|
25
|
+
from selfevals.schemas.experiment import Experiment
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class DecisionEvaluation:
    """Immutable result of `evaluate_iteration`: the outcome and why it fired."""

    # The decision reached for the iteration.
    outcome: DecisionOutcome
    # Human-readable explanation of which branch produced `outcome`.
    rationale: str
    # One formatted message per violated guardrail; empty when none were violated.
    violated_guardrails: list[str] = field(default_factory=list)
    # Current primary value minus the baseline's; None when there is no baseline.
    primary_delta: float | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Maps the `experiment.decision.if_regression_fails` policy string to the
# outcome returned when the primary metric did not improve and misses its
# target. `evaluate_iteration` falls back to REJECT for unlisted policies.
_OUTCOME_FOR_REGRESSION = {
    "reject": DecisionOutcome.REJECT,
    "investigate": DecisionOutcome.INVESTIGATE,
    "spawn_subexperiment": DecisionOutcome.SPAWN_SUBEXPERIMENT,
}

# Maps the `experiment.decision.if_guardrail_fails` policy string to the
# outcome returned when any guardrail is violated. `evaluate_iteration` falls
# back to REQUIRE_TRADEOFF_REVIEW for unlisted policies.
_OUTCOME_FOR_GUARDRAIL = {
    "reject": DecisionOutcome.REJECT,
    "require_tradeoff_review": DecisionOutcome.REQUIRE_TRADEOFF_REVIEW,
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _check_operator(value: float, op: str, threshold: float) -> bool:
|
|
49
|
+
"""True iff `value op threshold` per the operator string."""
|
|
50
|
+
match op:
|
|
51
|
+
case ">":
|
|
52
|
+
return value > threshold
|
|
53
|
+
case ">=":
|
|
54
|
+
return value >= threshold
|
|
55
|
+
case "<":
|
|
56
|
+
return value < threshold
|
|
57
|
+
case "<=":
|
|
58
|
+
return value <= threshold
|
|
59
|
+
case "==":
|
|
60
|
+
return value == threshold
|
|
61
|
+
case _: # pragma: no cover — schema validator rejects others
|
|
62
|
+
raise ValueError(f"unknown operator {op!r}")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _guardrails_violated(
    aggregate: IterationAggregate, guardrails: list[MetricTarget]
) -> list[str]:
    """Describe every guardrail whose observed value fails its condition.

    Each guardrail's value is read from `aggregate.guardrails`, falling back
    to `aggregate.reliability` when absent there. A guardrail with no value in
    either mapping is skipped, i.e. counted as passing rather than failing.

    Args:
        aggregate: Aggregate for the iteration being evaluated.
        guardrails: Guardrail targets (name, operator, value) to check.

    Returns:
        One formatted message per violated guardrail, in input order.
    """
    failed: list[str] = []
    for rail in guardrails:
        observed = aggregate.guardrails.get(rail.name)
        if observed is None:
            observed = aggregate.reliability.get(rail.name)
        # Missing data or a satisfied condition: nothing to report.
        if observed is None or _check_operator(observed, rail.operator, rail.value):
            continue
        failed.append(f"{rail.name}={observed:.6g} fails {rail.operator}{rail.value:.6g}")
    return failed
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def evaluate_iteration(
    *,
    experiment: Experiment,
    aggregate: IterationAggregate,
    baseline: IterationAggregate | None,
) -> DecisionEvaluation:
    """Decide what to do with an iteration and explain why.

    Branches, in order: violated guardrails; first iteration (no baseline)
    judged against the absolute target; no improvement vs. baseline (REJECT
    if the target is still met, otherwise the regression policy); improvement.

    Args:
        experiment: Supplies the primary target, guardrails and decision policy.
        aggregate: Aggregate of the iteration being judged.
        baseline: Aggregate to compare against, or None for the first iteration.

    Returns:
        A `DecisionEvaluation` carrying the outcome, rationale, violated
        guardrails (if any) and the primary-metric delta.
    """
    primary = experiment.target.primary
    metric = primary.name
    observed = aggregate.primary_value

    delta: float | None = (
        observed - baseline.primary_value if baseline is not None else None
    )

    def meets_target() -> bool:
        # Evaluated lazily: only the branches that need the target check run it.
        return _check_operator(observed, primary.operator, primary.value)

    breaches = _guardrails_violated(aggregate, experiment.target.guardrails)
    if breaches:
        policy = experiment.decision.if_guardrail_fails
        return DecisionEvaluation(
            outcome=_OUTCOME_FOR_GUARDRAIL.get(policy, DecisionOutcome.REQUIRE_TRADEOFF_REVIEW),
            rationale=f"guardrail(s) violated: {'; '.join(breaches)}; policy={policy}",
            violated_guardrails=breaches,
            primary_delta=delta,
        )

    # First iteration: nothing to compare against, so use the absolute target.
    if baseline is None:
        if meets_target():
            first_outcome = DecisionOutcome.KEEP_CANDIDATE
            first_reason = (
                f"first iteration meets target: {metric}={observed:.6g} "
                f"{primary.operator} {primary.value:.6g}"
            )
        else:
            first_outcome = DecisionOutcome.INVESTIGATE
            first_reason = (
                f"first iteration below target: {metric}={observed:.6g} "
                f"vs target {primary.operator} {primary.value:.6g}; investigate before bailing"
            )
        return DecisionEvaluation(
            outcome=first_outcome,
            rationale=first_reason,
            primary_delta=None,
        )

    # Later iterations: a baseline exists, so the delta was computed above.
    assert delta is not None

    # NOTE(review): "improved" means the primary value went up. Confirm this is
    # intended for primary targets declared with `<` / `<=`.
    # Written as `not (delta <= 0)` so a NaN delta lands in this branch too.
    if not (delta <= 0):
        return DecisionEvaluation(
            outcome=DecisionOutcome.KEEP_CANDIDATE,
            rationale=f"improvement: {metric}={observed:.6g} (Δ{delta:+.6g}); guardrails ok",
            primary_delta=delta,
        )

    if meets_target():
        # No gain over the baseline, though the target is still satisfied.
        return DecisionEvaluation(
            outcome=DecisionOutcome.REJECT,
            rationale=(
                f"no improvement: Δ{metric}={delta:+.6g} vs baseline "
                f"{baseline.primary_value:.6g} (still meets target)"
            ),
            primary_delta=delta,
        )

    # No gain and the target is missed: defer to the configured policy.
    policy = experiment.decision.if_regression_fails
    return DecisionEvaluation(
        outcome=_OUTCOME_FOR_REGRESSION.get(policy, DecisionOutcome.REJECT),
        rationale=(
            f"regression below target: {metric}={observed:.6g} "
            f"{primary.operator} {primary.value:.6g}; "
            f"Δ={delta:+.6g}; policy={policy}"
        ),
        primary_delta=delta,
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class DecisionMatrixEvaluator(DecisionEvaluatorProtocol):
    """Class wrapper around `evaluate_iteration` for use as a `DecisionEvaluatorProtocol`."""

    def evaluate(
        self,
        *,
        experiment: Experiment,
        aggregate: IterationAggregate,
        baseline: IterationAggregate | None,
    ) -> tuple[DecisionOutcome, str]:
        """Evaluate one iteration and return `(outcome, rationale)`.

        Delegates to `evaluate_iteration` and returns only the outcome and
        rationale from its result.
        """
        verdict = evaluate_iteration(
            experiment=experiment,
            aggregate=aggregate,
            baseline=baseline,
        )
        return (verdict.outcome, verdict.rationale)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Reference agents used by example experiments and tests.
|
|
2
|
+
|
|
3
|
+
Nothing here is meant for production — these exist so that `selfevals
|
|
4
|
+
run evals/experiments/example_*.yaml` works out of the box and serves
|
|
5
|
+
as both a smoke test and an onboarding artifact.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
{"name": "say pong", "task_type": "echo", "input": {"messages": [{"role": "user", "content": "ping"}]}, "taxonomy": {"level": "final_response", "feature": {"primary": "commerce.product_resolution"}, "source": {"type": "handcrafted"}, "ground_truth": {"methods": ["exact_match"]}, "dataset_type": "capability"}, "expected": {"must_include": ["pong"]}}
|
|
2
|
+
{"name": "say pong again", "task_type": "echo", "input": {"messages": [{"role": "user", "content": "ping"}]}, "taxonomy": {"level": "final_response", "feature": {"primary": "commerce.product_resolution"}, "source": {"type": "handcrafted"}, "ground_truth": {"methods": ["exact_match"]}, "dataset_type": "capability"}, "expected": {"must_include": ["pong"]}}
|