selfevals 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
  2. selfevals/__init__.py +19 -0
  3. selfevals/_errors.py +44 -0
  4. selfevals/_internal/__init__.py +0 -0
  5. selfevals/_internal/hashing.py +23 -0
  6. selfevals/_internal/ids.py +65 -0
  7. selfevals/_internal/time.py +17 -0
  8. selfevals/analysis/__init__.py +23 -0
  9. selfevals/analysis/bundle.py +162 -0
  10. selfevals/analysis/hypothesis.py +26 -0
  11. selfevals/analysis/ingest.py +185 -0
  12. selfevals/analysis/schemas.py +119 -0
  13. selfevals/analysis/staging.py +34 -0
  14. selfevals/api/__init__.py +24 -0
  15. selfevals/api/__main__.py +47 -0
  16. selfevals/api/app.py +351 -0
  17. selfevals/api/broker.py +210 -0
  18. selfevals/api/broker_bridge.py +29 -0
  19. selfevals/api/queries.py +447 -0
  20. selfevals/api/schemas.py +151 -0
  21. selfevals/api/sse.py +114 -0
  22. selfevals/cli/__init__.py +15 -0
  23. selfevals/cli/_friendly.py +180 -0
  24. selfevals/cli/_help.py +55 -0
  25. selfevals/cli/analyze_commands.py +169 -0
  26. selfevals/cli/commands.py +615 -0
  27. selfevals/cli/main.py +409 -0
  28. selfevals/decision/__init__.py +34 -0
  29. selfevals/decision/matrix.py +185 -0
  30. selfevals/examples/__init__.py +8 -0
  31. selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
  32. selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
  33. selfevals/examples/pingpong.py +21 -0
  34. selfevals/graders/__init__.py +46 -0
  35. selfevals/graders/base.py +54 -0
  36. selfevals/graders/calibration.py +145 -0
  37. selfevals/graders/deterministic.py +143 -0
  38. selfevals/graders/llm_judge.py +187 -0
  39. selfevals/graders/registry.py +66 -0
  40. selfevals/optimization/__init__.py +47 -0
  41. selfevals/optimization/aggregator.py +246 -0
  42. selfevals/optimization/loop.py +432 -0
  43. selfevals/optimization/proposers.py +202 -0
  44. selfevals/py.typed +0 -0
  45. selfevals/repo/__init__.py +28 -0
  46. selfevals/repo/loader.py +276 -0
  47. selfevals/reporter/__init__.py +21 -0
  48. selfevals/reporter/_metrics.py +114 -0
  49. selfevals/reporter/compare.py +221 -0
  50. selfevals/reporter/json_report.py +105 -0
  51. selfevals/reporter/markdown.py +232 -0
  52. selfevals/runner/__init__.py +42 -0
  53. selfevals/runner/adapters.py +268 -0
  54. selfevals/runner/executor.py +234 -0
  55. selfevals/runner/otlp_receiver.py +343 -0
  56. selfevals/runner/otlp_to_recorder.py +180 -0
  57. selfevals/runner/sandbox.py +46 -0
  58. selfevals/schemas/__init__.py +213 -0
  59. selfevals/schemas/_base.py +82 -0
  60. selfevals/schemas/annotation.py +55 -0
  61. selfevals/schemas/dataset.py +111 -0
  62. selfevals/schemas/enums.py +324 -0
  63. selfevals/schemas/eval_case.py +189 -0
  64. selfevals/schemas/experiment.py +367 -0
  65. selfevals/schemas/failure_mode.py +76 -0
  66. selfevals/schemas/fleet.py +111 -0
  67. selfevals/schemas/grader_card.py +112 -0
  68. selfevals/schemas/iteration.py +219 -0
  69. selfevals/schemas/registry.py +125 -0
  70. selfevals/schemas/tool.py +43 -0
  71. selfevals/schemas/trace.py +384 -0
  72. selfevals/schemas/workspace.py +69 -0
  73. selfevals/sdk/__init__.py +24 -0
  74. selfevals/sdk/auto_instrument.py +165 -0
  75. selfevals/sdk/context.py +45 -0
  76. selfevals/sdk/exporter.py +50 -0
  77. selfevals/sdk/facade.py +203 -0
  78. selfevals/skills/__init__.py +61 -0
  79. selfevals/storage/__init__.py +53 -0
  80. selfevals/storage/errors.py +66 -0
  81. selfevals/storage/filesystem.py +137 -0
  82. selfevals/storage/interface.py +135 -0
  83. selfevals/storage/migrations/__init__.py +80 -0
  84. selfevals/storage/migrations/m0001_initial.py +57 -0
  85. selfevals/storage/seed.py +199 -0
  86. selfevals/storage/sqlite.py +232 -0
  87. selfevals/trace/__init__.py +31 -0
  88. selfevals/trace/otel_importer.py +455 -0
  89. selfevals/trace/payload_router.py +106 -0
  90. selfevals/trace/recorder.py +540 -0
  91. selfevals/version.py +1 -0
  92. selfevals-0.2.2.dist-info/METADATA +283 -0
  93. selfevals-0.2.2.dist-info/RECORD +96 -0
  94. selfevals-0.2.2.dist-info/WHEEL +4 -0
  95. selfevals-0.2.2.dist-info/entry_points.txt +2 -0
  96. selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
selfevals/cli/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """SelfEvals CLI.
2
+
3
+ A thin argparse-based command surface over the rest of the library.
4
+ Zero new runtime dependencies — Typer/Click would be friendlier but
5
+ each pulls a dep tree we don't need yet.
6
+
7
+ Entry point declared in `pyproject.toml`:
8
+ selfevals = "selfevals.cli.main:app"
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from selfevals.cli.main import app
14
+
15
+ __all__ = ["app"]
selfevals/cli/_friendly.py ADDED
@@ -0,0 +1,180 @@
1
+ """Translate low-level exceptions into actionable CLI errors.
2
+
3
+ This module is the single chokepoint between "what the runtime raised"
4
+ and "what the user sees on stderr". The rule is:
5
+
6
+ * If the failure is something a user can fix by changing inputs,
7
+ surfaces, or configuration, we wrap the underlying exception in a
8
+ :class:`SelfEvalsUserError` with a tight, file-relative message and
9
+ (when possible) a concrete hint.
10
+ * If it's an internal invariant violation, we re-raise so the traceback
11
+ reaches the user.
12
+
13
+ Adding a new friendly-error path: pick a function below or add one. Do
14
+ **not** sprinkle ``except FooError`` blocks across the CLI — keeping
15
+ the translation table here is the whole point.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import difflib
21
+ import sqlite3
22
+ from pathlib import Path
23
+ from urllib.error import HTTPError, URLError
24
+
25
+ import yaml
26
+
27
+ from selfevals._errors import SelfEvalsUserError
28
+ from selfevals.repo.loader import LoaderError, load_experiment_spec
29
+ from selfevals.runner.adapters import AdapterError
30
+
31
+ if False: # for type checkers only, no runtime cycle.
32
+ from selfevals.repo.loader import ExperimentSpec
33
+
34
+
35
def load_spec(path: str | Path, *, workspace_id: str | None = None) -> ExperimentSpec:
    """Load a YAML experiment spec, converting failures into user errors.

    Delegates to :func:`selfevals.repo.loader.load_experiment_spec` and
    re-raises `LoaderError`, `yaml.YAMLError` and `FileNotFoundError` as
    :class:`SelfEvalsUserError`, always chained to the original exception.

    Args:
        path: Location of the YAML spec; coerced to a `Path`.
        workspace_id: Forwarded unchanged to the loader.

    Returns:
        Whatever the loader returns for a valid spec.

    Raises:
        SelfEvalsUserError: For any of the three exception types above.
    """
    spec_path = Path(path)
    try:
        spec = load_experiment_spec(spec_path, workspace_id=workspace_id)
    except LoaderError as exc:
        # A missing dataset gets the fuzzy "did you mean ..." error; every
        # other loader failure keeps its message and gains a generic hint.
        missing = _missing_dataset_path(exc)
        if missing is None:
            raise SelfEvalsUserError(
                str(exc), hint=_yaml_hint_if_relevant(spec_path, exc)
            ) from exc
        raise dataset_not_found(missing) from exc
    except yaml.YAMLError as exc:  # pragma: no cover - loader already wraps this
        raise SelfEvalsUserError(
            f"could not parse YAML {spec_path}: {exc}",
            hint="check indentation and quoting; run `yamllint` for a line-by-line view",
        ) from exc
    except FileNotFoundError as exc:  # pragma: no cover - loader already handles
        raise SelfEvalsUserError(f"experiment spec not found: {spec_path}") from exc
    return spec
67
+
68
+
69
def dataset_not_found(path: Path) -> SelfEvalsUserError:
    """Build a "dataset path ... not found" error with a fuzzy-match hint.

    The directory that should contain `path` is scanned for data files
    (`.jsonl`, `.json`, `.yaml`, `.yml`); the closest file name, if any
    clears the similarity cutoff, is offered as a "did you mean" hint.
    When `path.parent` is not a directory the current directory is
    scanned instead.

    Args:
        path: The dataset path that could not be found.

    Returns:
        The exception, un-raised, so the caller can choose
        `raise ... from exc` to preserve a traceback.
    """
    # `is_dir()` rather than `exists()`: a parent that exists but is a
    # regular file would make `iterdir()` raise NotADirectoryError.
    parent = path.parent if path.parent.is_dir() else Path()
    data_suffixes = {".jsonl", ".json", ".yaml", ".yml"}
    try:
        candidates = [
            entry.name
            for entry in parent.iterdir()
            if entry.suffix in data_suffixes and entry.is_file()
        ]
    except OSError:
        # This runs while reporting another failure: an unreadable or
        # vanished directory must cost us the hint, not mask the error.
        candidates = []
    closest = difflib.get_close_matches(path.name, candidates, n=1, cutoff=0.6)
    hint: str | None = None
    if closest:
        hint = f"did you mean {parent / closest[0]}?"
    return SelfEvalsUserError(f"dataset path {str(path)!r} not found", hint=hint)
86
+
87
+
88
def unknown_grader(name: str, available: list[str]) -> SelfEvalsUserError:
    """Build the error for a grader name that is not registered.

    The message lists the available names in sorted order (or "(none)"
    when the list is empty); the hint carries the closest name when one
    clears the similarity cutoff.

    Args:
        name: The grader name that was requested.
        available: Names that are currently registered.

    Returns:
        The exception, un-raised.
    """
    listing = ", ".join(sorted(available)) if available else "(none)"
    matches = difflib.get_close_matches(name, available, n=1, cutoff=0.6)
    suggestion: str | None = f"did you mean {matches[0]!r}?" if matches else None
    return SelfEvalsUserError(
        f"grader {name!r} not registered; available: {listing}",
        hint=suggestion,
    )
99
+
100
+
101
def wrap_adapter_error(exc: Exception, *, url: str | None = None) -> SelfEvalsUserError:
    """Translate an adapter-level failure into a user-facing error.

    Classification is by exception type, checked in this order:
    `HTTPError`, `URLError`, `TimeoutError`, then everything else.
    `HTTPError` must come first because it is a subclass of `URLError`.

    Args:
        exc: The exception raised while calling the adapter.
        url: The endpoint being called, when known; appended to the
            message as " to <url>".

    Returns:
        The exception, un-raised.
    """
    target = "" if not url else f" to {url}"

    if isinstance(exc, HTTPError):
        message = f"HTTP adapter got {exc.code} {exc.reason}{target}"
        hint = "check the endpoint returns 2xx with a JSON body"
    elif isinstance(exc, URLError):
        reason = getattr(exc, "reason", exc)
        message = f"HTTP adapter could not reach{target} ({reason})"
        hint = "confirm the endpoint is running and reachable from this host"
    elif isinstance(exc, TimeoutError):
        message = f"HTTP adapter timed out{target}"
        hint = "increase timeout_seconds or check endpoint responsiveness"
    else:
        # `AdapterError` covers contract violations (bad JSON, non-dict, etc.).
        return SelfEvalsUserError(f"adapter error{target}: {exc}")

    return SelfEvalsUserError(message, hint=hint)
126
+
127
+
128
def wrap_sqlite_error(exc: sqlite3.Error, *, db_path: Path | str) -> SelfEvalsUserError:
    """Translate a raw sqlite3 error into an actionable user error.

    The lower-cased exception text is matched against keywords: lock
    contention first, then corruption; anything else gets a generic
    message that embeds the original text.

    Args:
        exc: The sqlite3 exception that was raised.
        db_path: Path of the database file, used in the message.

    Returns:
        The exception, un-raised.
    """
    lowered = str(exc).lower()

    if any(token in lowered for token in ("locked", "busy")):
        return SelfEvalsUserError(
            f"sqlite database {db_path} is locked",
            hint="another selfevals process is using it; try `--db <new-path>` or wait",
        )

    if any(token in lowered for token in ("malformed", "corrupt", "not a database")):
        return SelfEvalsUserError(
            f"sqlite database {db_path} is corrupted or not a valid selfevals db",
            hint="back up the file and re-run with `--db <new-path>` to start clean",
        )

    return SelfEvalsUserError(f"sqlite error at {db_path}: {exc}")
142
+
143
+
144
+ def _missing_dataset_path(exc: LoaderError) -> Path | None:
145
+ """If the LoaderError comes from `_read_jsonl`'s 'dataset file not found',
146
+ return the missing path so the caller can add a fuzzy hint."""
147
+ msg = str(exc)
148
+ marker = "dataset file not found: "
149
+ if marker not in msg:
150
+ return None
151
+ # Format: "dataset file not found: <path>"
152
+ return Path(msg.split(marker, 1)[1].strip())
153
+
154
+
155
+ def _yaml_hint_if_relevant(spec_path: Path, exc: LoaderError) -> str | None:
156
+ msg = str(exc).lower()
157
+ if "could not parse yaml" in msg:
158
+ return (
159
+ f"open {spec_path} and check indentation and unclosed brackets; "
160
+ "yaml errors usually point at the line just *after* the mistake"
161
+ )
162
+ if "workspace_id missing" in msg:
163
+ return "add `workspace: ws_<id>` at the top of the file or pass --workspace"
164
+ if "missing or non-mapping `experiment:`" in msg:
165
+ return "the YAML must have an `experiment:` key with the experiment block"
166
+ if "dataset" in msg and "not found" in msg:
167
+ return "check `dataset.cases_path` is relative to the YAML file"
168
+ if "entrypoint" in msg:
169
+ return "format must be 'package.module:callable_name' (note the colon)"
170
+ return None
171
+
172
+
173
+ __all__ = [
174
+ "AdapterError",
175
+ "dataset_not_found",
176
+ "load_spec",
177
+ "unknown_grader",
178
+ "wrap_adapter_error",
179
+ "wrap_sqlite_error",
180
+ ]
selfevals/cli/_help.py ADDED
@@ -0,0 +1,55 @@
1
+ """Help-text helpers for the CLI.
2
+
3
+ Centralises the epilog formatting so every subcommand renders examples
4
+ the same way and so `tests/cli/test_help_texts.py` can assert a single
5
+ convention ("Example:" line) across the board.
6
+
7
+ Keep this module pure text. No business logic.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import textwrap
14
+ from collections.abc import Iterable
15
+
16
+
17
def epilog(*examples: str) -> str:
    """Format shell examples as an argparse epilog.

    The result is an ``Example:`` heading line followed by one indented
    line per example, joined with newlines.

    Args:
        *examples: One or more single-line shell commands.

    Returns:
        The epilog text.

    Raises:
        ValueError: If called without any example.
    """
    if len(examples) == 0:
        raise ValueError("epilog() requires at least one example")
    body = [f" {example}" for example in examples]
    return "\n".join(["Example:", *body])
29
+
30
+
31
def make_subparser(
    subparsers: argparse._SubParsersAction[argparse.ArgumentParser],
    name: str,
    *,
    help_text: str,
    description: str | None = None,
    examples: Iterable[str] = (),
) -> argparse.ArgumentParser:
    """Register a subcommand with a normalised description and epilog.

    Args:
        subparsers: The action returned by ``add_subparsers()``.
        name: The subcommand name.
        help_text: One-liner shown in the parent ``--help`` listing.
        description: Text shown at the top of the subcommand's own
            ``--help``; falls back to ``help_text`` when empty or
            omitted. It is dedented and stripped.
        examples: Shell examples rendered into the epilog; no epilog is
            set when this is empty.

    Returns:
        The new subparser. It uses
        :class:`argparse.RawDescriptionHelpFormatter` so the description
        and epilog keep their line breaks and indentation.
    """
    # Materialise once: `examples` may be a one-shot iterator.
    collected = tuple(examples)
    epilog_text = epilog(*collected) if collected else None
    description_text = textwrap.dedent(description or help_text).strip()
    return subparsers.add_parser(
        name,
        help=help_text,
        description=description_text,
        epilog=epilog_text,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
selfevals/cli/analyze_commands.py ADDED
@@ -0,0 +1,169 @@
1
+ """CLI commands for error analysis: `analyze pull/push` and `failuremode *`.
2
+
3
+ These implement the handshake (design §4) and the human promotion gate (§6).
4
+ `analyze pull` emits an AnalysisBundle as JSON on stdout; `analyze push` reads
5
+ an AnalysisResult as JSON on stdin. The `failuremode` family manages the
6
+ taxonomy: list, promote (candidate→official), retire, merge, edit.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ from selfevals._errors import SelfEvalsUserError
17
+ from selfevals.analysis import build_bundle, ingest_result
18
+ from selfevals.analysis.ingest import AnalysisIngestError
19
+ from selfevals.analysis.schemas import AnalysisResult
20
+ from selfevals.cli.commands import _require_entity, _storage
21
+ from selfevals.schemas.enums import FailureModeStatus
22
+ from selfevals.schemas.failure_mode import FailureMode
23
+ from selfevals.storage.filesystem import FilesystemObjectStore
24
+ from selfevals.storage.interface import ListFilter
25
+
26
+
27
def _object_store(args: argparse.Namespace) -> FilesystemObjectStore:
    """Return a filesystem object store in an `objects` directory beside the db.

    The root is the directory containing `args.db`, joined with "objects".
    """
    db_directory = Path(args.db).parent
    return FilesystemObjectStore(db_directory / "objects")
30
+
31
+
32
def cmd_analyze_pull(args: argparse.Namespace) -> int:
    """Build an analysis bundle and print it to stdout as indented JSON.

    Reads `workspace_id`, `experiment_id`, `iteration` and `all` from
    `args`; the bundle is restricted to failed items unless `args.all`
    is set. Storage is closed before anything is printed.

    Returns:
        0 on success.
    """
    storage = _storage(args)
    try:
        bundle = build_bundle(
            storage,
            workspace_id=args.workspace_id,
            experiment_id=args.experiment_id,
            iteration=args.iteration,
            only_failed=not args.all,
        )
    finally:
        storage.close()

    payload = bundle.model_dump(mode="json")
    print(json.dumps(payload, indent=2))
    return 0
46
+
47
+
48
def cmd_analyze_push(args: argparse.Namespace) -> int:
    """Read an AnalysisResult JSON from stdin, ingest it, print a summary.

    Reads `workspace_id`, `experiment_id` and `by` from `args`. The
    payload is validated before storage is opened.

    Returns:
        0 on success.

    Raises:
        SelfEvalsUserError: If stdin is empty or whitespace, the JSON
            fails validation, or ingestion raises `AnalysisIngestError`.
    """
    payload = sys.stdin.read()
    if payload.strip() == "":
        raise SelfEvalsUserError("analyze push expects an AnalysisResult JSON on stdin")

    try:
        result = AnalysisResult.model_validate_json(payload)
    except ValueError as exc:
        raise SelfEvalsUserError(f"invalid AnalysisResult JSON: {exc}") from exc

    storage = _storage(args)
    try:
        summary = ingest_result(
            storage,
            workspace_id=args.workspace_id,
            experiment_id=args.experiment_id,
            result=result,
            proposed_by=args.by,
            object_store=_object_store(args),
        )
    except AnalysisIngestError as exc:
        raise SelfEvalsUserError(str(exc)) from exc
    finally:
        storage.close()

    created = summary.created_candidates
    print(f"assignments applied : {summary.assignments_applied}")
    print(f"candidates created : {len(created)}")
    print(f"candidates re-seen : {len(summary.updated_candidates)}")
    print(f"hypotheses recorded : {summary.hypotheses_recorded}")
    if not created:
        return 0

    # New candidates stay unofficial until a human promotes them.
    print("\nnew candidates (promote with `selfevals failuremode promote <id>`):")
    for candidate_id in created:
        print(f" {candidate_id}")
    return 0
81
+
82
+
83
def cmd_failuremode_list(args: argparse.Namespace) -> int:
    """Print the workspace's failure modes, optionally filtered by status.

    Reads `workspace_id` and `status` from `args`. Modes are sorted by
    status string, then slug; official modes are marked with `*`.

    Returns:
        0, including when there is nothing to list.
    """
    storage = _storage(args)
    try:
        with storage.open(args.workspace_id) as scope:
            modes = []
            for entity in scope.list_entities(FailureMode, ListFilter()):
                if isinstance(entity, FailureMode):
                    modes.append(entity)
    finally:
        storage.close()

    wanted_status = args.status
    if wanted_status:
        modes = [mode for mode in modes if str(mode.status) == wanted_status]

    if not modes:
        print("(no failure modes)")
        return 0

    def _sort_key(mode: FailureMode) -> tuple[str, str]:
        return (str(mode.status), mode.slug)

    for mode in sorted(modes, key=_sort_key):
        marker = "*" if mode.status == FailureModeStatus.OFFICIAL else " "
        print(f"{marker} {mode.id} [{mode.status}] {mode.slug} ({len(mode.examples)} ex)")
    return 0
103
+
104
+
105
def _load_mode(args: argparse.Namespace, fm_id: str) -> FailureMode:
    """Fetch one failure mode by id from the workspace in `args`.

    Opens storage, looks the entity up via `_require_entity`, and closes
    storage again whether or not the lookup succeeds.
    """
    backend = _storage(args)
    try:
        with backend.open(args.workspace_id) as scope:
            loaded = _require_entity(scope, FailureMode, fm_id)
    finally:
        backend.close()
    # Narrows the looked-up entity to FailureMode for the return type.
    assert isinstance(loaded, FailureMode)
    return loaded
114
+
115
+
116
def _save_mode(args: argparse.Namespace, fm: FailureMode) -> None:
    """Write `fm` to the workspace named in `args`.

    Opens storage, stores the entity with `put_entity`, and closes
    storage again whether or not the write succeeds.
    """
    backend = _storage(args)
    try:
        with backend.open(args.workspace_id) as scope:
            scope.put_entity(fm)
    finally:
        backend.close()
123
+
124
+
125
def cmd_failuremode_promote(args: argparse.Namespace) -> int:
    """Set a failure mode's status to official.

    Reads `failure_mode_id` from `args`. A mode that is already official
    is reported and left untouched, with nothing written.

    Returns:
        0 in both cases.
    """
    mode = _load_mode(args, args.failure_mode_id)
    already_official = mode.status == FailureModeStatus.OFFICIAL
    if not already_official:
        mode.status = FailureModeStatus.OFFICIAL
        _save_mode(args, mode)
        print(f"promoted {mode.id} ({mode.slug}) → official")
    else:
        print(f"{mode.id} is already official")
    return 0
134
+
135
+
136
def cmd_failuremode_retire(args: argparse.Namespace) -> int:
    """Set a failure mode's status to retired and save it.

    Reads `failure_mode_id` from `args`. The status is written
    regardless of the mode's current status.

    Returns:
        0 on success.
    """
    mode = _load_mode(args, args.failure_mode_id)
    mode.status = FailureModeStatus.RETIRED
    _save_mode(args, mode)
    summary_line = f"retired {mode.id} ({mode.slug})"
    print(summary_line)
    return 0
142
+
143
+
144
def cmd_failuremode_merge(args: argparse.Namespace) -> int:
    """Merge one failure mode into another and retire the source.

    Reads `failure_mode_id` (source) and `into` (destination) from
    `args`. The source's examples are appended to the destination's, and
    the source is marked retired with `superseded_by` pointing at the
    destination.

    Both records are read and written inside a single storage scope
    rather than four separate open/close cycles, so the two writes
    cannot be split across independently opened sessions.
    NOTE(review): whether the scope commits both writes atomically
    depends on the storage backend — confirm.

    Returns:
        0 on success.

    Raises:
        SelfEvalsUserError: If source and destination are the same mode.
    """
    storage = _storage(args)
    try:
        with storage.open(args.workspace_id) as scope:
            src = _require_entity(scope, FailureMode, args.failure_mode_id)
            dst = _require_entity(scope, FailureMode, args.into)
            # Narrow the looked-up entities to FailureMode.
            assert isinstance(src, FailureMode)
            assert isinstance(dst, FailureMode)
            if src.id == dst.id:
                raise SelfEvalsUserError("cannot merge a mode into itself")
            # Copy examples to the destination, retire the source, set
            # the back-pointer.
            dst.examples = [*dst.examples, *src.examples]
            src.superseded_by = dst.id
            src.status = FailureModeStatus.RETIRED
            scope.put_entity(dst)
            scope.put_entity(src)
    finally:
        storage.close()
    print(f"merged {src.id} ({src.slug}) → {dst.id} ({dst.slug}); source retired")
    return 0
157
+
158
+
159
def cmd_failuremode_edit(args: argparse.Namespace) -> int:
    """Update a failure mode's title and/or definition.

    Reads `failure_mode_id`, `title` and `definition` from `args`. Only
    the fields that are not `None` are changed; the mode is loaded
    before the arguments are checked.

    Returns:
        0 on success.

    Raises:
        SelfEvalsUserError: If neither `title` nor `definition` is given.
    """
    mode = _load_mode(args, args.failure_mode_id)
    requested = {"title": args.title, "definition": args.definition}
    changes = {field: value for field, value in requested.items() if value is not None}
    if not changes:
        raise SelfEvalsUserError("nothing to edit: pass --title and/or --definition")
    # Dict order keeps title ahead of definition, as listed above.
    for field, value in changes.items():
        setattr(mode, field, value)
    _save_mode(args, mode)
    print(f"edited {mode.id} ({mode.slug})")
    return 0