selfevals 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
- selfevals/__init__.py +19 -0
- selfevals/_errors.py +44 -0
- selfevals/_internal/__init__.py +0 -0
- selfevals/_internal/hashing.py +23 -0
- selfevals/_internal/ids.py +65 -0
- selfevals/_internal/time.py +17 -0
- selfevals/analysis/__init__.py +23 -0
- selfevals/analysis/bundle.py +162 -0
- selfevals/analysis/hypothesis.py +26 -0
- selfevals/analysis/ingest.py +185 -0
- selfevals/analysis/schemas.py +119 -0
- selfevals/analysis/staging.py +34 -0
- selfevals/api/__init__.py +24 -0
- selfevals/api/__main__.py +47 -0
- selfevals/api/app.py +351 -0
- selfevals/api/broker.py +210 -0
- selfevals/api/broker_bridge.py +29 -0
- selfevals/api/queries.py +447 -0
- selfevals/api/schemas.py +151 -0
- selfevals/api/sse.py +114 -0
- selfevals/cli/__init__.py +15 -0
- selfevals/cli/_friendly.py +180 -0
- selfevals/cli/_help.py +55 -0
- selfevals/cli/analyze_commands.py +169 -0
- selfevals/cli/commands.py +615 -0
- selfevals/cli/main.py +409 -0
- selfevals/decision/__init__.py +34 -0
- selfevals/decision/matrix.py +185 -0
- selfevals/examples/__init__.py +8 -0
- selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
- selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
- selfevals/examples/pingpong.py +21 -0
- selfevals/graders/__init__.py +46 -0
- selfevals/graders/base.py +54 -0
- selfevals/graders/calibration.py +145 -0
- selfevals/graders/deterministic.py +143 -0
- selfevals/graders/llm_judge.py +187 -0
- selfevals/graders/registry.py +66 -0
- selfevals/optimization/__init__.py +47 -0
- selfevals/optimization/aggregator.py +246 -0
- selfevals/optimization/loop.py +432 -0
- selfevals/optimization/proposers.py +202 -0
- selfevals/py.typed +0 -0
- selfevals/repo/__init__.py +28 -0
- selfevals/repo/loader.py +276 -0
- selfevals/reporter/__init__.py +21 -0
- selfevals/reporter/_metrics.py +114 -0
- selfevals/reporter/compare.py +221 -0
- selfevals/reporter/json_report.py +105 -0
- selfevals/reporter/markdown.py +232 -0
- selfevals/runner/__init__.py +42 -0
- selfevals/runner/adapters.py +268 -0
- selfevals/runner/executor.py +234 -0
- selfevals/runner/otlp_receiver.py +343 -0
- selfevals/runner/otlp_to_recorder.py +180 -0
- selfevals/runner/sandbox.py +46 -0
- selfevals/schemas/__init__.py +213 -0
- selfevals/schemas/_base.py +82 -0
- selfevals/schemas/annotation.py +55 -0
- selfevals/schemas/dataset.py +111 -0
- selfevals/schemas/enums.py +324 -0
- selfevals/schemas/eval_case.py +189 -0
- selfevals/schemas/experiment.py +367 -0
- selfevals/schemas/failure_mode.py +76 -0
- selfevals/schemas/fleet.py +111 -0
- selfevals/schemas/grader_card.py +112 -0
- selfevals/schemas/iteration.py +219 -0
- selfevals/schemas/registry.py +125 -0
- selfevals/schemas/tool.py +43 -0
- selfevals/schemas/trace.py +384 -0
- selfevals/schemas/workspace.py +69 -0
- selfevals/sdk/__init__.py +24 -0
- selfevals/sdk/auto_instrument.py +165 -0
- selfevals/sdk/context.py +45 -0
- selfevals/sdk/exporter.py +50 -0
- selfevals/sdk/facade.py +203 -0
- selfevals/skills/__init__.py +61 -0
- selfevals/storage/__init__.py +53 -0
- selfevals/storage/errors.py +66 -0
- selfevals/storage/filesystem.py +137 -0
- selfevals/storage/interface.py +135 -0
- selfevals/storage/migrations/__init__.py +80 -0
- selfevals/storage/migrations/m0001_initial.py +57 -0
- selfevals/storage/seed.py +199 -0
- selfevals/storage/sqlite.py +232 -0
- selfevals/trace/__init__.py +31 -0
- selfevals/trace/otel_importer.py +455 -0
- selfevals/trace/payload_router.py +106 -0
- selfevals/trace/recorder.py +540 -0
- selfevals/version.py +1 -0
- selfevals-0.2.2.dist-info/METADATA +283 -0
- selfevals-0.2.2.dist-info/RECORD +96 -0
- selfevals-0.2.2.dist-info/WHEEL +4 -0
- selfevals-0.2.2.dist-info/entry_points.txt +2 -0
- selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: error-analysis
|
|
3
|
+
description: Analyze an experiment's failed traces and grow its failure-mode taxonomy. Use when a selfevals experiment has staged error analysis (or a human asks "why is this experiment failing?"). Drives the pull → open coding → axial coding → push → promote cycle. selfevals owns the data and the contract; you provide the intelligence (the coding). selfevals never calls an LLM itself.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Error Analysis (open + axial coding)
|
|
7
|
+
|
|
8
|
+
You are the intelligence half of selfevals's error-analysis loop. selfevals has
|
|
9
|
+
already run the experiment, tagged the deterministic failures, and (if the YAML
|
|
10
|
+
opted in and the trigger fired) staged a bundle. Your job is to read the failed
|
|
11
|
+
traces and **grow the failure-mode taxonomy** so the next experiment can target
|
|
12
|
+
specific, named modes.
|
|
13
|
+
|
|
14
|
+
This is the established **open coding → axial coding** method (Hamel Husain &
|
|
15
|
+
Shreya Shankar's error analysis for LLM apps). Apply it faithfully — not an
|
|
16
|
+
ad-hoc clustering.
|
|
17
|
+
|
|
18
|
+
The hard boundary: **selfevals owns data + contract + persistence; you own the
|
|
19
|
+
coding.** You never edit the database directly — everything flows through
|
|
20
|
+
`analyze pull` / `analyze push`. selfevals never calls an LLM; you do all the
|
|
21
|
+
reading and judgement.
|
|
22
|
+
|
|
23
|
+
## 0. Preflight
|
|
24
|
+
|
|
25
|
+
- Confirm the `selfevals` CLI is available: `selfevals --help`. If the project
|
|
26
|
+
uses `uv`, prefix commands with `uv run`.
|
|
27
|
+
- Identify the target **workspace id** and **experiment id**. If the user gave
|
|
28
|
+
a spec path, the workspace is the spec's `workspace:` key; the experiment id
|
|
29
|
+
is printed when the experiment was created/run.
|
|
30
|
+
- You will need the db path the project uses (the CLI's `--db` flag, default per
|
|
31
|
+
the project). Reuse whatever the human/other commands already use.
|
|
32
|
+
|
|
33
|
+
## 1. Pull the bundle
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
selfevals --db <db> analyze pull <workspace_id> <experiment_id> > bundle.json
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
`bundle.json` contains:
|
|
40
|
+
- `taxonomy`: the **live failure modes** (id, slug, title, definition, status).
|
|
41
|
+
This is what you classify against. Treat OFFICIAL modes as the stable
|
|
42
|
+
vocabulary; CANDIDATE modes are proposals awaiting a human.
|
|
43
|
+
- `traces`: each failed trace with `grade` (label, score, any
|
|
44
|
+
`deterministic_modes` already tagged, optional `judge_reason`), the
|
|
45
|
+
`transcript` (the real conversation), and `first_error_span` (selfevals's
|
|
46
|
+
guess at where it first went wrong).
|
|
47
|
+
|
|
48
|
+
If `traces` is empty, the run was healthy or nothing was staged — report that
|
|
49
|
+
and stop. Don't invent failures.
|
|
50
|
+
|
|
51
|
+
## 2. Open coding — one note per first failure
|
|
52
|
+
|
|
53
|
+
For **each** failed trace, in order:
|
|
54
|
+
1. Read the `transcript` and `first_error_span`. Find the **first** thing that
|
|
55
|
+
went wrong (Hamel's rule: code the first failure, not the downstream cascade).
|
|
56
|
+
2. Write a single, concrete, one-line note describing *what* failed — behavioral,
|
|
57
|
+
not a fix. Good: "cited a price the catalog never contained." Bad: "should
|
|
58
|
+
validate prices" (that's a fix, not an observation).
|
|
59
|
+
3. If a `deterministic_modes` tag already fully explains the failure, you may
|
|
60
|
+
skip writing a new note — **unless** the residue suggests a deeper mode the
|
|
61
|
+
deterministic rule missed. Don't re-discover what's already tagged.
|
|
62
|
+
|
|
63
|
+
Keep notes verbatim-grounded: capture a short `quote` from the transcript that
|
|
64
|
+
evidences the failure. You'll attach it to the assignment.
|
|
65
|
+
|
|
66
|
+
## 3. Axial coding — classify against the LIVE taxonomy
|
|
67
|
+
|
|
68
|
+
This is the discipline that makes the taxonomy stable ("discover once, classify
|
|
69
|
+
thereafter"). For each note:
|
|
70
|
+
|
|
71
|
+
- **Does it match an existing mode's `definition`?** → assign that mode by
|
|
72
|
+
`mode_id`. Prefer an existing mode whenever the definition genuinely fits.
|
|
73
|
+
- **No existing mode fits?** → propose a `new_mode_slug` with a **testable
|
|
74
|
+
definition** (a sentence a different person could apply to a new trace and
|
|
75
|
+
agree on). Lower-case, snake_case slug.
|
|
76
|
+
- **Never rename or redefine an existing mode.** If an official mode's
|
|
77
|
+
definition is wrong, note it for the human — do not silently fork it.
|
|
78
|
+
|
|
79
|
+
Each assignment sets **exactly one** of `mode_id` *or* `new_mode_slug` (XOR). selfevals
|
|
80
|
+
rejects an assignment that sets both or neither.
|
|
81
|
+
|
|
82
|
+
## 4. Saturation check
|
|
83
|
+
|
|
84
|
+
Track new modes as you go. When ~20 consecutive traces produce **no** new mode,
|
|
85
|
+
you've reached saturation — the taxonomy now covers this run. Note it; you can
|
|
86
|
+
stop proposing and just classify the remainder.
|
|
87
|
+
|
|
88
|
+
## 5. Optional: hypotheses
|
|
89
|
+
|
|
90
|
+
For the dominant modes, you may add a `hypotheses` entry: a testable statement
|
|
91
|
+
("tightening the price-grounding instruction will reduce `invented_price`") with
|
|
92
|
+
optional `suggested_parameters`. selfevals stores these; the proposer consults
|
|
93
|
+
them next iteration. It does **not** run them automatically.
|
|
94
|
+
|
|
95
|
+
## 6. Push the result
|
|
96
|
+
|
|
97
|
+
Emit an `AnalysisResult` JSON and push it:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
selfevals --db <db> analyze push <workspace_id> <experiment_id> --by "agent:<your-name>" < result.json
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
`result.json` shape:
|
|
104
|
+
|
|
105
|
+
```json
|
|
106
|
+
{
|
|
107
|
+
"proposed_modes": [
|
|
108
|
+
{"slug": "invented_price", "title": "Invented price",
|
|
109
|
+
"definition": "Agent states a price not present in the catalog context."}
|
|
110
|
+
],
|
|
111
|
+
"assignments": [
|
|
112
|
+
{"trace_id": "trc_…", "mode_id": "fm_…", "quote": "…", "open_note": "…"},
|
|
113
|
+
{"trace_id": "trc_…", "new_mode_slug": "invented_price", "quote": "$499", "open_note": "…"}
|
|
114
|
+
],
|
|
115
|
+
"hypotheses": [
|
|
116
|
+
{"targets_mode_slug": "invented_price",
|
|
117
|
+
"statement": "Add an explicit 'only cite catalog prices' instruction.",
|
|
118
|
+
"suggested_parameters": {"prompt_section": "grounding"}}
|
|
119
|
+
]
|
|
120
|
+
}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
selfevals validates the whole result **before** writing (transactional intent),
|
|
124
|
+
enforces the XOR and classify-don't-rename invariants, creates candidate modes
|
|
125
|
+
idempotently (re-proposing an existing slug does not duplicate it), and stamps
|
|
126
|
+
`mode_id` onto each trace's grader results. It prints a summary.
|
|
127
|
+
|
|
128
|
+
## 7. Hand back to the human
|
|
129
|
+
|
|
130
|
+
Print which candidates are strongest — frequency (how many traces) plus your
|
|
131
|
+
confidence — so the human can batch-promote:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
selfevals --db <db> failuremode list <workspace_id> --status candidate
|
|
135
|
+
selfevals --db <db> failuremode promote <workspace_id> <fm_id>
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Promotion (candidate → official) is a **human gate** by design. Never promote on
|
|
139
|
+
your own. Your output is a recommendation, not a decision.
|
|
140
|
+
|
|
141
|
+
## What you must not do
|
|
142
|
+
|
|
143
|
+
- Do not call any database or storage API directly. Only `analyze pull/push`
|
|
144
|
+
and `failuremode *`.
|
|
145
|
+
- Do not rename, redefine, or merge existing modes (merging is a human
|
|
146
|
+
`failuremode merge`).
|
|
147
|
+
- Do not promote candidates.
|
|
148
|
+
- Do not invent failures for healthy traces or pad the taxonomy to look busy —
|
|
149
|
+
a smaller, sharper taxonomy is the goal.
|
selfevals/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""selfevals — self-improving evals framework for AI agents."""
|
|
2
|
+
|
|
3
|
+
from selfevals.sdk import (
|
|
4
|
+
InitResult,
|
|
5
|
+
SelfEvalsAlreadyInitialized,
|
|
6
|
+
init,
|
|
7
|
+
is_initialized,
|
|
8
|
+
shutdown,
|
|
9
|
+
)
|
|
10
|
+
from selfevals.version import __version__
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"InitResult",
|
|
14
|
+
"SelfEvalsAlreadyInitialized",
|
|
15
|
+
"__version__",
|
|
16
|
+
"init",
|
|
17
|
+
"is_initialized",
|
|
18
|
+
"shutdown",
|
|
19
|
+
]
|
selfevals/_errors.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Shared exception hierarchy.
|
|
2
|
+
|
|
3
|
+
Two-tier model:
|
|
4
|
+
|
|
5
|
+
- `SelfEvalsError` — base for all selfevals-raised exceptions. Catch
|
|
6
|
+
this at the outermost boundary to distinguish selfevals failures from
|
|
7
|
+
truly internal Python errors (e.g. an OS-level disk-full).
|
|
8
|
+
- `SelfEvalsUserError` — base for *user-correctable* failures: bad YAML,
|
|
9
|
+
missing dataset, unknown grader, unreachable HTTP endpoint, locked
|
|
10
|
+
database. These flow up to the CLI which prints a single-line
|
|
11
|
+
`error: ...` and exits 2 (no stacktrace).
|
|
12
|
+
|
|
13
|
+
Everything else (assertion violations, programmer bugs, unexpected
|
|
14
|
+
Pydantic shapes that escape our friendly wrappers) keeps its stack
|
|
15
|
+
trace and yields exit 1 from the CLI dispatcher.
|
|
16
|
+
|
|
17
|
+
The CLI's `CommandError` is kept as a thin alias of `SelfEvalsUserError`
|
|
18
|
+
for source compatibility with the rest of the package; both terms refer
|
|
19
|
+
to the same class so `except CommandError` and `except SelfEvalsUserError`
|
|
20
|
+
behave identically.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SelfEvalsError(Exception):
    """Base class of every exception selfevals raises on purpose."""


class SelfEvalsUserError(SelfEvalsError):
    """A failure the user can correct without reading a traceback.

    The message has to stand alone: it is printed verbatim as
    `error: <message>` with no further context. Name the file path and
    the offending field, and preferably give one concrete hint on how
    to fix the problem.

    Args:
        message: The self-contained description of what went wrong.
        hint: Optional remediation advice. When truthy it is appended to
            the message on its own line; it is always kept on `self.hint`.
    """

    def __init__(self, message: str, *, hint: str | None = None) -> None:
        self.hint = hint
        # An empty or missing hint leaves the message untouched.
        rendered = f"{message}\n hint: {hint}" if hint else message
        super().__init__(rendered)
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Content hashing for snapshot identity and pointer integrity."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def content_hash(payload: Any) -> str:
    """Hash a JSON-serializable payload into a stable `sha256:...` string.

    The payload is rendered as canonical JSON (sorted keys, compact
    separators) so that dict insertion order does not influence the result.
    Values the JSON encoder cannot handle natively are passed through `str`.
    """
    canonical = json.dumps(
        payload,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return "sha256:" + hasher.hexdigest()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def bytes_hash(data: bytes) -> str:
    """Hash raw bytes (e.g. a stored blob) into a `sha256:...` string."""
    digest = hashlib.sha256(data).hexdigest()
    return "sha256:" + digest
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""ULID generation (stdlib only).
|
|
2
|
+
|
|
3
|
+
A ULID is a 128-bit identifier: 48-bit big-endian millisecond timestamp +
|
|
4
|
+
80 bits of cryptographic randomness, encoded as 26 chars of Crockford Base32.
|
|
5
|
+
Lexicographically sortable by creation time.
|
|
6
|
+
|
|
7
|
+
Spec: https://github.com/ulid/spec
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
import secrets
|
|
14
|
+
import time
|
|
15
|
+
|
|
16
|
+
_ALPHABET = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"  # Crockford Base32 (no I L O U)
_ALPHABET_SET = frozenset(_ALPHABET)
_ULID_RE = re.compile(r"^[0-9A-HJKMNP-TV-Z]{26}$")


def new_ulid() -> str:
    """Generate a new ULID string (26 chars, uppercase Crockford Base32).

    The high 48 bits carry the current Unix time in milliseconds and the low
    80 bits come from `secrets`, so ids created later sort after earlier ones.
    """
    # Mask to 48 bits so the timestamp can never spill past the ULID layout.
    timestamp_ms = int(time.time() * 1000) & ((1 << 48) - 1)
    randomness = secrets.randbits(80)
    value = (timestamp_ms << 80) | randomness
    return _encode(value)


def _encode(value: int) -> str:
    """Render `value` as 26 Crockford Base32 characters, most significant first."""
    chars: list[str] = []
    for _ in range(26):
        # Peel off the lowest 5 bits each round; digits come out in reverse.
        chars.append(_ALPHABET[value & 0x1F])
        value >>= 5
    return "".join(reversed(chars))


def is_ulid(value: str) -> bool:
    """Return True if `value` is a syntactically valid ULID string.

    Uses `fullmatch` rather than `match`: the pattern's `$` anchor also
    matches just before a trailing newline, so `match` would wrongly accept
    a ULID followed by "\\n".
    """
    return _ULID_RE.fullmatch(value) is not None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def new_prefixed_id(prefix: str) -> str:
    """Build an identifier of the form `<prefix>_<ULID>` (e.g. `ws_01H...`).

    The prefix must consist of 2-6 lowercase ASCII letters; it is joined to
    a freshly generated ULID with an underscore.

    Raises:
        ValueError: If the prefix does not meet those requirements.
    """
    well_formed = (
        2 <= len(prefix) <= 6
        and prefix.isascii()
        and prefix.isalpha()
        and prefix.islower()
    )
    if not well_formed:
        raise ValueError(f"invalid id prefix: {prefix!r}")
    suffix = new_ulid()
    return f"{prefix}_{suffix}"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
_PREFIXED_RE = re.compile(r"^[a-z]{2,6}_[0-9A-HJKMNP-TV-Z]{26}$")


def is_prefixed_id(value: str, prefix: str | None = None) -> bool:
    """Return True if `value` has the shape `<prefix>_<ULID>`.

    Args:
        value: Candidate identifier.
        prefix: When given, `value` must additionally start with
            `<prefix>_`; when None, any 2-6 letter lowercase prefix is
            accepted.

    Uses `fullmatch` rather than `match`: the pattern's `$` anchor also
    matches just before a trailing newline, so `match` would wrongly accept
    an id followed by "\\n".
    """
    if _PREFIXED_RE.fullmatch(value) is None:
        return False
    if prefix is None:
        return True
    return value.startswith(f"{prefix}_")
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Timezone-aware UTC helpers. All timestamps in selfevals are tz-aware UTC."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def utc_now() -> datetime:
|
|
9
|
+
"""Return current time as a tz-aware UTC datetime."""
|
|
10
|
+
return datetime.now(UTC)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ensure_utc(dt: datetime) -> datetime:
|
|
14
|
+
"""Coerce a datetime to UTC; reject naive datetimes."""
|
|
15
|
+
if dt.tzinfo is None:
|
|
16
|
+
raise ValueError("naive datetime is not allowed; provide tz-aware UTC")
|
|
17
|
+
return dt.astimezone(UTC)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Error-analysis handshake: build bundles, ingest results.
|
|
2
|
+
|
|
3
|
+
selfevals owns the data, the contract, the persistence, and the verification.
|
|
4
|
+
The intelligence (open/axial coding) lives in an external coding agent that
|
|
5
|
+
honours the `AnalysisBundle` / `AnalysisResult` contract defined in `schemas`.
|
|
6
|
+
selfevals never calls an LLM here. See docs/spec/error_analysis_design.md.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from selfevals.analysis.bundle import build_bundle
|
|
12
|
+
from selfevals.analysis.ingest import IngestSummary, ingest_result
|
|
13
|
+
from selfevals.analysis.schemas import AnalysisBundle, AnalysisResult
|
|
14
|
+
from selfevals.analysis.staging import AnalysisStagingRecord
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"AnalysisBundle",
|
|
18
|
+
"AnalysisResult",
|
|
19
|
+
"AnalysisStagingRecord",
|
|
20
|
+
"IngestSummary",
|
|
21
|
+
"build_bundle",
|
|
22
|
+
"ingest_result",
|
|
23
|
+
]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Build an AnalysisBundle from a workspace's persisted traces.
|
|
2
|
+
|
|
3
|
+
`build_bundle` gathers the failed traces of an experiment (optionally one
|
|
4
|
+
iteration), projects each into the wire shape an external agent needs to do
|
|
5
|
+
open/axial coding, and attaches the live taxonomy the agent must classify
|
|
6
|
+
against. Pure read — it never mutates storage. See design §4.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
from selfevals.analysis.schemas import (
|
|
15
|
+
AnalysisBundle,
|
|
16
|
+
BundleErrorSpan,
|
|
17
|
+
BundleGrade,
|
|
18
|
+
BundleMessage,
|
|
19
|
+
BundleTrace,
|
|
20
|
+
TaxonomyEntry,
|
|
21
|
+
)
|
|
22
|
+
from selfevals.schemas.failure_mode import FailureMode
|
|
23
|
+
from selfevals.schemas.trace import ErrorSpan, LLMCallSpan, ToolCallSpan, Trace
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from selfevals.storage.sqlite import SQLiteStorage
|
|
27
|
+
|
|
28
|
+
# Labels that count as "needs coding". PASS/SKIPPED are dropped.
|
|
29
|
+
_FAILED_LABELS = {"fail", "error", "partial"}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _transcript(trace: Trace) -> list[BundleMessage]:
    """Rebuild the conversation from the messages stored inline on LLM spans.

    Each LLM span may carry lists of messages in its `provider_metadata`
    under `selfevals.messages_in` and `selfevals.messages_out`. Those lists
    are read in that key order for every LLM span, walking spans in trace
    order, and flattened into one list. Non-LLM spans, metadata values that
    are not lists, and entries that are not dicts with both `role` and
    `content` are ignored.
    """
    metadata_keys = ("selfevals.messages_in", "selfevals.messages_out")
    messages: list[BundleMessage] = []
    for span in trace.spans:
        if not isinstance(span, LLMCallSpan):
            continue
        for key in metadata_keys:
            entries = span.provider_metadata.get(key)
            if not isinstance(entries, list):
                continue
            messages.extend(
                BundleMessage(role=str(entry["role"]), content=str(entry["content"]))
                for entry in entries
                if isinstance(entry, dict) and "role" in entry and "content" in entry
            )
    return messages
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _first_error_span(trace: Trace) -> BundleErrorSpan | None:
    """Locate the first failure in the trace (Hamel's "code the first failure").

    Spans are scanned in trace order and the earliest one that is either an
    `ErrorSpan` or a `ToolCallSpan` with a truthy `error` is returned,
    whichever of the two comes first. Returns None when no such span exists.
    """
    for span in trace.spans:
        if isinstance(span, ErrorSpan):
            detail = span.message
        elif isinstance(span, ToolCallSpan) and span.error:
            detail = span.error
        else:
            continue
        return BundleErrorSpan(kind=str(span.kind), name=span.name, error=detail)
    return None
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _grade(trace: Trace) -> BundleGrade:
    """Fold all grader results of a trace into a single bundle grade.

    The most severe label wins (error > fail > partial > skipped > pass) and
    its score is reported alongside it; on a tie the later grader result
    takes over. Failure-mode tags from every grader result are collected,
    de-duplicated in first-seen order, and surfaced together with the first
    non-empty reason pointer so the agent can focus on the untagged residue.
    With no grader results the grade is "pass" with no score.
    """
    rank = {"pass": 0, "skipped": 1, "partial": 2, "fail": 3, "error": 4}
    worst_label = "pass"
    worst_score: float | None = None
    tagged: list[str] = []
    judge_reason: str | None = None
    for result in trace.grader_results:
        # Unknown labels rank as 0; `>=` lets a later equal-severity result win.
        if rank.get(result.label, 0) >= rank.get(worst_label, 0):
            worst_label, worst_score = result.label, result.score
        tagged.extend(result.failure_modes)
        # The judge reason is payload-routed; only the pointer is passed on
        # as a hint (resolving it is left to the object store).
        if judge_reason is None and result.reason_pointer:
            judge_reason = result.reason_pointer
    return BundleGrade(
        label=worst_label,
        score=worst_score,
        # dict.fromkeys drops repeats while keeping first-seen order.
        deterministic_modes=list(dict.fromkeys(tagged)),
        judge_reason=judge_reason,
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _is_failed(trace: Trace) -> bool:
    """Tell whether a trace needs coding.

    A trace counts as failed when its final status is anything other than
    "completed", or when at least one grader result carries a label listed
    in `_FAILED_LABELS`.
    """
    return trace.final_state.status != "completed" or any(
        result.label in _FAILED_LABELS for result in trace.grader_results
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def build_bundle(
    storage: SQLiteStorage,
    *,
    workspace_id: str,
    experiment_id: str,
    iteration: int | None = None,
    only_failed: bool = True,
) -> AnalysisBundle:
    """Assemble the analysis bundle for one experiment.

    Trace rows are selected from the `entities` table by workspace and by
    the experiment id (and, when given, the iteration) recorded under `run`
    in each trace's JSON payload.

    Args:
        storage: Storage whose connection is queried and whose workspace
            scope supplies the failure modes.
        workspace_id: Workspace the traces and failure modes belong to.
        experiment_id: Experiment whose traces are gathered.
        iteration: Restrict to a single iteration; None keeps all of them.
        only_failed: When True, traces that `_is_failed` rejects are left out.

    Returns:
        An `AnalysisBundle` holding the selected traces and every failure
        mode of the workspace whose status is not "retired".
    """
    where = [
        "workspace_id = ?",
        "entity_type = 'Trace'",
        "json_extract(payload, '$.run.experiment_id') = ?",
    ]
    bind: list[object] = [workspace_id, experiment_id]
    if iteration is not None:
        where.append("json_extract(payload, '$.run.iteration') = ?")
        bind.append(iteration)
    # Only fixed fragments are joined into the SQL text; every value is bound.
    query = f"SELECT payload FROM entities WHERE {' AND '.join(where)}"
    rows = storage.connection.execute(query, tuple(bind)).fetchall()

    # Validate every payload up front, before any projection happens.
    traces = [Trace.model_validate(json.loads(payload)) for (payload,) in rows]

    bundle_traces: list[BundleTrace] = [
        BundleTrace(
            trace_id=trace.id,
            run_id=trace.run.run_id,
            thread_id=trace.run.thread_id,
            eval_case_id=trace.run.eval_case_id,
            grade=_grade(trace),
            transcript=_transcript(trace),
            first_error_span=_first_error_span(trace),
        )
        for trace in traces
        if not only_failed or _is_failed(trace)
    ]

    taxonomy: list[TaxonomyEntry] = []
    with storage.open(workspace_id) as scope:
        for mode in scope.list_entities(FailureMode):
            if not isinstance(mode, FailureMode) or str(mode.status) == "retired":
                continue
            taxonomy.append(
                TaxonomyEntry(
                    id=mode.id,
                    slug=mode.slug,
                    title=mode.title,
                    definition=mode.definition,
                    status=str(mode.status),
                )
            )

    return AnalysisBundle(
        workspace_id=workspace_id,
        experiment_id=experiment_id,
        iteration=iteration,
        taxonomy=taxonomy,
        traces=bundle_traces,
    )
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""HypothesisRecord — a testable change targeting a failure mode.
|
|
2
|
+
|
|
3
|
+
Produced by error analysis (the `hypotheses` block of an AnalysisResult) and
|
|
4
|
+
stored as a workspace entity linked to the experiment. The proposer consults
|
|
5
|
+
these to target a specific mode in the next iteration; selfevals does not run
|
|
6
|
+
them automatically. See docs/spec/error_analysis_design.md §7.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, ClassVar
|
|
12
|
+
|
|
13
|
+
from pydantic import Field
|
|
14
|
+
|
|
15
|
+
from selfevals.schemas._base import BaseEntity, NonEmptyStr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class HypothesisRecord(BaseEntity):
    # One testable change aimed at a single failure mode, taken from the
    # `hypotheses` block of an AnalysisResult and linked to its experiment.

    # NOTE(review): presumably read by BaseEntity to build ids such as
    # `hyp_<ULID>` — confirm in selfevals/schemas/_base.py.
    _id_prefix: ClassVar[str] = "hyp"

    # Experiment this hypothesis is linked to.
    experiment_id: NonEmptyStr
    # Slug of the failure mode the change is meant to address.
    targets_mode_slug: NonEmptyStr
    # The testable statement itself.
    statement: NonEmptyStr
    # Optional parameter suggestions; defaults to a fresh empty dict.
    suggested_parameters: dict[str, Any] = Field(default_factory=dict)
    # None until used; otherwise a non-negative iteration number (ge=0).
    consumed_by_iteration: int | None = Field(default=None, ge=0)
    """Set once a proposer has used this hypothesis, so it isn't re-applied."""
|