selfevals 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
  2. selfevals/__init__.py +19 -0
  3. selfevals/_errors.py +44 -0
  4. selfevals/_internal/__init__.py +0 -0
  5. selfevals/_internal/hashing.py +23 -0
  6. selfevals/_internal/ids.py +65 -0
  7. selfevals/_internal/time.py +17 -0
  8. selfevals/analysis/__init__.py +23 -0
  9. selfevals/analysis/bundle.py +162 -0
  10. selfevals/analysis/hypothesis.py +26 -0
  11. selfevals/analysis/ingest.py +185 -0
  12. selfevals/analysis/schemas.py +119 -0
  13. selfevals/analysis/staging.py +34 -0
  14. selfevals/api/__init__.py +24 -0
  15. selfevals/api/__main__.py +47 -0
  16. selfevals/api/app.py +351 -0
  17. selfevals/api/broker.py +210 -0
  18. selfevals/api/broker_bridge.py +29 -0
  19. selfevals/api/queries.py +447 -0
  20. selfevals/api/schemas.py +151 -0
  21. selfevals/api/sse.py +114 -0
  22. selfevals/cli/__init__.py +15 -0
  23. selfevals/cli/_friendly.py +180 -0
  24. selfevals/cli/_help.py +55 -0
  25. selfevals/cli/analyze_commands.py +169 -0
  26. selfevals/cli/commands.py +615 -0
  27. selfevals/cli/main.py +409 -0
  28. selfevals/decision/__init__.py +34 -0
  29. selfevals/decision/matrix.py +185 -0
  30. selfevals/examples/__init__.py +8 -0
  31. selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
  32. selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
  33. selfevals/examples/pingpong.py +21 -0
  34. selfevals/graders/__init__.py +46 -0
  35. selfevals/graders/base.py +54 -0
  36. selfevals/graders/calibration.py +145 -0
  37. selfevals/graders/deterministic.py +143 -0
  38. selfevals/graders/llm_judge.py +187 -0
  39. selfevals/graders/registry.py +66 -0
  40. selfevals/optimization/__init__.py +47 -0
  41. selfevals/optimization/aggregator.py +246 -0
  42. selfevals/optimization/loop.py +432 -0
  43. selfevals/optimization/proposers.py +202 -0
  44. selfevals/py.typed +0 -0
  45. selfevals/repo/__init__.py +28 -0
  46. selfevals/repo/loader.py +276 -0
  47. selfevals/reporter/__init__.py +21 -0
  48. selfevals/reporter/_metrics.py +114 -0
  49. selfevals/reporter/compare.py +221 -0
  50. selfevals/reporter/json_report.py +105 -0
  51. selfevals/reporter/markdown.py +232 -0
  52. selfevals/runner/__init__.py +42 -0
  53. selfevals/runner/adapters.py +268 -0
  54. selfevals/runner/executor.py +234 -0
  55. selfevals/runner/otlp_receiver.py +343 -0
  56. selfevals/runner/otlp_to_recorder.py +180 -0
  57. selfevals/runner/sandbox.py +46 -0
  58. selfevals/schemas/__init__.py +213 -0
  59. selfevals/schemas/_base.py +82 -0
  60. selfevals/schemas/annotation.py +55 -0
  61. selfevals/schemas/dataset.py +111 -0
  62. selfevals/schemas/enums.py +324 -0
  63. selfevals/schemas/eval_case.py +189 -0
  64. selfevals/schemas/experiment.py +367 -0
  65. selfevals/schemas/failure_mode.py +76 -0
  66. selfevals/schemas/fleet.py +111 -0
  67. selfevals/schemas/grader_card.py +112 -0
  68. selfevals/schemas/iteration.py +219 -0
  69. selfevals/schemas/registry.py +125 -0
  70. selfevals/schemas/tool.py +43 -0
  71. selfevals/schemas/trace.py +384 -0
  72. selfevals/schemas/workspace.py +69 -0
  73. selfevals/sdk/__init__.py +24 -0
  74. selfevals/sdk/auto_instrument.py +165 -0
  75. selfevals/sdk/context.py +45 -0
  76. selfevals/sdk/exporter.py +50 -0
  77. selfevals/sdk/facade.py +203 -0
  78. selfevals/skills/__init__.py +61 -0
  79. selfevals/storage/__init__.py +53 -0
  80. selfevals/storage/errors.py +66 -0
  81. selfevals/storage/filesystem.py +137 -0
  82. selfevals/storage/interface.py +135 -0
  83. selfevals/storage/migrations/__init__.py +80 -0
  84. selfevals/storage/migrations/m0001_initial.py +57 -0
  85. selfevals/storage/seed.py +199 -0
  86. selfevals/storage/sqlite.py +232 -0
  87. selfevals/trace/__init__.py +31 -0
  88. selfevals/trace/otel_importer.py +455 -0
  89. selfevals/trace/payload_router.py +106 -0
  90. selfevals/trace/recorder.py +540 -0
  91. selfevals/version.py +1 -0
  92. selfevals-0.2.2.dist-info/METADATA +283 -0
  93. selfevals-0.2.2.dist-info/RECORD +96 -0
  94. selfevals-0.2.2.dist-info/WHEEL +4 -0
  95. selfevals-0.2.2.dist-info/entry_points.txt +2 -0
  96. selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
@@ -0,0 +1,149 @@
1
+ ---
2
+ name: error-analysis
3
+ description: Analyze an experiment's failed traces and grow its failure-mode taxonomy. Use when a selfevals experiment has staged error analysis (or a human asks "why is this experiment failing?"). Drives the pull → open coding → axial coding → push → promote cycle. selfevals owns the data and the contract and never calls an LLM itself; you provide the intelligence (the coding).
4
+ ---
5
+
6
+ # Error Analysis (open + axial coding)
7
+
8
+ You are the intelligence half of selfevals's error-analysis loop. selfevals has
9
+ already run the experiment, tagged the deterministic failures, and (if the YAML
10
+ opted in and the trigger fired) staged a bundle. Your job is to read the failed
11
+ traces and **grow the failure-mode taxonomy** so the next experiment can target
12
+ specific, named modes.
13
+
14
+ This is the established **open coding → axial coding** method (Hamel Husain &
15
+ Shreya Shankar's error analysis for LLM apps). Apply it faithfully — not an
16
+ ad-hoc clustering.
17
+
18
+ The hard boundary: **selfevals owns data + contract + persistence; you own the
19
+ coding.** You never edit the database directly — everything flows through
20
+ `analyze pull` / `analyze push`. selfevals never calls an LLM; you do all the
21
+ reading and judgement.
22
+
23
+ ## 0. Preflight
24
+
25
+ - Confirm the `selfevals` CLI is available: `selfevals --help`. If the project
26
+ uses `uv`, prefix commands with `uv run`.
27
+ - Identify the target **workspace id** and **experiment id**. If the user gave
28
+ a spec path, the workspace is the spec's `workspace:` key; the experiment id
29
+ is printed when the experiment was created/run.
30
+ - You will need the database path the project uses (passed via the CLI's `--db`
31
+ flag; the default is project-specific). Reuse whatever path the human or other commands already use.
32
+
33
+ ## 1. Pull the bundle
34
+
35
+ ```bash
36
+ selfevals --db <db> analyze pull <workspace_id> <experiment_id> > bundle.json
37
+ ```
38
+
39
+ `bundle.json` contains:
40
+ - `taxonomy`: the **live failure modes** (id, slug, title, definition, status).
41
+ This is what you classify against. Treat OFFICIAL modes as the stable
42
+ vocabulary; CANDIDATE modes are proposals awaiting a human.
43
+ - `traces`: each failed trace with `grade` (label, score, any
44
+ `deterministic_modes` already tagged, optional `judge_reason`), the
45
+ `transcript` (the real conversation), and `first_error_span` (selfevals's
46
+ guess at where it first went wrong).
47
+
48
+ If `traces` is empty, the run was healthy or nothing was staged — report that
49
+ and stop. Don't invent failures.
50
+
51
+ ## 2. Open coding — one note per first failure
52
+
53
+ For **each** failed trace, in order:
54
+ 1. Read the `transcript` and `first_error_span`. Find the **first** thing that
55
+ went wrong (Hamel's rule: code the first failure, not the downstream cascade).
56
+ 2. Write a single, concrete, one-line note describing *what* failed — behavioral,
57
+ not a fix. Good: "cited a price the catalog never contained." Bad: "should
58
+ validate prices" (that's a fix, not an observation).
59
+ 3. If a `deterministic_modes` tag already fully explains the failure, you may
60
+ skip writing a new note — **unless** the residue suggests a deeper mode the
61
+ deterministic rule missed. Don't re-discover what's already tagged.
62
+
63
+ Keep notes verbatim-grounded: capture a short `quote` from the transcript that
64
+ evidences the failure. You'll attach it to the assignment.
65
+
66
+ ## 3. Axial coding — classify against the LIVE taxonomy
67
+
68
+ This is the discipline that makes the taxonomy stable ("discover once, classify
69
+ thereafter"). For each note:
70
+
71
+ - **Does it match an existing mode's `definition`?** → assign that mode by
72
+ `mode_id`. Prefer an existing mode whenever the definition genuinely fits.
73
+ - **No existing mode fits?** → propose a `new_mode_slug` with a **testable
74
+ definition** (a sentence a different person could apply to a new trace and
75
+ agree on). Lower-case, snake_case slug.
76
+ - **Never rename or redefine an existing mode.** If an official mode's
77
+ definition is wrong, note it for the human — do not silently fork it.
78
+
79
+ Each trace gets **exactly one** of `mode_id` *or* `new_mode_slug` (XOR). selfevals
80
+ rejects an assignment that sets both or neither.
81
+
82
+ ## 4. Saturation check
83
+
84
+ Track new modes as you go. When ~20 consecutive traces produce **no** new mode,
85
+ you've reached saturation — the taxonomy now covers this run. Note it; you can
86
+ stop proposing and just classify the remainder.
87
+
88
+ ## 5. Optional: hypotheses
89
+
90
+ For the dominant modes, you may add a `hypotheses` entry: a testable statement
91
+ ("tightening the price-grounding instruction will reduce `invented_price`") with
92
+ optional `suggested_parameters`. selfevals stores these; the proposer consults
93
+ them next iteration. It does **not** run them automatically.
94
+
95
+ ## 6. Push the result
96
+
97
+ Emit an `AnalysisResult` JSON and push it:
98
+
99
+ ```bash
100
+ selfevals --db <db> analyze push <workspace_id> <experiment_id> --by "agent:<your-name>" < result.json
101
+ ```
102
+
103
+ `result.json` shape:
104
+
105
+ ```json
106
+ {
107
+ "proposed_modes": [
108
+ {"slug": "invented_price", "title": "Invented price",
109
+ "definition": "Agent states a price not present in the catalog context."}
110
+ ],
111
+ "assignments": [
112
+ {"trace_id": "trc_…", "mode_id": "fm_…", "quote": "…", "open_note": "…"},
113
+ {"trace_id": "trc_…", "new_mode_slug": "invented_price", "quote": "$499", "open_note": "…"}
114
+ ],
115
+ "hypotheses": [
116
+ {"targets_mode_slug": "invented_price",
117
+ "statement": "Add an explicit 'only cite catalog prices' instruction.",
118
+ "suggested_parameters": {"prompt_section": "grounding"}}
119
+ ]
120
+ }
121
+ ```
122
+
123
+ selfevals validates the whole result **before** writing (transactional intent),
124
+ enforces the XOR and classify-don't-rename invariants, creates candidate modes
125
+ idempotently (re-proposing an existing slug does not duplicate it), and stamps
126
+ `mode_id` onto each trace's grader results. It prints a summary.
127
+
128
+ ## 7. Hand back to the human
129
+
130
+ Print which candidates are strongest — frequency (how many traces) plus your
131
+ confidence — so the human can batch-promote:
132
+
133
+ ```bash
134
+ selfevals --db <db> failuremode list <workspace_id> --status candidate
135
+ selfevals --db <db> failuremode promote <workspace_id> <fm_id>
136
+ ```
137
+
138
+ Promotion (candidate → official) is a **human gate** by design. Never promote on
139
+ your own. Your output is a recommendation, not a decision.
140
+
141
+ ## What you must not do
142
+
143
+ - Do not call any database or storage API directly. Only `analyze pull/push`
144
+ and `failuremode *`.
145
+ - Do not rename, redefine, or merge existing modes (merging is a human
146
+ `failuremode merge`).
147
+ - Do not promote candidates.
148
+ - Do not invent failures for healthy traces or pad the taxonomy to look busy —
149
+ a smaller, sharper taxonomy is the goal.
selfevals/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ """selfevals — self-improving evals framework for AI agents."""
2
+
3
+ from selfevals.sdk import (
4
+ InitResult,
5
+ SelfEvalsAlreadyInitialized,
6
+ init,
7
+ is_initialized,
8
+ shutdown,
9
+ )
10
+ from selfevals.version import __version__
11
+
12
+ __all__ = [
13
+ "InitResult",
14
+ "SelfEvalsAlreadyInitialized",
15
+ "__version__",
16
+ "init",
17
+ "is_initialized",
18
+ "shutdown",
19
+ ]
selfevals/_errors.py ADDED
@@ -0,0 +1,44 @@
1
+ """Shared exception hierarchy.
2
+
3
+ Two-tier model:
4
+
5
+ - `SelfEvalsError` — base for all selfevals-raised exceptions. Catch
6
+ this at the outermost boundary to distinguish selfevals failures from
7
+ truly internal Python errors (e.g. an OS-level disk-full).
8
+ - `SelfEvalsUserError` — base for *user-correctable* failures: bad YAML,
9
+ missing dataset, unknown grader, unreachable HTTP endpoint, locked
10
+ database. These flow up to the CLI which prints a single-line
11
+ `error: ...` and exits 2 (no stacktrace).
12
+
13
+ Everything else (assertion violations, programmer bugs, unexpected
14
+ Pydantic shapes that escape our friendly wrappers) keeps its stack
15
+ trace and yields exit 1 from the CLI dispatcher.
16
+
17
+ The CLI's `CommandError` is kept as a thin alias of `SelfEvalsUserError`
18
+ for source compatibility with the rest of the package; both terms refer
19
+ to the same class so `except CommandError` and `except SelfEvalsUserError`
20
+ behave identically.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+
26
class SelfEvalsError(Exception):
    """Base class that every selfevals-raised exception derives from."""


class SelfEvalsUserError(SelfEvalsError):
    """A failure the user can correct without needing a traceback.

    The message is shown verbatim as `error: <message>` with no further
    context, so it has to stand on its own: name the file path and the
    offending field, and ideally give one concrete suggestion for a fix.
    """

    def __init__(self, message: str, *, hint: str | None = None) -> None:
        """Store `hint` and build the final exception message.

        A truthy hint is appended to the message on its own line; a missing
        or empty hint leaves the message untouched.
        """
        self.hint = hint
        rendered = f"{message}\n hint: {hint}" if hint else message
        super().__init__(rendered)
File without changes
@@ -0,0 +1,23 @@
1
+ """Content hashing for snapshot identity and pointer integrity."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ from typing import Any
8
+
9
+
10
def content_hash(payload: Any) -> str:
    """Compute a reproducible `sha256:...` digest for a JSON-serializable payload.

    The payload is dumped with sorted keys and compact separators, so two
    dicts with the same content hash identically whatever their insertion
    order. Values `json` cannot encode natively are passed through `str`.
    """
    canonical = json.dumps(
        payload,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    )
    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return "sha256:" + hasher.hexdigest()
19
+
20
+
21
def bytes_hash(data: bytes) -> str:
    """Compute the `sha256:...` digest of raw bytes (used for stored blobs)."""
    digest = hashlib.sha256(data).hexdigest()
    return "sha256:" + digest
@@ -0,0 +1,65 @@
1
+ """ULID generation (stdlib only).
2
+
3
+ A ULID is a 128-bit identifier: 48-bit big-endian millisecond timestamp +
4
+ 80 bits of cryptographic randomness, encoded as 26 chars of Crockford Base32.
5
+ Lexicographically sortable by creation time.
6
+
7
+ Spec: https://github.com/ulid/spec
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ import secrets
14
+ import time
15
+
16
+ _ALPHABET = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" # Crockford Base32 (no I L O U)
17
+ _ALPHABET_SET = frozenset(_ALPHABET)
18
+ _ULID_RE = re.compile(r"^[0-9A-HJKMNP-TV-Z]{26}$")
19
+
20
+
21
def new_ulid() -> str:
    """Create a fresh ULID string (26 chars, uppercase Crockford Base32).

    The high 48 bits carry the current wall-clock time in milliseconds and
    the low 80 bits come from `secrets.randbits`.
    """
    millis = int(time.time() * 1000)
    timestamp_bits = millis & 0xFFFF_FFFF_FFFF  # keep only the low 48 bits
    entropy = secrets.randbits(80)
    return _encode((timestamp_bits << 80) | entropy)
27
+
28
+
29
def _encode(value: int) -> str:
    """Render `value` as 26 Crockford Base32 characters, most significant first.

    Each character encodes one 5-bit group; the groups are read from bit
    offset 125 down to 0, which is the order the 26 characters appear in.
    """
    return "".join(_ALPHABET[(value >> shift) & 0x1F] for shift in range(125, -1, -5))
35
+
36
+
37
def is_ulid(value: str) -> bool:
    """Return True if `value` is a syntactically valid ULID string.

    The whole string must consist of exactly 26 Crockford Base32 characters.
    """
    # fullmatch rather than match: the pattern's `$` anchor also matches just
    # before a trailing newline, so match() would accept "<ulid>\n".
    return _ULID_RE.fullmatch(value) is not None
40
+
41
+
42
def new_prefixed_id(prefix: str) -> str:
    """Create a prefixed identifier such as `ws_01H...`.

    The prefix has to be 2-6 lowercase ASCII letters; it is joined to a new
    ULID with an underscore.

    Raises:
        ValueError: if the prefix does not satisfy those rules.
    """
    well_formed = (
        2 <= len(prefix) <= 6
        and prefix.isascii()
        and prefix.isalpha()
        and prefix.islower()
    )
    if not well_formed:
        raise ValueError(f"invalid id prefix: {prefix!r}")
    return "_".join((prefix, new_ulid()))
55
+
56
+
57
_PREFIXED_RE = re.compile(r"^[a-z]{2,6}_[0-9A-HJKMNP-TV-Z]{26}$")


def is_prefixed_id(value: str, prefix: str | None = None) -> bool:
    """Return True if `value` is a well-formed prefixed id.

    A prefixed id is 2-6 lowercase ASCII letters, an underscore, and 26
    Crockford Base32 characters. When `prefix` is given, the id must also
    start with `<prefix>_`.
    """
    # fullmatch rather than match: the pattern's `$` anchor also matches just
    # before a trailing newline, so match() would accept "ws_<ulid>\n".
    if _PREFIXED_RE.fullmatch(value) is None:
        return False
    if prefix is None:
        return True
    return value.startswith(f"{prefix}_")
@@ -0,0 +1,17 @@
1
+ """Timezone-aware UTC helpers. All timestamps in selfevals are tz-aware UTC."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import UTC, datetime
6
+
7
+
8
+ def utc_now() -> datetime:
9
+ """Return current time as a tz-aware UTC datetime."""
10
+ return datetime.now(UTC)
11
+
12
+
13
+ def ensure_utc(dt: datetime) -> datetime:
14
+ """Coerce a datetime to UTC; reject naive datetimes."""
15
+ if dt.tzinfo is None:
16
+ raise ValueError("naive datetime is not allowed; provide tz-aware UTC")
17
+ return dt.astimezone(UTC)
@@ -0,0 +1,23 @@
1
+ """Error-analysis handshake: build bundles, ingest results.
2
+
3
+ selfevals owns the data, the contract, the persistence, and the verification.
4
+ The intelligence (open/axial coding) lives in an external coding agent that
5
+ honours the `AnalysisBundle` / `AnalysisResult` contract defined in `schemas`.
6
+ selfevals never calls an LLM here. See docs/spec/error_analysis_design.md.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from selfevals.analysis.bundle import build_bundle
12
+ from selfevals.analysis.ingest import IngestSummary, ingest_result
13
+ from selfevals.analysis.schemas import AnalysisBundle, AnalysisResult
14
+ from selfevals.analysis.staging import AnalysisStagingRecord
15
+
16
+ __all__ = [
17
+ "AnalysisBundle",
18
+ "AnalysisResult",
19
+ "AnalysisStagingRecord",
20
+ "IngestSummary",
21
+ "build_bundle",
22
+ "ingest_result",
23
+ ]
@@ -0,0 +1,162 @@
1
+ """Build an AnalysisBundle from a workspace's persisted traces.
2
+
3
+ `build_bundle` gathers the failed traces of an experiment (optionally one
4
+ iteration), projects each into the wire shape an external agent needs to do
5
+ open/axial coding, and attaches the live taxonomy the agent must classify
6
+ against. Pure read — it never mutates storage. See design §4.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from typing import TYPE_CHECKING
13
+
14
+ from selfevals.analysis.schemas import (
15
+ AnalysisBundle,
16
+ BundleErrorSpan,
17
+ BundleGrade,
18
+ BundleMessage,
19
+ BundleTrace,
20
+ TaxonomyEntry,
21
+ )
22
+ from selfevals.schemas.failure_mode import FailureMode
23
+ from selfevals.schemas.trace import ErrorSpan, LLMCallSpan, ToolCallSpan, Trace
24
+
25
+ if TYPE_CHECKING:
26
+ from selfevals.storage.sqlite import SQLiteStorage
27
+
28
+ # Labels that count as "needs coding". PASS/SKIPPED are dropped.
29
+ _FAILED_LABELS = {"fail", "error", "partial"}
30
+
31
+
32
def _transcript(trace: Trace) -> list[BundleMessage]:
    """Rebuild the conversation from the inline OTel-imported message lists.

    Each LLM span may carry reconstructed messages in its provider_metadata
    under `selfevals.messages_in` and `selfevals.messages_out`. They are
    gathered span by span (inputs before outputs), so the agent reads actual
    message content rather than pointers. Entries that are not dicts holding
    both `role` and `content` are ignored.
    """
    metadata_keys = ("selfevals.messages_in", "selfevals.messages_out")
    messages: list[BundleMessage] = []
    for span in trace.spans:
        if not isinstance(span, LLMCallSpan):
            continue
        for key in metadata_keys:
            stored = span.provider_metadata.get(key)
            if not isinstance(stored, list):
                continue
            messages.extend(
                BundleMessage(role=str(entry["role"]), content=str(entry["content"]))
                for entry in stored
                if isinstance(entry, dict) and "role" in entry and "content" in entry
            )
    return messages
52
+
53
+
54
def _first_error_span(trace: Trace) -> BundleErrorSpan | None:
    """Return the earliest failing span of the trace, or None if there is none.

    Spans are scanned in order, and the first one that is either an
    `ErrorSpan` or a `ToolCallSpan` with a truthy `error` is reported. An
    errored tool call that precedes an explicit ErrorSpan is therefore the
    one returned — this is Hamel's "code the first failure" rule.
    """
    for span in trace.spans:
        if isinstance(span, ErrorSpan):
            detail = span.message
        elif isinstance(span, ToolCallSpan) and span.error:
            detail = span.error
        else:
            continue
        return BundleErrorSpan(kind=str(span.kind), name=span.name, error=detail)
    return None
65
+
66
+
67
def _grade(trace: Trace) -> BundleGrade:
    """Fold all grader results of a trace into a single bundle grade.

    The most severe label wins (a later result replaces an earlier one of
    equal severity) and its score is reported with it. Failure-mode tags from
    every grader result are collected without duplicates, in first-seen
    order, and the first non-empty reason pointer is passed through, so the
    agent can concentrate on what is still untagged.
    """
    rank = {"error": 4, "fail": 3, "partial": 2, "skipped": 1, "pass": 0}
    worst_label = "pass"
    worst_score: float | None = None
    # An insertion-ordered dict doubles as an ordered set of mode tags.
    tagged: dict[str, None] = {}
    pointer: str | None = None
    for result in trace.grader_results:
        if rank.get(result.label, 0) >= rank.get(worst_label, 0):
            worst_label, worst_score = result.label, result.score
        for mode in result.failure_modes:
            tagged.setdefault(mode, None)
        # The judge reason is payload-routed: only the pointer is forwarded as
        # a hint; resolving it is left to the object store.
        if pointer is None and result.reason_pointer:
            pointer = result.reason_pointer
    return BundleGrade(
        label=worst_label,
        score=worst_score,
        deterministic_modes=list(tagged),
        judge_reason=pointer,
    )
95
+
96
+
97
def _is_failed(trace: Trace) -> bool:
    """Tell whether a trace needs coding.

    A trace counts as failed when its final status is anything other than
    "completed", or when at least one grader result carries a label listed
    in `_FAILED_LABELS`.
    """
    if trace.final_state.status != "completed":
        return True
    for result in trace.grader_results:
        if result.label in _FAILED_LABELS:
            return True
    return False
101
+
102
+
103
def build_bundle(
    storage: SQLiteStorage,
    *,
    workspace_id: str,
    experiment_id: str,
    iteration: int | None = None,
    only_failed: bool = True,
) -> AnalysisBundle:
    """Assemble the analysis bundle for one experiment.

    Trace rows are selected from the `entities` table by workspace and by
    the experiment id (and, when given, the iteration) recorded in the run
    metadata of each trace payload. With `only_failed` set, traces that
    `_is_failed` does not flag are dropped. The taxonomy attached to the
    bundle contains every FailureMode of the workspace whose status is not
    "retired".
    """
    filters = [
        "workspace_id = ?",
        "entity_type = 'Trace'",
        "json_extract(payload, '$.run.experiment_id') = ?",
    ]
    bindings: list[object] = [workspace_id, experiment_id]
    if iteration is not None:
        filters.append("json_extract(payload, '$.run.iteration') = ?")
        bindings.append(iteration)
    query = "SELECT payload FROM entities WHERE " + " AND ".join(filters)
    rows = storage.connection.execute(query, tuple(bindings)).fetchall()

    # Validate every payload up front, before any projection work starts.
    traces = [Trace.model_validate(json.loads(raw)) for (raw,) in rows]

    bundle_traces: list[BundleTrace] = [
        BundleTrace(
            trace_id=trace.id,
            run_id=trace.run.run_id,
            thread_id=trace.run.thread_id,
            eval_case_id=trace.run.eval_case_id,
            grade=_grade(trace),
            transcript=_transcript(trace),
            first_error_span=_first_error_span(trace),
        )
        for trace in traces
        if not only_failed or _is_failed(trace)
    ]

    taxonomy: list[TaxonomyEntry] = []
    with storage.open(workspace_id) as scope:
        for mode in scope.list_entities(FailureMode):
            if not isinstance(mode, FailureMode) or str(mode.status) == "retired":
                continue
            taxonomy.append(
                TaxonomyEntry(
                    id=mode.id,
                    slug=mode.slug,
                    title=mode.title,
                    definition=mode.definition,
                    status=str(mode.status),
                )
            )

    return AnalysisBundle(
        workspace_id=workspace_id,
        experiment_id=experiment_id,
        iteration=iteration,
        taxonomy=taxonomy,
        traces=bundle_traces,
    )
@@ -0,0 +1,26 @@
1
+ """HypothesisRecord — a testable change targeting a failure mode.
2
+
3
+ Produced by error analysis (the `hypotheses` block of an AnalysisResult) and
4
+ stored as a workspace entity linked to the experiment. The proposer consults
5
+ these to target a specific mode in the next iteration; selfevals does not run
6
+ them automatically. See docs/spec/error_analysis_design.md §7.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, ClassVar
12
+
13
+ from pydantic import Field
14
+
15
+ from selfevals.schemas._base import BaseEntity, NonEmptyStr
16
+
17
+
18
class HypothesisRecord(BaseEntity):
    # A testable change aimed at one failure mode, stored per experiment.
    _id_prefix: ClassVar[str] = "hyp"

    # Experiment the hypothesis is linked to.
    experiment_id: NonEmptyStr
    # Slug of the failure mode the hypothesis targets.
    targets_mode_slug: NonEmptyStr
    # The testable statement itself.
    statement: NonEmptyStr
    # Optional parameter suggestions; empty when none were given.
    suggested_parameters: dict[str, Any] = Field(default_factory=dict)
    consumed_by_iteration: int | None = Field(None, ge=0)
    """Set once a proposer has used this hypothesis, so it isn't re-applied."""