omegaprompt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ """omegaprompt - Calibration discipline for Claude API prompts.
2
+
3
+ Sensitivity-driven coordinate descent and walk-forward validation, ported
4
+ from the omega-lock parameter-calibration framework to the prompt-engineering
5
+ setting. Treats prompt parameters (system variant, few-shot count, effort
6
+ level, max_tokens bucket, thinking on/off) as a calibratable parameter
7
+ space; measures stress on each axis; runs grid search in the top-K unlock
8
+ subspace; validates on a held-out walk-forward slice. LLM-as-judge scoring
9
+ with user-supplied rubrics; hard gates on refusal / format / safety
10
+ collapse the fitness to zero, so overfitting a single metric does not pass.
11
+
12
+ Public API:
13
+ from omegaprompt import (
14
+ PromptTarget, PromptSpace, ParamVariants,
15
+ Dataset, DatasetItem,
16
+ JudgeRubric, Dimension, HardGate, JudgeResult,
17
+ CompositeFitness, CalibrationOutcome,
18
+ )
19
+
20
+ Depends on:
21
+ omega-lock (>=0.1.4) - provides CalibrableTarget, run_p1, stress,
22
+ walk-forward, kill criteria, and benchmark.
23
+ anthropic (>=0.40.0) - Claude API SDK.
24
+ """
25
+
26
+ from omegaprompt.dataset import Dataset, DatasetItem
27
+ from omegaprompt.fitness import CompositeFitness
28
+ from omegaprompt.judge import JudgeRubric, Dimension, HardGate, JudgeResult
29
+ from omegaprompt.schema import (
30
+ ParamVariants,
31
+ PromptSpace,
32
+ CalibrationOutcome,
33
+ )
34
+ from omegaprompt.target import PromptTarget
35
+
36
+ __version__ = "0.1.0"
37
+
38
+ __all__ = [
39
+ "PromptTarget",
40
+ "PromptSpace",
41
+ "ParamVariants",
42
+ "Dataset",
43
+ "DatasetItem",
44
+ "JudgeRubric",
45
+ "Dimension",
46
+ "HardGate",
47
+ "JudgeResult",
48
+ "CompositeFitness",
49
+ "CalibrationOutcome",
50
+ "__version__",
51
+ ]
@@ -0,0 +1,7 @@
1
+ """Module entry point: ``python -m omegaprompt``."""
2
+
3
+ from omegaprompt.cli import app
4
+
5
+
6
+ if __name__ == "__main__":
7
+ app()
omegaprompt/api.py ADDED
@@ -0,0 +1,192 @@
1
+ """Anthropic API wrappers for both target and judge calls.
2
+
3
+ Two separate client boundaries:
4
+
5
+ - ``call_target(...)`` - issues the user's prompt to the target model and
6
+ returns the raw response text. No schema enforcement; the target is
7
+ allowed to respond however it wants (that is what the judge grades).
8
+ - ``call_judge(...)`` - issues the rubric + response to the judge model
9
+ via ``messages.parse(output_format=JudgeResult)``. Schema enforcement
10
+ at the SDK boundary guarantees the judge cannot return malformed data.
11
+
12
+ Both wrappers accept the client via a duck-typed Protocol so tests mock
13
+ without importing ``anthropic``. Prompt caching is applied to the judge's
14
+ system prompt - the same prompt is used thousands of times per run so
15
+ cache hits dominate the judge cost.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from typing import Any, Protocol
21
+
22
+ from omegaprompt.judge import JudgeResult, JudgeRubric
23
+ from omegaprompt.prompts import JUDGE_SYSTEM_PROMPT
24
+
25
+
26
+ class _MessagesNamespace(Protocol):
27
+ def create(self, **kwargs: Any) -> Any: ... # noqa: D401
28
+
29
+ def parse(self, **kwargs: Any) -> Any: ... # noqa: D401
30
+
31
+
32
+ class _AnthropicLike(Protocol):
33
+ messages: _MessagesNamespace
34
+
35
+
36
+ # Effort levels the target is willing to accept. The calibration searcher
37
+ # emits an integer in {0, 1, 2}; these labels are the corresponding string
38
+ # values the Anthropic SDK expects.
39
+ _EFFORT_LABELS = ("low", "medium", "high")
40
+
41
+ # Max-tokens buckets the searcher picks between. Picked to exercise short
42
+ # vs. long output behavior of typical frontier models; the calibration
43
+ # will surface which bucket is appropriate for the task.
44
+ _MAX_TOKENS_BUCKETS = (1024, 4096, 16000)
45
+
46
+
47
def effort_from_int(idx: int) -> str:
    """Translate a calibration index into an effort label.

    The index is clamped into the valid range of ``_EFFORT_LABELS`` before
    the lookup, so values below zero select the first label and values past
    the end select the last one.
    """
    highest = len(_EFFORT_LABELS) - 1
    capped = min(highest, idx)
    position = max(0, capped)
    return _EFFORT_LABELS[position]
51
+
52
+
53
def max_tokens_from_int(idx: int) -> int:
    """Translate a calibration index into a concrete ``max_tokens`` value.

    The index is clamped into the valid range of ``_MAX_TOKENS_BUCKETS``
    before the lookup, so out-of-range values select the nearest bucket.
    """
    highest = len(_MAX_TOKENS_BUCKETS) - 1
    capped = min(highest, idx)
    position = max(0, capped)
    return _MAX_TOKENS_BUCKETS[position]
57
+
58
+
59
def call_target(
    client: _AnthropicLike,
    *,
    model: str,
    system_prompt: str,
    user_message: str,
    few_shots: list[dict[str, str]],
    effort: str,
    max_tokens: int,
    thinking_enabled: bool,
) -> tuple[str, dict[str, int]]:
    """Send one prompt configuration to the target model.

    Each few-shot example contributes a user turn (its ``"input"``) followed
    by an assistant turn (its ``"output"``); ``user_message`` is appended as
    the final user turn. The system prompt travels as a single text block
    tagged with ephemeral ``cache_control``. ``effort`` is always forwarded
    through ``output_config``, while the ``thinking`` field is only added
    when ``thinking_enabled`` is true.

    Returns:
        ``(response_text, usage_dict)``. ``response_text`` is the non-empty
        text of every ``"text"`` content block, joined with newlines and
        stripped of surrounding whitespace.
    """
    conversation: list[dict[str, Any]] = []
    for example in few_shots:
        conversation.extend(
            (
                {"role": "user", "content": example["input"]},
                {"role": "assistant", "content": example["output"]},
            )
        )
    conversation.append({"role": "user", "content": user_message})

    system_block = {
        "type": "text",
        "text": system_prompt,
        "cache_control": {"type": "ephemeral"},
    }
    request: dict[str, Any] = {
        "model": model,
        "max_tokens": max_tokens,
        "system": [system_block],
        "messages": conversation,
    }
    if thinking_enabled:
        request["thinking"] = {"type": "adaptive"}
    request["output_config"] = {"effort": effort}

    response = client.messages.create(**request)

    # Keep only text blocks that actually carry text; other block types
    # are not part of the graded answer.
    pieces: list[str] = []
    for block in getattr(response, "content", []) or []:
        if getattr(block, "type", None) != "text":
            continue
        text = getattr(block, "text", "")
        if text:
            pieces.append(text)

    answer = "\n".join(pieces).strip()
    usage = _usage_to_dict(getattr(response, "usage", None))
    return answer, usage
106
+
107
+
108
def call_judge(
    client: _AnthropicLike,
    *,
    judge_model: str,
    rubric: JudgeRubric,
    task_input: str,
    task_reference: str | None,
    target_response: str,
) -> tuple[JudgeResult, dict[str, int]]:
    """Ask the judge model to score a target response against the rubric.

    The request goes through ``client.messages.parse`` with
    ``output_format=JudgeResult`` and the judge system prompt tagged with
    ephemeral ``cache_control``.

    Returns:
        ``(JudgeResult, usage_dict)``.

    Raises:
        RuntimeError: if the response's ``stop_reason`` is ``"refusal"`` or
            the response carries no ``parsed_output``.
    """
    payload = _build_judge_payload(rubric, task_input, task_reference, target_response)
    judge_system = [
        {
            "type": "text",
            "text": JUDGE_SYSTEM_PROMPT,
            "cache_control": {"type": "ephemeral"},
        }
    ]

    response = client.messages.parse(
        model=judge_model,
        max_tokens=2048,
        system=judge_system,
        messages=[{"role": "user", "content": payload}],
        output_format=JudgeResult,
    )

    stop_reason = getattr(response, "stop_reason", None)
    if stop_reason == "refusal":
        raise RuntimeError(
            "Judge refused to score the response. This usually means the target "
            "response contained content the judge's safety layer flagged. "
            "Inspect the task input and response for anything that could trigger "
            "the judge's refusal path."
        )

    parsed = getattr(response, "parsed_output", None)
    if parsed is None:
        raise RuntimeError(
            "Judge returned no parsed_output. The judge response did not conform "
            "to the JudgeResult schema. This indicates a prompt-caching drift, a "
            "judge model downgrade, or a transient SDK issue. "
            f"stop_reason={stop_reason!r}."
        )

    # A client may hand back a plain mapping instead of a model instance;
    # normalise so callers always receive a JudgeResult.
    verdict = parsed if isinstance(parsed, JudgeResult) else JudgeResult.model_validate(parsed)
    return verdict, _usage_to_dict(getattr(response, "usage", None))
158
+
159
+
160
+ def _build_judge_payload(
161
+ rubric: JudgeRubric,
162
+ task_input: str,
163
+ task_reference: str | None,
164
+ target_response: str,
165
+ ) -> str:
166
+ """Render the judge-user-turn payload the system prompt expects."""
167
+ import json as _json
168
+
169
+ rubric_json = _json.dumps(
170
+ {
171
+ "dimensions": [d.model_dump() for d in rubric.dimensions],
172
+ "hard_gates": [g.model_dump() for g in rubric.hard_gates],
173
+ },
174
+ ensure_ascii=False,
175
+ indent=2,
176
+ )
177
+ ref_block = f"<reference>\n{task_reference}\n</reference>\n\n" if task_reference else ""
178
+ return (
179
+ f"<rubric>\n{rubric_json}\n</rubric>\n\n"
180
+ f"<input>\n{task_input}\n</input>\n\n"
181
+ f"{ref_block}"
182
+ f"<response>\n{target_response}\n</response>"
183
+ )
184
+
185
+
186
+ def _usage_to_dict(usage: Any) -> dict[str, int]:
187
+ return {
188
+ "input_tokens": getattr(usage, "input_tokens", 0) or 0,
189
+ "output_tokens": getattr(usage, "output_tokens", 0) or 0,
190
+ "cache_creation_input_tokens": getattr(usage, "cache_creation_input_tokens", 0) or 0,
191
+ "cache_read_input_tokens": getattr(usage, "cache_read_input_tokens", 0) or 0,
192
+ }
omegaprompt/cli.py ADDED
@@ -0,0 +1,43 @@
1
+ """Top-level Typer application for omegaprompt."""
2
+
3
+ import typer
4
+
5
+ from omegaprompt import __version__
6
+ from omegaprompt.commands import calibrate as calibrate_cmd
7
+
8
+ app = typer.Typer(
9
+ name="omegaprompt",
10
+ help="Calibration discipline for Claude API prompts.",
11
+ no_args_is_help=True,
12
+ add_completion=False,
13
+ )
14
+
15
+
16
def _version_callback(value: bool) -> None:
    """Print the package version and exit when ``--version`` was passed."""
    if not value:
        return
    typer.echo(f"omegaprompt {__version__}")
    raise typer.Exit()
20
+
21
+
22
@app.callback()
def _root(
    version: bool = typer.Option(  # noqa: B008
        False,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
        callback=_version_callback,
    ),
) -> None:
    """omegaprompt - apply omega-lock's calibration discipline to prompts."""
    # Intentionally empty: the callback only hosts the root-level
    # ``--version`` option, whose work happens in ``_version_callback``.
34
+
35
+
36
+ app.command(
37
+ name="calibrate",
38
+ help="Calibrate a prompt configuration against a dataset (stress + grid + walk-forward).",
39
+ )(calibrate_cmd.calibrate)
40
+
41
+
42
+ if __name__ == "__main__":
43
+ app()
@@ -0,0 +1 @@
1
+ """Subcommand modules for the omegaprompt CLI."""
@@ -0,0 +1,232 @@
1
+ """`omegaprompt calibrate` - end-to-end calibration from a single command.
2
+
3
+ Takes a training dataset, an optional test dataset, a judge rubric, and
4
+ a ParamVariants config; runs omega-lock's P1 pipeline (stress ->
5
+ top-K unlock -> grid -> walk-forward); writes a ``CalibrationOutcome``
6
+ JSON artifact with the winning parameters and the generalization gap.
7
+
8
+ This command requires omega-lock to be installed (``pip install omega-lock``)
9
+ and ``ANTHROPIC_API_KEY`` to be set.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ from pathlib import Path
17
+
18
+ import typer
19
+
20
+ from omegaprompt.dataset import Dataset
21
+ from omegaprompt.judge import JudgeRubric
22
+ from omegaprompt.schema import CalibrationOutcome, ParamVariants, PromptSpace
23
+
24
+
25
def _usage_error(message: str) -> typer.Exit:
    """Print ``message`` in red on stderr and return the exit-code-2 exception to raise."""
    typer.secho(message, fg=typer.colors.RED, err=True)
    return typer.Exit(code=2)


def calibrate(
    dataset_path: Path = typer.Argument(  # noqa: B008
        ...,
        help="Path to the training dataset (.jsonl).",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    rubric_path: Path = typer.Option(  # noqa: B008
        ...,
        "--rubric",
        "-r",
        help="Path to the JudgeRubric JSON.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    variants_path: Path = typer.Option(  # noqa: B008
        ...,
        "--variants",
        "-v",
        help="Path to the ParamVariants JSON (system_prompts + few_shot_examples).",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    test_path: Path | None = typer.Option(  # noqa: B008
        None,
        "--test",
        "-t",
        help="Path to the walk-forward test dataset (.jsonl). Recommended.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    output_path: Path = typer.Option(  # noqa: B008
        Path("calibration_outcome.json"),
        "--output",
        "-o",
        help="Where to write the CalibrationOutcome JSON artifact.",
        file_okay=True,
        dir_okay=False,
    ),
    target_model: str = typer.Option(  # noqa: B008
        ...,
        "--target-model",
        help="Model string to calibrate (e.g. 'claude-haiku-4-5').",
    ),
    judge_model: str = typer.Option(  # noqa: B008
        ...,
        "--judge-model",
        help="Model string to use as judge. Can equal --target-model.",
    ),
    method: str = typer.Option(  # noqa: B008
        "p1",
        "--method",
        "-m",
        help="Calibration method: 'p1' (grid + KC-4) or 'grid' (grid only).",
        case_sensitive=False,
    ),
    unlock_k: int = typer.Option(  # noqa: B008
        3,
        "--unlock-k",
        min=1,
        help="How many top-stress parameters to unlock for grid search.",
    ),
    space_path: Path | None = typer.Option(  # noqa: B008
        None,
        "--space",
        help="Optional PromptSpace JSON to override the default parameter bounds.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
) -> None:
    """Calibrate a Claude API prompt configuration against a dataset.

    Loads the training dataset, rubric, variants and the optional test
    dataset / parameter space, builds one ``PromptTarget`` per dataset,
    runs omega-lock's ``run_p1`` and writes a ``CalibrationOutcome`` JSON
    artifact to ``output_path``.

    Exits with code 2 when ``--method`` is not one of the documented
    values, when ``ANTHROPIC_API_KEY`` is unset, or when ``anthropic`` /
    ``omega-lock`` cannot be imported.
    """
    # ``case_sensitive=False`` has no effect on a free-form string option,
    # so normalise and validate here; otherwise a typo would be recorded
    # verbatim in the artifact.
    method = method.lower()
    if method not in ("p1", "grid"):
        raise _usage_error(f"Unknown --method {method!r}. Expected 'p1' or 'grid'.")

    if not os.getenv("ANTHROPIC_API_KEY"):
        raise _usage_error(
            "ANTHROPIC_API_KEY is not set. Export it (see https://console.anthropic.com) "
            "before running `omegaprompt calibrate`."
        )

    # Lazy imports so the CLI can at least print --help without omega-lock
    # / anthropic installed.
    try:
        from anthropic import Anthropic
    except ImportError as exc:
        raise _usage_error(
            f"The 'anthropic' package is required for `calibrate`. Install with "
            f"`pip install omegaprompt[anthropic]` or `pip install anthropic`. ({exc})"
        ) from exc
    try:
        from omega_lock import P1Config, run_p1
    except ImportError as exc:
        raise _usage_error(
            f"The 'omega-lock' package is required for `calibrate`. Install with "
            f"`pip install omega-lock`. ({exc})"
        ) from exc

    from omegaprompt.target import PromptTarget

    typer.secho(f"Loading dataset from {dataset_path} ...", fg=typer.colors.BRIGHT_BLACK)
    train_ds = Dataset.from_jsonl(dataset_path)
    typer.secho(f"  {len(train_ds)} items", fg=typer.colors.BRIGHT_BLACK)

    test_ds: Dataset | None = None
    if test_path is not None:
        typer.secho(f"Loading test set from {test_path} ...", fg=typer.colors.BRIGHT_BLACK)
        test_ds = Dataset.from_jsonl(test_path)
        typer.secho(f"  {len(test_ds)} items", fg=typer.colors.BRIGHT_BLACK)

    rubric = JudgeRubric.from_json(rubric_path)
    variants = ParamVariants.model_validate_json(variants_path.read_text(encoding="utf-8"))
    space: PromptSpace | None = None
    if space_path is not None:
        space = PromptSpace.model_validate_json(space_path.read_text(encoding="utf-8"))

    # Make sure the artifact destination is usable before any API budget is
    # spent; a missing parent directory would otherwise only surface after
    # the whole run has finished.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    client = Anthropic()

    def _make_target(dataset: Dataset) -> PromptTarget:
        # Train and test targets differ only in the dataset they evaluate.
        return PromptTarget(
            target_client=client,
            judge_client=client,
            dataset=dataset,
            rubric=rubric,
            variants=variants,
            space=space,
            target_model=target_model,
            judge_model=judge_model,
        )

    train_target = _make_target(train_ds)
    test_target: PromptTarget | None = None
    if test_ds is not None:
        test_target = _make_target(test_ds)

    typer.secho(
        f"Starting omega-lock run_p1 calibration (unlock_k={unlock_k}, method={method}) ...",
        fg=typer.colors.BRIGHT_BLACK,
    )
    typer.secho(
        "This issues Claude API calls. Budget accordingly.",
        fg=typer.colors.YELLOW,
    )

    # NOTE(review): ``method`` is only echoed and recorded in the artifact;
    # ``run_p1`` is invoked for both accepted values. Confirm whether 'grid'
    # should map to a different omega-lock configuration.
    config = P1Config(unlock_k=unlock_k)
    result = run_p1(
        train_target=train_target,
        test_target=test_target,
        config=config,
    )

    # ``grid_best`` is read defensively: anything other than a dict is
    # treated as "no result" and yields empty / zero defaults.
    grid_best = getattr(result, "grid_best", None) or {}
    if not isinstance(grid_best, dict):
        grid_best = {}
    best_params_dict = grid_best.get("unlocked", {})
    best_fitness = float(grid_best.get("fitness", 0.0))

    test_fitness: float | None = None
    if test_target is not None:
        raw_test_fitness = grid_best.get("test_fitness")
        if raw_test_fitness is not None:
            test_fitness = float(raw_test_fitness)

    # Relative gap between train and test fitness; undefined when there is
    # no test score or the train fitness is zero.
    gen_gap: float | None = None
    if test_fitness is not None and best_fitness != 0:
        gen_gap = abs(best_fitness - test_fitness) / abs(best_fitness)

    outcome = CalibrationOutcome(
        best_params=best_params_dict,
        best_fitness=best_fitness,
        test_fitness=test_fitness,
        generalization_gap=gen_gap,
        hard_gate_pass_rate=train_target._fitness.pass_rate(),
        n_candidates_evaluated=train_target.total_api_calls
        // max(len(train_ds) * 2, 1),
        total_api_calls=train_target.total_api_calls
        + (test_target.total_api_calls if test_target else 0),
        method=method,
        usage_summary=dict(train_target.last_usage),
    )

    output_path.write_text(outcome.model_dump_json(indent=2) + "\n", encoding="utf-8")

    summary = [f"Calibration complete. best_fitness={best_fitness:.4f}"]
    if test_fitness is not None:
        summary.append(f", test_fitness={test_fitness:.4f}")
    if gen_gap is not None:
        summary.append(f", gen_gap={gen_gap:.2%}")
    typer.secho("".join(summary), fg=typer.colors.GREEN)
    typer.secho(f"Artifact: {output_path}", fg=typer.colors.GREEN)
omegaprompt/dataset.py ADDED
@@ -0,0 +1,87 @@
1
+ """Dataset loading for omegaprompt calibration.
2
+
3
+ The expected format is JSON Lines (``.jsonl``) with one item per line:
4
+
5
+ .. code-block:: json
6
+
7
+ {"id": "t1", "input": "...", "reference": "...", "metadata": {...}}
8
+
9
+ ``reference`` is optional — judges can score without a reference when the
10
+ rubric is purely qualitative. ``metadata`` is a free-form dict that the
11
+ judge may consult (e.g. difficulty tier, target domain).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from collections.abc import Iterable
18
+ from pathlib import Path
19
+
20
+ from pydantic import BaseModel, ConfigDict, Field
21
+
22
+
23
class DatasetItem(BaseModel):
    """One calibration example: an input plus optional reference and metadata.

    The model is configured with ``extra="allow"``, so keys beyond the
    declared fields are not rejected.
    """

    model_config = ConfigDict(extra="allow")

    # Item identifier; must be a non-empty string.
    id: str = Field(min_length=1)
    # Text handed to the target model; must be non-empty.
    input: str = Field(min_length=1, description="Input given to the target model.")
    # Expected output, when one exists.
    reference: str | None = Field(
        default=None,
        description="Optional reference / expected output for the judge.",
    )
    # Free-form extra information; defaults to a fresh empty dict per item.
    metadata: dict = Field(default_factory=dict)
35
+
36
+
37
class Dataset(BaseModel):
    """An ordered collection of ``DatasetItem`` objects.

    Instances support ``len()`` and direct iteration over their items.
    """

    items: list[DatasetItem]

    @classmethod
    def from_jsonl(cls, path: str | Path) -> Dataset:
        """Read one item per non-blank line from ``path``.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
            ValueError: if a line is not valid JSON or fails ``DatasetItem``
                validation (the message carries ``path:lineno``), if the file
                yields no items, or if two items share an ``id``.
        """
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"Dataset not found: {p}")

        items: list[DatasetItem] = []
        seen_ids: set[str] = set()
        duplicate_ids: set[str] = set()
        # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
        # byte-order mark, which would otherwise make line 1 fail to parse.
        with p.open("r", encoding="utf-8-sig") as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise ValueError(
                        f"{p}:{lineno} - not valid JSON: {exc.msg}"
                    ) from exc
                try:
                    item = DatasetItem.model_validate(payload)
                except Exception as exc:
                    # Deliberately broad: whatever validation raises is
                    # re-reported with the file and line that caused it.
                    raise ValueError(f"{p}:{lineno} - schema invalid: {exc}") from exc
                # Track repeats in the same pass instead of re-counting the
                # whole id list for every element afterwards.
                if item.id in seen_ids:
                    duplicate_ids.add(item.id)
                seen_ids.add(item.id)
                items.append(item)

        if not items:
            raise ValueError(f"{p} contained zero items (empty or blank-only lines).")
        if duplicate_ids:
            dupes = sorted(duplicate_ids)
            raise ValueError(f"{p} has duplicate ids: {dupes}")
        return cls(items=items)

    @classmethod
    def from_items(cls, items: Iterable[DatasetItem | dict]) -> Dataset:
        """Construct a Dataset from an in-memory iterable. Useful in tests.

        ``DatasetItem`` instances are used as-is; anything else is passed
        through ``DatasetItem.model_validate``. Unlike ``from_jsonl``, no
        emptiness or duplicate-id check is performed here.
        """
        parsed = [
            it if isinstance(it, DatasetItem) else DatasetItem.model_validate(it)
            for it in items
        ]
        return cls(items=parsed)

    def __len__(self) -> int:
        """Return the number of items."""
        return len(self.items)

    def __iter__(self):
        """Iterate over the ``DatasetItem`` objects in order."""
        return iter(self.items)
omegaprompt/fitness.py ADDED
@@ -0,0 +1,75 @@
1
+ """Composite fitness: ``hard_gate × soft_score``.
2
+
3
+ The omegaprompt fitness rule is deliberately non-negotiable: if any hard
4
+ gate fails on a dataset item, that item's contribution is zero, period.
5
+ This is the structural defense against prompts that score beautifully on
6
+ the soft rubric but refuse on a subset of inputs (a common failure mode
7
+ with aggressive system prompts) or emit malformed output.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from collections.abc import Iterable
13
+ from dataclasses import dataclass
14
+
15
+ from omegaprompt.judge import JudgeResult, JudgeRubric
16
+
17
+
18
@dataclass(frozen=True)
class PerItemScore:
    """Immutable fitness record for one dataset item."""

    # Identifier of the dataset item the record belongs to.
    item_id: str
    # Weighted rubric score before gates are applied.
    soft_score: float
    # True when no hard gate failed for this item.
    gates_passed: bool
    # Equals soft_score when gates_passed, otherwise 0.0.
    final_score: float
    # Free-text notes carried over from the judge result.
    notes: str = ""
27
+
28
+
29
class CompositeFitness:
    """Turn per-item judge results into one scalar fitness value.

    The aggregate is a plain mean of per-item final scores, where an item
    that fails any hard gate contributes zero. The per-item records of the
    most recent batch are kept on ``last_per_item`` for callers that want
    a different aggregation.
    """

    def __init__(self, rubric: JudgeRubric) -> None:
        """Remember ``rubric`` and start with an empty per-item history."""
        self.rubric = rubric
        self.last_per_item: list[PerItemScore] = []

    def evaluate(
        self,
        judge_results: Iterable[tuple[str, JudgeResult]],
    ) -> float:
        """Score a batch of ``(item_id, judge_result)`` pairs.

        Returns the mean final score over the batch, or ``0.0`` for an
        empty batch. ``self.last_per_item`` is replaced with this batch's
        records once every pair has been processed.
        """
        records: list[PerItemScore] = []
        for item_id, verdict in judge_results:
            soft = verdict.weighted_score(self.rubric)
            gates_ok = not verdict.any_gate_failed()
            # A failed gate wipes out the soft score for this item.
            final = soft if gates_ok else 0.0
            records.append(PerItemScore(item_id, soft, gates_ok, final, verdict.notes))

        self.last_per_item = records
        if records:
            total = sum(record.final_score for record in records)
            return total / len(records)
        return 0.0

    def pass_rate(self) -> float:
        """Fraction of items in the last batch that cleared every gate."""
        batch = self.last_per_item
        if not batch:
            return 0.0
        cleared = [record for record in batch if record.gates_passed]
        return len(cleared) / len(batch)