omegaprompt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omegaprompt/__init__.py +51 -0
- omegaprompt/__main__.py +7 -0
- omegaprompt/api.py +192 -0
- omegaprompt/cli.py +43 -0
- omegaprompt/commands/__init__.py +1 -0
- omegaprompt/commands/calibrate.py +232 -0
- omegaprompt/dataset.py +87 -0
- omegaprompt/fitness.py +75 -0
- omegaprompt/judge.py +157 -0
- omegaprompt/prompts.py +65 -0
- omegaprompt/schema.py +139 -0
- omegaprompt/target.py +319 -0
- omegaprompt-0.1.0.dist-info/METADATA +363 -0
- omegaprompt-0.1.0.dist-info/RECORD +17 -0
- omegaprompt-0.1.0.dist-info/WHEEL +4 -0
- omegaprompt-0.1.0.dist-info/entry_points.txt +2 -0
- omegaprompt-0.1.0.dist-info/licenses/LICENSE +21 -0
omegaprompt/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""omegaprompt - Calibration discipline for Claude API prompts.
|
|
2
|
+
|
|
3
|
+
Sensitivity-driven coordinate descent and walk-forward validation, ported
|
|
4
|
+
from the omega-lock parameter-calibration framework to the prompt-engineering
|
|
5
|
+
setting. Treats prompt parameters (system variant, few-shot count, effort
|
|
6
|
+
level, max_tokens bucket, thinking on/off) as a calibratable parameter
|
|
7
|
+
space; measures stress on each axis; runs grid search in the top-K unlock
|
|
8
|
+
subspace; validates on a held-out walk-forward slice. LLM-as-judge scoring
|
|
9
|
+
with user-supplied rubrics; hard gates on refusal / format / safety
|
|
10
|
+
collapse the fitness to zero, so overfitting a single metric does not pass.
|
|
11
|
+
|
|
12
|
+
Public API:
|
|
13
|
+
from omegaprompt import (
|
|
14
|
+
PromptTarget, PromptSpace, ParamVariants,
|
|
15
|
+
Dataset, DatasetItem,
|
|
16
|
+
JudgeRubric, Dimension, HardGate, JudgeResult,
|
|
17
|
+
CompositeFitness, CalibrationOutcome,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
Depends on:
|
|
21
|
+
omega-lock (>=0.1.4) - provides CalibrableTarget, run_p1, stress,
|
|
22
|
+
walk-forward, kill criteria, and benchmark.
|
|
23
|
+
anthropic (>=0.40.0) - Claude API SDK.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from omegaprompt.dataset import Dataset, DatasetItem
|
|
27
|
+
from omegaprompt.fitness import CompositeFitness
|
|
28
|
+
from omegaprompt.judge import JudgeRubric, Dimension, HardGate, JudgeResult
|
|
29
|
+
from omegaprompt.schema import (
|
|
30
|
+
ParamVariants,
|
|
31
|
+
PromptSpace,
|
|
32
|
+
CalibrationOutcome,
|
|
33
|
+
)
|
|
34
|
+
from omegaprompt.target import PromptTarget
|
|
35
|
+
|
|
36
|
+
# Package version string; the CLI's --version flag prints this value.
__version__ = "0.1.0"

# Names re-exported as the package's public surface, grouped by the
# submodule they are imported from above.
__all__ = [
    # omegaprompt.target / omegaprompt.schema
    "PromptTarget",
    "PromptSpace",
    "ParamVariants",
    # omegaprompt.dataset
    "Dataset",
    "DatasetItem",
    # omegaprompt.judge
    "JudgeRubric",
    "Dimension",
    "HardGate",
    "JudgeResult",
    # omegaprompt.fitness
    "CompositeFitness",
    # omegaprompt.schema
    "CalibrationOutcome",
    "__version__",
]
|
omegaprompt/__main__.py
ADDED
omegaprompt/api.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Anthropic API wrappers for both target and judge calls.
|
|
2
|
+
|
|
3
|
+
Two separate client boundaries:
|
|
4
|
+
|
|
5
|
+
- ``call_target(...)`` - issues the user's prompt to the target model and
|
|
6
|
+
returns the raw response text. No schema enforcement; the target is
|
|
7
|
+
allowed to respond however it wants (that is what the judge grades).
|
|
8
|
+
- ``call_judge(...)`` - issues the rubric + response to the judge model
|
|
9
|
+
via ``messages.parse(output_format=JudgeResult)``. Schema enforcement
|
|
10
|
+
at the SDK boundary guarantees the judge cannot return malformed data.
|
|
11
|
+
|
|
12
|
+
Both wrappers accept the client via a duck-typed Protocol so tests mock
|
|
13
|
+
without importing ``anthropic``. Prompt caching is applied to the judge's
|
|
14
|
+
system prompt - the same prompt is used thousands of times per run so
|
|
15
|
+
cache hits dominate the judge cost.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from typing import Any, Protocol
|
|
21
|
+
|
|
22
|
+
from omegaprompt.judge import JudgeResult, JudgeRubric
|
|
23
|
+
from omegaprompt.prompts import JUDGE_SYSTEM_PROMPT
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class _MessagesNamespace(Protocol):
    """Structural type for the ``messages`` attribute of a client.

    Only the two entry points this module calls are declared:
    ``create`` (used for target calls) and ``parse`` (used for judge calls).
    """

    def create(self, **kwargs: Any) -> Any:  # noqa: D401
        """Issue a request and return the response object."""

    def parse(self, **kwargs: Any) -> Any:  # noqa: D401
        """Issue a request and return a response carrying parsed output."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class _AnthropicLike(Protocol):
    """Structural type for any client object exposing a ``messages`` namespace."""

    # The wrappers in this module only ever reach the client through
    # ``client.messages.create`` / ``client.messages.parse``.
    messages: _MessagesNamespace
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Effort labels, indexed by the integer the calibration searcher emits
# (0, 1 or 2). ``effort_from_int`` performs the lookup.
_EFFORT_LABELS = ("low", "medium", "high")

# Candidate ``max_tokens`` values, indexed the same way. The buckets span
# short to long outputs so calibration can surface which size suits the task.
_MAX_TOKENS_BUCKETS = (1024, 4096, 16000)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def effort_from_int(idx: int) -> str:
    """Return the effort label for a calibration index.

    Indices outside the valid range are clamped to the first or last
    label rather than raising.
    """
    highest = len(_EFFORT_LABELS) - 1
    clamped = min(highest, idx)
    clamped = max(0, clamped)
    return _EFFORT_LABELS[clamped]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def max_tokens_from_int(idx: int) -> int:
    """Return the ``max_tokens`` bucket for a calibration index.

    Indices outside the valid range are clamped to the smallest or
    largest bucket rather than raising.
    """
    highest = len(_MAX_TOKENS_BUCKETS) - 1
    clamped = min(highest, idx)
    clamped = max(0, clamped)
    return _MAX_TOKENS_BUCKETS[clamped]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def call_target(
    client: _AnthropicLike,
    *,
    model: str,
    system_prompt: str,
    user_message: str,
    few_shots: list[dict[str, str]],
    effort: str,
    max_tokens: int,
    thinking_enabled: bool,
) -> tuple[str, dict[str, int]]:
    """Send one request to the target model and collect its text.

    Each few-shot entry (a mapping with ``"input"`` and ``"output"`` keys)
    becomes a user/assistant turn pair placed ahead of ``user_message``.
    The system prompt is sent as a single text block tagged with an
    ephemeral ``cache_control`` marker. ``effort`` is always forwarded
    under ``output_config``; ``thinking`` is only added when
    ``thinking_enabled`` is true.

    Returns:
        ``(response_text, usage_dict)`` where ``response_text`` is every
        non-empty text block of the response joined with newlines and
        stripped, and ``usage_dict`` comes from ``_usage_to_dict``.
    """
    conversation: list[dict[str, Any]] = []
    for example in few_shots:
        conversation.append({"role": "user", "content": example["input"]})
        conversation.append({"role": "assistant", "content": example["output"]})
    conversation.append({"role": "user", "content": user_message})

    system_block = {
        "type": "text",
        "text": system_prompt,
        "cache_control": {"type": "ephemeral"},
    }
    request: dict[str, Any] = {
        "model": model,
        "max_tokens": max_tokens,
        "system": [system_block],
        "messages": conversation,
    }
    if thinking_enabled:
        request["thinking"] = {"type": "adaptive"}
    request["output_config"] = {"effort": effort}

    response = client.messages.create(**request)

    # Keep only text-typed blocks, and drop any whose text is empty.
    content_blocks = getattr(response, "content", []) or []
    candidate_texts = [
        getattr(piece, "text", "")
        for piece in content_blocks
        if getattr(piece, "type", None) == "text"
    ]
    body = "\n".join(text for text in candidate_texts if text).strip()
    return body, _usage_to_dict(getattr(response, "usage", None))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def call_judge(
    client: _AnthropicLike,
    *,
    judge_model: str,
    rubric: JudgeRubric,
    task_input: str,
    task_reference: str | None,
    target_response: str,
) -> tuple[JudgeResult, dict[str, int]]:
    """Ask the judge model to score one target response.

    The rubric, task input, optional reference and target response are
    rendered into a single user turn by ``_build_judge_payload`` and sent
    through ``client.messages.parse`` with ``output_format=JudgeResult``.

    Returns:
        ``(JudgeResult, usage_dict)``.

    Raises:
        RuntimeError: if the response's ``stop_reason`` is ``"refusal"``,
            or if the response carries no ``parsed_output``.
    """
    payload = _build_judge_payload(rubric, task_input, task_reference, target_response)
    cached_system = [
        {
            "type": "text",
            "text": JUDGE_SYSTEM_PROMPT,
            "cache_control": {"type": "ephemeral"},
        }
    ]

    response = client.messages.parse(
        model=judge_model,
        max_tokens=2048,
        system=cached_system,
        messages=[{"role": "user", "content": payload}],
        output_format=JudgeResult,
    )

    stop_reason = getattr(response, "stop_reason", None)
    if stop_reason == "refusal":
        raise RuntimeError(
            "Judge refused to score the response. This usually means the target "
            "response contained content the judge's safety layer flagged. "
            "Inspect the task input and response for anything that could trigger "
            "the judge's refusal path."
        )

    verdict: JudgeResult | None = getattr(response, "parsed_output", None)
    if verdict is None:
        raise RuntimeError(
            "Judge returned no parsed_output. The judge response did not conform "
            "to the JudgeResult schema. This indicates a prompt-caching drift, a "
            "judge model downgrade, or a transient SDK issue. "
            f"stop_reason={stop_reason!r}."
        )

    # A client may hand back something other than a JudgeResult instance
    # (e.g. a plain mapping); validate it into one in that case.
    if isinstance(verdict, JudgeResult):
        result = verdict
    else:
        result = JudgeResult.model_validate(verdict)

    return result, _usage_to_dict(getattr(response, "usage", None))
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _build_judge_payload(
    rubric: JudgeRubric,
    task_input: str,
    task_reference: str | None,
    target_response: str,
) -> str:
    """Render the user-turn text sent to the judge.

    The payload is a sequence of tagged sections separated by blank lines:
    ``<rubric>`` (JSON of the rubric's dimensions and hard gates),
    ``<input>``, an optional ``<reference>`` (omitted when
    ``task_reference`` is ``None`` or empty), and ``<response>``.
    """
    import json as _json

    rubric_doc = {
        "dimensions": [dim.model_dump() for dim in rubric.dimensions],
        "hard_gates": [gate.model_dump() for gate in rubric.hard_gates],
    }
    rubric_text = _json.dumps(rubric_doc, ensure_ascii=False, indent=2)

    sections = [
        f"<rubric>\n{rubric_text}\n</rubric>",
        f"<input>\n{task_input}\n</input>",
    ]
    if task_reference:
        sections.append(f"<reference>\n{task_reference}\n</reference>")
    sections.append(f"<response>\n{target_response}\n</response>")
    return "\n\n".join(sections)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _usage_to_dict(usage: Any) -> dict[str, int]:
    """Copy the four token counters off ``usage`` into a plain dict.

    A counter that is missing from ``usage`` (including when ``usage`` is
    ``None``) or whose value is falsy is reported as ``0``.
    """
    counters = (
        "input_tokens",
        "output_tokens",
        "cache_creation_input_tokens",
        "cache_read_input_tokens",
    )
    return {name: getattr(usage, name, 0) or 0 for name in counters}
|
omegaprompt/cli.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Top-level Typer application for omegaprompt."""
|
|
2
|
+
|
|
3
|
+
import typer
|
|
4
|
+
|
|
5
|
+
from omegaprompt import __version__
|
|
6
|
+
from omegaprompt.commands import calibrate as calibrate_cmd
|
|
7
|
+
|
|
8
|
+
# Root Typer application. Invoking it with no arguments prints help, and
# shell-completion installation options are not added.
app = typer.Typer(
    name="omegaprompt",
    help="Calibration discipline for Claude API prompts.",
    add_completion=False,
    no_args_is_help=True,
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _version_callback(value: bool) -> None:
    """Print the package version and exit when ``--version`` was passed."""
    if not value:
        return
    typer.echo(f"omegaprompt {__version__}")
    raise typer.Exit()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@app.callback()
def _root(
    version: bool = typer.Option(  # noqa: B008
        False,
        "--version",
        "-V",
        help="Show version and exit.",
        is_eager=True,
        callback=_version_callback,
    ),
) -> None:
    """omegaprompt - apply omega-lock's calibration discipline to prompts."""
    # Intentionally empty: the only root-level option is --version, and it
    # is handled entirely by ``_version_callback``.
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Register the ``calibrate`` subcommand. The implementation lives in
# omegaprompt.commands.calibrate; the help text is supplied here explicitly.
app.command(
    help="Calibrate a prompt configuration against a dataset (stress + grid + walk-forward).",
    name="calibrate",
)(calibrate_cmd.calibrate)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Subcommand modules for the omegaprompt CLI."""
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""`omegaprompt calibrate` - end-to-end calibration from a single command.
|
|
2
|
+
|
|
3
|
+
Takes a training dataset, an optional test dataset, a judge rubric, and
|
|
4
|
+
a ParamVariants config; runs omega-lock's P1 pipeline (stress ->
|
|
5
|
+
top-K unlock -> grid -> walk-forward); writes a ``CalibrationOutcome``
|
|
6
|
+
JSON artifact with the winning parameters and the generalization gap.
|
|
7
|
+
|
|
8
|
+
This command requires omega-lock to be installed (``pip install omega-lock``)
|
|
9
|
+
and ``ANTHROPIC_API_KEY`` to be set.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import typer
|
|
19
|
+
|
|
20
|
+
from omegaprompt.dataset import Dataset
|
|
21
|
+
from omegaprompt.judge import JudgeRubric
|
|
22
|
+
from omegaprompt.schema import CalibrationOutcome, ParamVariants, PromptSpace
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def calibrate(
    dataset_path: Path = typer.Argument(  # noqa: B008
        ...,
        help="Path to the training dataset (.jsonl).",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    rubric_path: Path = typer.Option(  # noqa: B008
        ...,
        "--rubric",
        "-r",
        help="Path to the JudgeRubric JSON.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    variants_path: Path = typer.Option(  # noqa: B008
        ...,
        "--variants",
        "-v",
        help="Path to the ParamVariants JSON (system_prompts + few_shot_examples).",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    test_path: Path | None = typer.Option(  # noqa: B008
        None,
        "--test",
        "-t",
        help="Path to the walk-forward test dataset (.jsonl). Recommended.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    output_path: Path = typer.Option(  # noqa: B008
        Path("calibration_outcome.json"),
        "--output",
        "-o",
        help="Where to write the CalibrationOutcome JSON artifact.",
        file_okay=True,
        dir_okay=False,
    ),
    target_model: str = typer.Option(  # noqa: B008
        ...,
        "--target-model",
        help="Model string to calibrate (e.g. 'claude-haiku-4-5').",
    ),
    judge_model: str = typer.Option(  # noqa: B008
        ...,
        "--judge-model",
        help="Model string to use as judge. Can equal --target-model.",
    ),
    method: str = typer.Option(  # noqa: B008
        "p1",
        "--method",
        "-m",
        help="Calibration method: 'p1' (grid + KC-4) or 'grid' (grid only).",
        case_sensitive=False,
    ),
    unlock_k: int = typer.Option(  # noqa: B008
        3,
        "--unlock-k",
        min=1,
        help="How many top-stress parameters to unlock for grid search.",
    ),
    space_path: Path | None = typer.Option(  # noqa: B008
        None,
        "--space",
        help="Optional PromptSpace JSON to override the default parameter bounds.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
) -> None:
    """Calibrate a Claude API prompt configuration against a dataset."""
    # Flow: validate arguments and environment -> load the dataset(s),
    # rubric, variants and optional space -> build one PromptTarget per
    # dataset -> run omega-lock's run_p1 -> write a CalibrationOutcome JSON
    # artifact to ``output_path``.
    #
    # Exits with code 2 (typer.Exit) when --method is not recognised, when
    # ANTHROPIC_API_KEY is unset, or when anthropic / omega-lock cannot be
    # imported. Errors from dataset/rubric loading propagate unchanged.

    # The option is declared case-insensitive, so normalise it, and reject
    # anything outside the documented values before any billable API call.
    method = method.lower()
    if method not in ("p1", "grid"):
        typer.secho(
            f"Unknown --method {method!r}. Expected 'p1' or 'grid'.",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=2)
    # NOTE(review): ``method`` is only recorded in the artifact below; both
    # values go through run_p1. Confirm whether 'grid' should select a
    # different omega-lock entry point.

    if not os.getenv("ANTHROPIC_API_KEY"):
        typer.secho(
            "ANTHROPIC_API_KEY is not set. Export it (see https://console.anthropic.com) "
            "before running `omegaprompt calibrate`.",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=2)

    # Lazy imports so the CLI can at least print --help without omega-lock
    # / anthropic installed.
    try:
        from anthropic import Anthropic
    except ImportError as exc:
        typer.secho(
            f"The 'anthropic' package is required for `calibrate`. Install with "
            f"`pip install omegaprompt[anthropic]` or `pip install anthropic`. ({exc})",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=2) from exc
    try:
        from omega_lock import P1Config, run_p1
    except ImportError as exc:
        typer.secho(
            f"The 'omega-lock' package is required for `calibrate`. Install with "
            f"`pip install omega-lock`. ({exc})",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=2) from exc

    from omegaprompt.target import PromptTarget

    typer.secho(f"Loading dataset from {dataset_path} ...", fg=typer.colors.BRIGHT_BLACK)
    train_ds = Dataset.from_jsonl(dataset_path)
    typer.secho(f" {len(train_ds)} items", fg=typer.colors.BRIGHT_BLACK)

    test_ds: Dataset | None = None
    if test_path is not None:
        typer.secho(f"Loading test set from {test_path} ...", fg=typer.colors.BRIGHT_BLACK)
        test_ds = Dataset.from_jsonl(test_path)
        typer.secho(f" {len(test_ds)} items", fg=typer.colors.BRIGHT_BLACK)

    rubric = JudgeRubric.from_json(rubric_path)
    variants = ParamVariants.model_validate_json(variants_path.read_text(encoding="utf-8"))
    space: PromptSpace | None = None
    if space_path is not None:
        space = PromptSpace.model_validate_json(space_path.read_text(encoding="utf-8"))

    # One client serves both roles; the train and test targets differ only
    # in the dataset they evaluate.
    client = Anthropic()
    shared_target_kwargs = {
        "target_client": client,
        "judge_client": client,
        "rubric": rubric,
        "variants": variants,
        "space": space,
        "target_model": target_model,
        "judge_model": judge_model,
    }
    train_target = PromptTarget(dataset=train_ds, **shared_target_kwargs)
    test_target: PromptTarget | None = None
    if test_ds is not None:
        test_target = PromptTarget(dataset=test_ds, **shared_target_kwargs)

    typer.secho(
        f"Starting omega-lock run_p1 calibration (unlock_k={unlock_k}, method={method}) ...",
        fg=typer.colors.BRIGHT_BLACK,
    )
    typer.secho(
        "This issues Claude API calls. Budget accordingly.",
        fg=typer.colors.YELLOW,
    )

    config = P1Config(unlock_k=unlock_k)
    result = run_p1(
        train_target=train_target,
        test_target=test_target,
        config=config,
    )

    # ``grid_best`` is read defensively: anything that is missing or not a
    # dict is treated as an empty result instead of crashing after the run.
    grid_best = getattr(result, "grid_best", None) or {}
    if not isinstance(grid_best, dict):
        grid_best = {}
    best_params_dict = grid_best.get("unlocked", {})
    # ``or 0.0`` also covers a present-but-None fitness value.
    best_fitness = float(grid_best.get("fitness") or 0.0)

    test_fitness: float | None = None
    if test_target is not None:
        raw_test_fitness = grid_best.get("test_fitness")
        if raw_test_fitness is not None:
            test_fitness = float(raw_test_fitness)

    # Relative train/test gap; undefined when the train fitness is zero.
    gen_gap: float | None = None
    if test_fitness is not None and best_fitness != 0:
        gen_gap = abs(best_fitness - test_fitness) / abs(best_fitness)

    total_calls = train_target.total_api_calls
    if test_target is not None:
        total_calls += test_target.total_api_calls

    outcome = CalibrationOutcome(
        best_params=best_params_dict,
        best_fitness=best_fitness,
        test_fitness=test_fitness,
        generalization_gap=gen_gap,
        hard_gate_pass_rate=train_target._fitness.pass_rate(),
        # Presumably two API calls (target + judge) per item per candidate
        # - TODO confirm against PromptTarget.
        n_candidates_evaluated=train_target.total_api_calls // max(len(train_ds) * 2, 1),
        total_api_calls=total_calls,
        method=method,
        usage_summary=dict(train_target.last_usage),
    )

    # Create the destination directory first so a missing parent does not
    # lose the artifact after the whole run has already been paid for.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(outcome.model_dump_json(indent=2) + "\n", encoding="utf-8")

    summary = f"Calibration complete. best_fitness={best_fitness:.4f}"
    if test_fitness is not None:
        summary += f", test_fitness={test_fitness:.4f}"
    if gen_gap is not None:
        summary += f", gen_gap={gen_gap:.2%}"
    typer.secho(summary, fg=typer.colors.GREEN)
    typer.secho(f"Artifact: {output_path}", fg=typer.colors.GREEN)
|
omegaprompt/dataset.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Dataset loading for omegaprompt calibration.
|
|
2
|
+
|
|
3
|
+
The expected format is JSON Lines (``.jsonl``) with one item per line:
|
|
4
|
+
|
|
5
|
+
.. code-block:: json
|
|
6
|
+
|
|
7
|
+
{"id": "t1", "input": "...", "reference": "...", "metadata": {...}}
|
|
8
|
+
|
|
9
|
+
``reference`` is optional — judges can score without a reference when the
|
|
10
|
+
rubric is purely qualitative. ``metadata`` is a free-form dict that the
|
|
11
|
+
judge may consult (e.g. difficulty tier, target domain).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
from collections.abc import Iterable
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DatasetItem(BaseModel):
    """A single calibration example."""

    # ``extra="allow"``: keys beyond the declared fields are accepted.
    model_config = ConfigDict(extra="allow")

    # Identifier and model input; both are required and must be non-empty.
    id: str = Field(..., min_length=1)
    input: str = Field(..., min_length=1, description="Input given to the target model.")
    # Reference output for the judge; left as None when not supplied.
    reference: str | None = Field(
        description="Optional reference / expected output for the judge.",
        default=None,
    )
    # Free-form dict; each instance gets its own fresh dict by default.
    metadata: dict = Field(default_factory=dict)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class Dataset(BaseModel):
    """A set of ``DatasetItem`` s loaded from a JSONL file."""

    items: list[DatasetItem]

    @classmethod
    def from_jsonl(cls, path: str | Path) -> Dataset:
        """Read one item per non-blank line from ``path``.

        The file is decoded as UTF-8; a leading byte-order mark, if
        present, is skipped so it cannot break parsing of the first line.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
            ValueError: if a line is not valid JSON, a record fails
                ``DatasetItem`` validation (both prefixed with
                ``path:lineno``), the file yields zero items, or two
                items share an ``id``.
        """
        from collections import Counter

        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"Dataset not found: {p}")
        items: list[DatasetItem] = []
        # "utf-8-sig" reads plain UTF-8 identically and drops a BOM if any.
        with p.open("r", encoding="utf-8-sig") as fh:
            for lineno, raw in enumerate(fh, start=1):
                line = raw.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError as exc:
                    raise ValueError(
                        f"{p}:{lineno} - not valid JSON: {exc.msg}"
                    ) from exc
                try:
                    items.append(DatasetItem.model_validate(payload))
                except Exception as exc:
                    # Re-raised with file/line context, never swallowed.
                    raise ValueError(f"{p}:{lineno} - schema invalid: {exc}") from exc
        if not items:
            raise ValueError(f"{p} contained zero items (empty or blank-only lines).")
        # Single counting pass instead of calling list.count per id.
        id_counts = Counter(it.id for it in items)
        dupes = sorted(item_id for item_id, seen in id_counts.items() if seen > 1)
        if dupes:
            raise ValueError(f"{p} has duplicate ids: {dupes}")
        return cls(items=items)

    @classmethod
    def from_items(cls, items: Iterable[DatasetItem | dict]) -> Dataset:
        """Construct a Dataset from an in-memory iterable. Useful in tests.

        ``DatasetItem`` instances are used as-is; anything else is passed
        through ``DatasetItem.model_validate``. Unlike ``from_jsonl``, no
        emptiness or duplicate-id check is performed here.
        """
        parsed: list[DatasetItem] = []
        for it in items:
            if isinstance(it, DatasetItem):
                parsed.append(it)
            else:
                parsed.append(DatasetItem.model_validate(it))
        return cls(items=parsed)

    def __len__(self) -> int:
        """Return the number of items."""
        return len(self.items)

    def __iter__(self):
        """Iterate over the items in order."""
        # NOTE(review): this replaces the ``__iter__`` inherited from
        # BaseModel - confirm nothing relies on the inherited iteration.
        return iter(self.items)
|
omegaprompt/fitness.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Composite fitness: ``hard_gate × soft_score``.
|
|
2
|
+
|
|
3
|
+
The omegaprompt fitness rule is deliberately non-negotiable: if any hard
|
|
4
|
+
gate fails on a dataset item, that item's contribution is zero, period.
|
|
5
|
+
This is the structural defense against prompts that score beautifully on
|
|
6
|
+
the soft rubric but refuse on a subset of inputs (a common failure mode
|
|
7
|
+
with aggressive system prompts) or emit malformed output.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from collections.abc import Iterable
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
from omegaprompt.judge import JudgeResult, JudgeRubric
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class PerItemScore:
    """Immutable record of how one dataset item scored."""

    # Identifier of the dataset item this record belongs to.
    item_id: str
    # Rubric-weighted score, before hard gates are applied.
    soft_score: float
    # True when no hard gate failed for this item.
    gates_passed: bool
    # soft_score if gates_passed else 0.0
    final_score: float
    # Free-text notes carried over from the judge result.
    notes: str = ""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class CompositeFitness:
    """Turn per-item judge results into one scalar fitness value.

    The aggregate is a plain mean of per-item final scores. The per-item
    records of the most recent ``evaluate`` call are kept on
    ``last_per_item`` for callers that want a different aggregation.
    """

    def __init__(self, rubric: JudgeRubric) -> None:
        """Store the rubric used to weight each judge result."""
        self.rubric = rubric
        self.last_per_item: list[PerItemScore] = []

    def _score_item(self, item_id: str, verdict: JudgeResult) -> PerItemScore:
        """Build the score record for one (item_id, judge result) pair."""
        soft = verdict.weighted_score(self.rubric)
        cleared = not verdict.any_gate_failed()
        # A failed hard gate zeroes the item regardless of its soft score.
        final = soft if cleared else 0.0
        return PerItemScore(
            item_id=item_id,
            soft_score=soft,
            gates_passed=cleared,
            final_score=final,
            notes=verdict.notes,
        )

    def evaluate(
        self,
        judge_results: Iterable[tuple[str, JudgeResult]],
    ) -> float:
        """Score a batch of (item_id, judge_result) pairs.

        Returns the mean final score over the batch, or ``0.0`` for an
        empty batch. ``self.last_per_item`` is replaced with this batch's
        per-item records.
        """
        records = [
            self._score_item(item_id, verdict)
            for item_id, verdict in judge_results
        ]
        self.last_per_item = records
        if not records:
            return 0.0
        total = sum(record.final_score for record in records)
        return total / len(records)

    def pass_rate(self) -> float:
        """Fraction of items in the last batch that cleared every gate."""
        records = self.last_per_item
        if not records:
            return 0.0
        cleared = [record for record in records if record.gates_passed]
        return len(cleared) / len(records)
|