osmosis-ai 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of osmosis-ai might be problematic.
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/PKG-INFO +46 -35
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/README.md +45 -34
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/__init__.py +1 -8
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/config.py +21 -18
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/dataset.py +28 -82
- osmosis_ai-0.2.4/osmosis_ai/cli_services/engine.py +421 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/consts.py +1 -1
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/rubric_eval.py +34 -176
- osmosis_ai-0.2.4/osmosis_ai/utils.py +315 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/pyproject.toml +1 -1
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/tests/test_cli.py +43 -25
- osmosis_ai-0.2.4/tests/test_cli_services.py +400 -0
- osmosis_ai-0.2.4/tests/test_rubric_eval.py +40 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/engine.py +0 -251
- osmosis_ai-0.2.3/osmosis_ai/utils.py +0 -450
- osmosis_ai-0.2.3/tests/test_cli_services.py +0 -193
- osmosis_ai-0.2.3/tests/test_rubric_eval.py +0 -127
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/LICENSE +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/MANIFEST.in +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/__init__.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_commands.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/errors.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/reporting.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/session.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/shared.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/__init__.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/anthropic_provider.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/base.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/gemini_provider.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/openai_family.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/shared.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/rubric_types.py +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai.egg-info/SOURCES.txt +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/pytest.ini +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/requirements.txt +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/setup.cfg +0 -0
- {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/setup_env.bat +0 -0
--- osmosis_ai-0.2.3/PKG-INFO
+++ osmosis_ai-0.2.4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: osmosis-ai
-Version: 0.2.3
+Version: 0.2.4
 Summary: A Python library for reward function validation with strict type enforcement.
 Author-email: Osmosis AI <jake@osmosis.ai>
 License: MIT License
@@ -81,23 +81,12 @@ score = simple_reward("hello world", "hello world") # Returns 1.0
|
|
|
81
81
|
```python
|
|
82
82
|
from osmosis_ai import evaluate_rubric
|
|
83
83
|
|
|
84
|
-
|
|
85
|
-
{
|
|
86
|
-
"type": "message",
|
|
87
|
-
"role": "user",
|
|
88
|
-
"content": [{"type": "input_text", "text": "What is the capital of France?"}],
|
|
89
|
-
},
|
|
90
|
-
{
|
|
91
|
-
"type": "message",
|
|
92
|
-
"role": "assistant",
|
|
93
|
-
"content": [{"type": "output_text", "text": "The capital of France is Paris."}],
|
|
94
|
-
},
|
|
95
|
-
]
|
|
84
|
+
solution = "The capital of France is Paris."
|
|
96
85
|
|
|
97
86
|
# Export OPENAI_API_KEY in your shell before running this snippet.
|
|
98
87
|
rubric_score = evaluate_rubric(
|
|
99
88
|
rubric="Assistant must mention the verified capital city.",
|
|
100
|
-
|
|
89
|
+
solution_str=solution,
|
|
101
90
|
model_info={
|
|
102
91
|
"provider": "openai",
|
|
103
92
|
"model": "gpt-5",
|
|
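For readability, the 0.2.4 snippet from the hunk above assembles into the following runnable block. The closing braces and the final `print` call are assumptions, since the hunk is truncated at the `"model"` line.

```python
from osmosis_ai import evaluate_rubric

solution = "The capital of France is Paris."

# Export OPENAI_API_KEY in your shell before running this snippet.
rubric_score = evaluate_rubric(
    rubric="Assistant must mention the verified capital city.",
    solution_str=solution,
    model_info={
        "provider": "openai",
        "model": "gpt-5",
    },  # the hunk stops above this line; the closing syntax is assumed
)
print(rubric_score)  # a score within the configured bounds, [0.0, 1.0] by default
```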
@@ -128,13 +117,15 @@ Credentials are resolved from environment variables by default:
 
 Override the environment variable name with `model_info={"api_key_env": "CUSTOM_ENV_NAME"}` when needed, or supply an inline secret with `model_info={"api_key": "sk-..."}` for ephemeral credentials. Missing API keys raise a `MissingAPIKeyError` that explains how to export the secret before trying again.
 
+`api_key` and `api_key_env` are mutually exclusive ways to provide the same credential. When `api_key` is present and non-empty it is used directly, skipping any environment lookup. Otherwise the resolver falls back to `api_key_env` (or the provider default) and pulls the value from your local environment with `os.getenv`.
+
 `model_info` accepts additional rubric-specific knobs:
 
 - `score_min` / `score_max` – change the default `[0.0, 1.0]` scoring bounds.
-- `system_prompt` / `original_input` –
+- `system_prompt` / `original_input` – provide optional context strings that will be quoted in the judging prompt.
 - `timeout` – customise the provider timeout in seconds.
 
-Pass `
+Pass `metadata={...}` to `evaluate_rubric` when you need structured context quoted in the judge prompt, and set `return_details=True` to receive the full `RewardRubricRunResult` payload (including the provider’s raw response).
 
 Remote failures surface as `ProviderRequestError` instances, with `ModelNotFoundError` reserved for missing model identifiers so you can retry with a new snapshot.
 
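The new credential paragraph describes a simple precedence; the sketch below restates it as code. `resolve_api_key` is a hypothetical helper written for illustration, not the library's internal resolver.

```python
import os
from typing import Optional


def resolve_api_key(model_info: dict, provider_default_env: str) -> Optional[str]:
    """Hypothetical illustration of the precedence described above."""
    inline = model_info.get("api_key")
    if isinstance(inline, str) and inline:
        return inline  # non-empty inline secret wins; no environment lookup
    env_name = model_info.get("api_key_env") or provider_default_env
    return os.getenv(env_name)  # fall back to the named (or provider default) env var


# resolve_api_key({"api_key": "sk-..."}, "OPENAI_API_KEY")             -> "sk-..."
# resolve_api_key({"api_key_env": "CUSTOM_ENV_NAME"}, "OPENAI_API_KEY") -> value of CUSTOM_ENV_NAME
```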
@@ -172,24 +163,35 @@ The decorator will raise a `TypeError` if the function doesn't match this exact
 
 ## Rubric Function Signature
 
-Rubric functions decorated with `@osmosis_rubric` must
+Rubric functions decorated with `@osmosis_rubric` must match this signature:
+
+```python
+@osmosis_rubric
+def your_rubric(solution_str: str, ground_truth: str | None, extra_info: dict) -> float:
+    # Your rubric logic here
+    return float_score
+```
+
+> The runtime forwards `None` for `ground_truth` when no reference answer exists. Annotate the parameter as `Optional[str]` (or handle `None` explicitly) if your rubric logic expects to run in that scenario.
+
+### Required `extra_info` fields
 
--
--
--
-- `
-- `system_message: Optional[str] = None`
-- `extra_info: dict = None`
-- `score_min: float = 0.0` *(optional lower bound; must default to 0.0 and stay below `score_max`)*
-- `score_max: float = 1.0` *(optional upper bound; must default to 1.0 and stay above `score_min`)*
+- **`provider`** – Non-empty string identifying the judge provider.
+- **`model`** – Non-empty string naming the provider model to call.
+- **`rubric`** – Natural-language rubric instructions for the judge model.
+- **`api_key` / `api_key_env`** – Supply either the raw key or the environment variable name that exposes it.
 
-
+### Optional `extra_info` fields
 
-
+- **`system_prompt`** – Optional string prepended to the provider’s base system prompt when invoking the judge; include it inside `extra_info` rather than as a separate argument.
+- **`score_min` / `score_max`** – Optional numeric overrides for the expected score range.
+- **`model_info_overrides`** – Optional dict merged into the provider configuration passed to the judge.
 
-
+Additional keys are passthrough and can be used for custom configuration. If you need to extend the provider payload (for example adding `api_key_env`), add a dict under `model_info_overrides` and it will be merged with the required `provider`/`model` pair before invoking `evaluate_rubric`. The decorator enforces the parameter names/annotations, validates the embedded configuration at call time, and ensures the wrapped function returns a `float`.
 
->
+> Annotation quirk: `extra_info` must be annotated as `dict` **without** a default value, unlike `@osmosis_reward`.
+
+> Tip: When delegating to `evaluate_rubric`, pass the raw `solution_str` directly and include any extra context inside the `metadata` payload.
 
 ## Examples
 
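Putting the contract above together, here is a minimal sketch of an `@osmosis_rubric` function that reads the required `extra_info` entries and delegates to `evaluate_rubric`. It assumes `osmosis_rubric` is importable from the package top level alongside `evaluate_rubric`, and the way `model_info` is assembled here is illustrative rather than the library's own delegation logic.

```python
from typing import Optional

from osmosis_ai import evaluate_rubric, osmosis_rubric  # top-level import is assumed


@osmosis_rubric
def capital_rubric(solution_str: str, ground_truth: Optional[str], extra_info: dict) -> float:
    # Required extra_info entries per the README: provider, model, rubric,
    # and either api_key or api_key_env.
    model_info = {
        "provider": extra_info["provider"],
        "model": extra_info["model"],
    }
    for key in ("api_key", "api_key_env"):
        if key in extra_info:
            model_info[key] = extra_info[key]
    # Optional: merge any extra provider configuration supplied by the caller.
    model_info.update(extra_info.get("model_info_overrides", {}))

    kwargs = {}
    if ground_truth is not None:
        # Per the tip above, extra context belongs in the metadata payload.
        kwargs["metadata"] = {"ground_truth": ground_truth}

    return evaluate_rubric(
        rubric=extra_info["rubric"],
        solution_str=solution_str,  # raw solution text passed straight through
        model_info=model_info,
        **kwargs,
    )
```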
@@ -224,8 +226,8 @@ def numeric_tolerance(solution_str: str, ground_truth: str, extra_info: dict = N
 
 - `examples/rubric_functions.py` demonstrates `evaluate_rubric` with OpenAI, Anthropic, Gemini, and xAI using the schema-enforced SDK integrations.
 - `examples/reward_functions.py` keeps local reward helpers that showcase the decorator contract without external calls.
-- `examples/rubric_configs.yaml` bundles two rubric definitions
-- `examples/sample_data.jsonl` contains two
+- `examples/rubric_configs.yaml` bundles two rubric definitions with provider configuration and scoring bounds.
+- `examples/sample_data.jsonl` contains two rubric-aligned solution strings so you can trial dataset validation.
 
 ```yaml
 # examples/rubric_configs.yaml (excerpt)
@@ -239,8 +241,8 @@ rubrics:
 ```
 
 ```jsonl
-{"conversation_id": "ticket-001", "rubric_id": "support_followup", "...": "..."}
-{"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "...": "..."}
+{"conversation_id": "ticket-001", "rubric_id": "support_followup", "original_input": "...", "solution_str": "..."}
+{"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "original_input": "...", "solution_str": "..."}
 ```
 
 ## CLI Tools
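The JSONL rows above elide their payloads with `"..."`. Below is a sketch of writing one fully populated row in the same 0.2.4 shape; the values are invented for illustration and only the field names come from the diff.

```python
import json

record = {
    "conversation_id": "ticket-001",
    "rubric_id": "support_followup",
    "original_input": "Customer asks when their refund will arrive.",              # illustrative value
    "solution_str": "Your refund was issued today and should post in 3-5 days.",   # must be non-empty
    "extra_info": {"channel": "email"},                                             # optional; forwarded when present
}

with open("sample_data.jsonl", "a", encoding="utf-8") as handle:
    handle.write(json.dumps(record) + "\n")
```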
@@ -253,7 +255,7 @@ Preview a rubric file and print every configuration discovered, including nested
 osmosis preview --path path/to/rubric.yaml
 ```
 
-Preview a dataset of
+Preview a dataset of rubric-scored solutions stored as JSONL:
 
 ```bash
 osmosis preview --path path/to/data.jsonl
@@ -271,6 +273,9 @@ osmosis eval --rubric support_followup --data examples/sample_data.jsonl
 - Provide `--output path/to/dir` to create the directory (if needed) and emit `rubric_eval_result_<unix_timestamp>.json`, or supply a full file path (any extension) to control the filename; each file captures every run, provider payloads, timestamps, and aggregate statistics for downstream analysis.
 - Skip `--output` to collect results under `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the CLI writes this JSON whether the evaluation finishes cleanly or hits provider/runtime errors so you can inspect failures later (only a manual Ctrl+C interrupt leaves no file behind).
 - Dataset rows whose `rubric_id` does not match the requested rubric are skipped automatically.
+- Each dataset record must provide a non-empty `solution_str`; optional fields such as `original_input`, `ground_truth`, and `extra_info` travel with the record and are forwarded to the evaluator when present.
+- When delegating to a custom `@osmosis_rubric` function, the CLI enriches `extra_info` with the active `provider`, `model`, `rubric`, score bounds, any configured `system_prompt`, the resolved `original_input`, and the record’s metadata/extra fields so the decorator’s required entries are always present.
+- Rubric configuration files intentionally reject `extra_info`; provide per-example context through the dataset instead.
 
 Both commands validate the file, echo a short summary (`Loaded <n> ...`), and pretty-print the parsed records so you can confirm that new rubrics or test fixtures look correct before committing them. Invalid files raise a descriptive error and exit with a non-zero status code.
 
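To make the enrichment bullet above concrete, this is roughly the `extra_info` a custom `@osmosis_rubric` function could receive from the CLI for one record. The keys follow the bullet; the values and exact layout are assumptions, not taken from the CLI source.

```python
# Illustrative shape only; assembled from the bullet above.
enriched_extra_info = {
    "provider": "openai",
    "model": "gpt-5",
    "rubric": "Assistant must mention the verified capital city.",
    "score_min": 0.0,
    "score_max": 1.0,
    "system_prompt": "You are a strict grader.",          # only when configured
    "original_input": "What is the capital of France?",   # resolved from the record
    "channel": "email",                                    # record metadata/extra fields merged in
}
```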
@@ -283,7 +288,13 @@ PYTHONPATH=. python examples/rubric_functions.py # Uncomment the provider you n
 
 ## Testing
 
-Run `python -m pytest
+Run `python -m pytest` (or any subset under `tests/`) to exercise the updated helpers:
+
+- `tests/test_rubric_eval.py` covers prompt construction for `solution_str` evaluations.
+- `tests/test_cli_services.py` validates dataset parsing, extra-info enrichment, and engine interactions.
+- `tests/test_cli.py` ensures the CLI pathways surface the new fields end to end.
+
+Add additional tests under `tests/` as you extend the library.
 
 ## License
 
--- osmosis_ai-0.2.3/README.md
+++ osmosis_ai-0.2.4/README.md
@@ -36,23 +36,12 @@ score = simple_reward("hello world", "hello world") # Returns 1.0
 ```python
 from osmosis_ai import evaluate_rubric
 
-
-    {
-        "type": "message",
-        "role": "user",
-        "content": [{"type": "input_text", "text": "What is the capital of France?"}],
-    },
-    {
-        "type": "message",
-        "role": "assistant",
-        "content": [{"type": "output_text", "text": "The capital of France is Paris."}],
-    },
-]
+solution = "The capital of France is Paris."
 
 # Export OPENAI_API_KEY in your shell before running this snippet.
 rubric_score = evaluate_rubric(
     rubric="Assistant must mention the verified capital city.",
-
+    solution_str=solution,
     model_info={
         "provider": "openai",
         "model": "gpt-5",
@@ -83,13 +72,15 @@ Credentials are resolved from environment variables by default:
 
 Override the environment variable name with `model_info={"api_key_env": "CUSTOM_ENV_NAME"}` when needed, or supply an inline secret with `model_info={"api_key": "sk-..."}` for ephemeral credentials. Missing API keys raise a `MissingAPIKeyError` that explains how to export the secret before trying again.
 
+`api_key` and `api_key_env` are mutually exclusive ways to provide the same credential. When `api_key` is present and non-empty it is used directly, skipping any environment lookup. Otherwise the resolver falls back to `api_key_env` (or the provider default) and pulls the value from your local environment with `os.getenv`.
+
 `model_info` accepts additional rubric-specific knobs:
 
 - `score_min` / `score_max` – change the default `[0.0, 1.0]` scoring bounds.
-- `system_prompt` / `original_input` –
+- `system_prompt` / `original_input` – provide optional context strings that will be quoted in the judging prompt.
 - `timeout` – customise the provider timeout in seconds.
 
-Pass `
+Pass `metadata={...}` to `evaluate_rubric` when you need structured context quoted in the judge prompt, and set `return_details=True` to receive the full `RewardRubricRunResult` payload (including the provider’s raw response).
 
 Remote failures surface as `ProviderRequestError` instances, with `ModelNotFoundError` reserved for missing model identifiers so you can retry with a new snapshot.
 
@@ -127,24 +118,35 @@ The decorator will raise a `TypeError` if the function doesn't match this exact
 
 ## Rubric Function Signature
 
-Rubric functions decorated with `@osmosis_rubric` must
+Rubric functions decorated with `@osmosis_rubric` must match this signature:
+
+```python
+@osmosis_rubric
+def your_rubric(solution_str: str, ground_truth: str | None, extra_info: dict) -> float:
+    # Your rubric logic here
+    return float_score
+```
+
+> The runtime forwards `None` for `ground_truth` when no reference answer exists. Annotate the parameter as `Optional[str]` (or handle `None` explicitly) if your rubric logic expects to run in that scenario.
+
+### Required `extra_info` fields
 
--
--
--
-- `
-- `system_message: Optional[str] = None`
-- `extra_info: dict = None`
-- `score_min: float = 0.0` *(optional lower bound; must default to 0.0 and stay below `score_max`)*
-- `score_max: float = 1.0` *(optional upper bound; must default to 1.0 and stay above `score_min`)*
+- **`provider`** – Non-empty string identifying the judge provider.
+- **`model`** – Non-empty string naming the provider model to call.
+- **`rubric`** – Natural-language rubric instructions for the judge model.
+- **`api_key` / `api_key_env`** – Supply either the raw key or the environment variable name that exposes it.
 
-
+### Optional `extra_info` fields
 
-
+- **`system_prompt`** – Optional string prepended to the provider’s base system prompt when invoking the judge; include it inside `extra_info` rather than as a separate argument.
+- **`score_min` / `score_max`** – Optional numeric overrides for the expected score range.
+- **`model_info_overrides`** – Optional dict merged into the provider configuration passed to the judge.
 
-
+Additional keys are passthrough and can be used for custom configuration. If you need to extend the provider payload (for example adding `api_key_env`), add a dict under `model_info_overrides` and it will be merged with the required `provider`/`model` pair before invoking `evaluate_rubric`. The decorator enforces the parameter names/annotations, validates the embedded configuration at call time, and ensures the wrapped function returns a `float`.
 
->
+> Annotation quirk: `extra_info` must be annotated as `dict` **without** a default value, unlike `@osmosis_reward`.
+
+> Tip: When delegating to `evaluate_rubric`, pass the raw `solution_str` directly and include any extra context inside the `metadata` payload.
 
 ## Examples
 
@@ -179,8 +181,8 @@ def numeric_tolerance(solution_str: str, ground_truth: str, extra_info: dict = N
 
 - `examples/rubric_functions.py` demonstrates `evaluate_rubric` with OpenAI, Anthropic, Gemini, and xAI using the schema-enforced SDK integrations.
 - `examples/reward_functions.py` keeps local reward helpers that showcase the decorator contract without external calls.
-- `examples/rubric_configs.yaml` bundles two rubric definitions
-- `examples/sample_data.jsonl` contains two
+- `examples/rubric_configs.yaml` bundles two rubric definitions with provider configuration and scoring bounds.
+- `examples/sample_data.jsonl` contains two rubric-aligned solution strings so you can trial dataset validation.
 
 ```yaml
 # examples/rubric_configs.yaml (excerpt)
@@ -194,8 +196,8 @@ rubrics:
 ```
 
 ```jsonl
-{"conversation_id": "ticket-001", "rubric_id": "support_followup", "...": "..."}
-{"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "...": "..."}
+{"conversation_id": "ticket-001", "rubric_id": "support_followup", "original_input": "...", "solution_str": "..."}
+{"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "original_input": "...", "solution_str": "..."}
 ```
 
 ## CLI Tools
@@ -208,7 +210,7 @@ Preview a rubric file and print every configuration discovered, including nested
 osmosis preview --path path/to/rubric.yaml
 ```
 
-Preview a dataset of
+Preview a dataset of rubric-scored solutions stored as JSONL:
 
 ```bash
 osmosis preview --path path/to/data.jsonl
@@ -226,6 +228,9 @@ osmosis eval --rubric support_followup --data examples/sample_data.jsonl
 - Provide `--output path/to/dir` to create the directory (if needed) and emit `rubric_eval_result_<unix_timestamp>.json`, or supply a full file path (any extension) to control the filename; each file captures every run, provider payloads, timestamps, and aggregate statistics for downstream analysis.
 - Skip `--output` to collect results under `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the CLI writes this JSON whether the evaluation finishes cleanly or hits provider/runtime errors so you can inspect failures later (only a manual Ctrl+C interrupt leaves no file behind).
 - Dataset rows whose `rubric_id` does not match the requested rubric are skipped automatically.
+- Each dataset record must provide a non-empty `solution_str`; optional fields such as `original_input`, `ground_truth`, and `extra_info` travel with the record and are forwarded to the evaluator when present.
+- When delegating to a custom `@osmosis_rubric` function, the CLI enriches `extra_info` with the active `provider`, `model`, `rubric`, score bounds, any configured `system_prompt`, the resolved `original_input`, and the record’s metadata/extra fields so the decorator’s required entries are always present.
+- Rubric configuration files intentionally reject `extra_info`; provide per-example context through the dataset instead.
 
 Both commands validate the file, echo a short summary (`Loaded <n> ...`), and pretty-print the parsed records so you can confirm that new rubrics or test fixtures look correct before committing them. Invalid files raise a descriptive error and exit with a non-zero status code.
 
@@ -238,7 +243,13 @@ PYTHONPATH=. python examples/rubric_functions.py # Uncomment the provider you n
 
 ## Testing
 
-Run `python -m pytest
+Run `python -m pytest` (or any subset under `tests/`) to exercise the updated helpers:
+
+- `tests/test_rubric_eval.py` covers prompt construction for `solution_str` evaluations.
+- `tests/test_cli_services.py` validates dataset parsing, extra-info enrichment, and engine interactions.
+- `tests/test_cli.py` ensures the CLI pathways surface the new fields end to end.
+
+Add additional tests under `tests/` as you extend the library.
 
 ## License
 
--- osmosis_ai-0.2.3/osmosis_ai/cli_services/__init__.py
+++ osmosis_ai-0.2.4/osmosis_ai/cli_services/__init__.py
@@ -10,13 +10,7 @@ from .config import (
     load_rubric_suite,
     render_yaml_items,
 )
-from .dataset import (
-    ConversationMessage,
-    DatasetLoader,
-    DatasetRecord,
-    load_jsonl_records,
-    render_json_records,
-)
+from .dataset import DatasetLoader, DatasetRecord, load_jsonl_records, render_json_records
 from .engine import (
     EvaluationRecordResult,
     EvaluationReport,
@@ -40,7 +34,6 @@ __all__ = [
     "BaselineStatistics",
     "CLIError",
     "ConsoleReportRenderer",
-    "ConversationMessage",
     "DatasetLoader",
     "DatasetRecord",
     "EvaluationSession",
--- osmosis_ai-0.2.3/osmosis_ai/cli_services/config.py
+++ osmosis_ai-0.2.4/osmosis_ai/cli_services/config.py
@@ -25,8 +25,7 @@ class RubricConfig:
     model_info: dict[str, Any]
     score_min: Optional[float]
     score_max: Optional[float]
-
-    extra_info: Optional[dict[str, Any]]
+    system_prompt: Optional[str]
     original_input: Optional[str]
     ground_truth: Optional[str]
     source_label: str
@@ -195,6 +194,13 @@ def _build_document_configs(
         parsed_items.append(ParsedItem(label=item.label, payload=payload))
         if not isinstance(payload, dict):
             continue
+        if "extra_info" in payload:
+            message = (
+                f"Rubric entry in '{path}' (document {doc_index + 1}) must not include 'extra_info'."
+            )
+            if strict:
+                raise CLIError(message)
+            continue
 
         rubric_key_raw = payload.get("id")
         if not isinstance(rubric_key_raw, str) or not rubric_key_raw.strip():
@@ -223,14 +229,6 @@ def _build_document_configs(
                 )
             continue
 
-        extra_info_value = payload.get("extra_info", defaults.get("extra_info"))
-        if extra_info_value is not None and not isinstance(extra_info_value, dict):
-            if strict:
-                raise CLIError(
-                    f"'extra_info' for rubric '{rubric_key}' in '{path}' must be a mapping."
-                )
-            continue
-
         try:
             score_min = coerce_optional_float(
                 payload.get("score_min", defaults.get("score_min")),
@@ -247,8 +245,12 @@ def _build_document_configs(
                 raise
             continue
 
-
+        system_prompt = payload.get("system_prompt", defaults.get("system_prompt"))
+
         original_input = payload.get("original_input", defaults.get("original_input"))
+        if not isinstance(original_input, str):
+            original_input = None
+
         ground_truth = payload.get("ground_truth", defaults.get("ground_truth"))
 
         label = item.label or f"document[{doc_index}]"
@@ -260,9 +262,8 @@ def _build_document_configs(
             model_info=copy.deepcopy(model_info),
             score_min=score_min,
             score_max=score_max,
-
-
-            original_input=original_input if isinstance(original_input, str) else None,
+            system_prompt=system_prompt if isinstance(system_prompt, str) else None,
+            original_input=original_input,
             ground_truth=ground_truth if isinstance(ground_truth, str) else None,
             source_label=source_label,
         )
@@ -347,10 +348,9 @@ def _extract_config_defaults(document: Any, path: Path, doc_index: int) -> dict[
     if not isinstance(document, dict):
         return {
             "model_info": None,
-            "extra_info": None,
             "score_min": None,
             "score_max": None,
-            "
+            "system_prompt": None,
             "original_input": None,
             "ground_truth": None,
         }
@@ -358,15 +358,18 @@ def _extract_config_defaults(document: Any, path: Path, doc_index: int) -> dict[
     source = f"document[{doc_index}] in {path}"
 
     defaults: dict[str, Any] = {}
+    if "default_extra_info" in document:
+        raise CLIError(
+            f"Rubric config document {doc_index + 1} in {path} must not include 'default_extra_info'; extra_info is no longer supported."
+        )
     defaults["model_info"] = document.get("default_model_info")
-    defaults["extra_info"] = document.get("default_extra_info")
     defaults["score_min"] = coerce_optional_float(
         document.get("default_score_min"), "default_score_min", source
     )
     defaults["score_max"] = coerce_optional_float(
         document.get("default_score_max"), "default_score_max", source
    )
-    defaults["
+    defaults["system_prompt"] = document.get("default_system_prompt")
     defaults["original_input"] = document.get("default_original_input")
     defaults["ground_truth"] = document.get("default_ground_truth")
     return defaults
--- osmosis_ai-0.2.3/osmosis_ai/cli_services/dataset.py
+++ osmosis_ai-0.2.4/osmosis_ai/cli_services/dataset.py
@@ -7,48 +7,7 @@ from pathlib import Path
 from typing import Any, Optional, Sequence
 
 from .errors import CLIError
-from .shared import coerce_optional_float
-
-
-@dataclass(frozen=True)
-class ConversationMessage:
-    """Normalized conversation message with preserved raw payload fields."""
-
-    role: str
-    content: Any
-    metadata: dict[str, Any]
-
-    def to_payload(self) -> dict[str, Any]:
-        payload: dict[str, Any] = copy.deepcopy(self.metadata)
-        payload["role"] = self.role
-        if self.content is None:
-            payload.pop("content", None)
-        else:
-            payload["content"] = copy.deepcopy(self.content)
-        return payload
-
-    def text_fragments(self) -> list[str]:
-        fragments: list[str] = []
-        seen: set[int] = set()
-        gather_text_fragments(self.content, fragments, allow_free_strings=True, seen=seen)
-        for value in self.metadata.values():
-            gather_text_fragments(value, fragments, seen=seen)
-        return fragments
-
-    @classmethod
-    def from_raw(cls, raw: dict[str, Any], *, source_label: str, index: int) -> "ConversationMessage":
-        role_value = raw.get("role")
-        if not isinstance(role_value, str) or not role_value.strip():
-            raise CLIError(
-                f"Message {index} in {source_label} must include a non-empty string 'role'."
-            )
-        content_value = copy.deepcopy(raw.get("content"))
-        metadata: dict[str, Any] = {}
-        for key, value in raw.items():
-            if key in {"role", "content"}:
-                continue
-            metadata[str(key)] = copy.deepcopy(value)
-        return cls(role=role_value.strip().lower(), content=content_value, metadata=metadata)
+from .shared import coerce_optional_float
 
 
 @dataclass(frozen=True)
@@ -57,23 +16,16 @@ class DatasetRecord:
     rubric_id: str
     conversation_id: Optional[str]
     record_id: Optional[str]
-
+    solution_str: str
     ground_truth: Optional[str]
-    system_message: Optional[str]
     original_input: Optional[str]
     metadata: Optional[dict[str, Any]]
     extra_info: Optional[dict[str, Any]]
     score_min: Optional[float]
     score_max: Optional[float]
 
-    def
-        """Return messages as provider-ready payloads."""
-        return [message.to_payload() for message in self.messages]
-
-    def merged_extra_info(self, config_extra: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]:
+    def merged_extra_info(self) -> Optional[dict[str, Any]]:
         merged: dict[str, Any] = {}
-        if isinstance(config_extra, dict):
-            merged.update(copy.deepcopy(config_extra))
         if isinstance(self.extra_info, dict):
             merged.update(copy.deepcopy(self.extra_info))
         if isinstance(self.metadata, dict) and self.metadata:
@@ -81,19 +33,15 @@ class DatasetRecord:
         return merged or None
 
     def assistant_preview(self, *, max_length: int = 140) -> Optional[str]:
-
-
-
-
-
-
-
-
-
-        if len(preview) > max_length:
-            preview = preview[: max_length - 3].rstrip() + "..."
-        return preview
-        return None
+        text = self.solution_str.strip()
+        if not text:
+            return None
+        preview = " ".join(text.split())
+        if not preview:
+            return None
+        if len(preview) > max_length:
+            preview = preview[: max_length - 3].rstrip() + "..."
+        return preview
 
     def conversation_label(self, fallback_index: int) -> str:
         if isinstance(self.conversation_id, str) and self.conversation_id.strip():
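A quick usage sketch of the reshaped `DatasetRecord` from the hunks above: `merged_extra_info()` no longer takes a config-level mapping and `assistant_preview()` now previews `solution_str`. Constructing the record by hand like this is illustrative only; the CLI normally builds records through `DatasetLoader`, and the field values here are invented.

```python
from osmosis_ai.cli_services import DatasetRecord

record = DatasetRecord(
    payload={},
    rubric_id="support_followup",
    conversation_id="ticket-001",
    record_id=None,
    solution_str="Your refund was issued today.\nIt should post within 3-5 business days.",
    ground_truth=None,
    original_input=None,
    metadata=None,
    extra_info={"channel": "email"},
    score_min=None,
    score_max=None,
)

print(record.merged_extra_info())   # {'channel': 'email'}
print(record.assistant_preview())   # whitespace-collapsed preview of solution_str
```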
@@ -162,17 +110,29 @@ class DatasetLoader:
         metadata = payload.get("metadata") if isinstance(payload.get("metadata"), dict) else None
         extra_info = payload.get("extra_info") if isinstance(payload.get("extra_info"), dict) else None
         record_label = conversation_id or record_id or rubric_id_str or "<record>"
-
+        solution_raw = payload.get("solution_str")
+        if not isinstance(solution_raw, str) or not solution_raw.strip():
+            raise CLIError(f"Record '{record_label}' must include a non-empty 'solution_str' string.")
+
+        original_input_raw = payload.get("original_input")
+        if isinstance(original_input_raw, str):
+            original_input = original_input_raw
+        else:
+            original_input = None
+
+        if original_input is None and isinstance(extra_info, dict):
+            extra_original_input = extra_info.get("original_input")
+            if isinstance(extra_original_input, str):
+                original_input = extra_original_input
 
         return DatasetRecord(
             payload=payload,
             rubric_id=rubric_id_str,
             conversation_id=conversation_id,
             record_id=record_id,
-
+            solution_str=solution_raw,
             ground_truth=payload.get("ground_truth") if isinstance(payload.get("ground_truth"), str) else None,
-
-            original_input=payload.get("original_input") if isinstance(payload.get("original_input"), str) else None,
+            original_input=original_input,
             metadata=metadata,
             extra_info=extra_info,
             score_min=score_min,
@@ -213,17 +173,3 @@ def render_json_records(records: Sequence[dict[str, Any]]) -> str:
         segments.append("\n".join(snippet))
 
     return "\n".join(segments)
-
-
-def _parse_messages(messages: Any, *, source_label: str) -> tuple[ConversationMessage, ...]:
-    if not isinstance(messages, list) or not messages:
-        raise CLIError(f"Record '{source_label}' must include a non-empty 'messages' list.")
-
-    normalized: list[ConversationMessage] = []
-    for index, entry in enumerate(messages):
-        if not isinstance(entry, dict):
-            raise CLIError(
-                f"Message {index} in {source_label} must be an object, got {type(entry).__name__}."
-            )
-        normalized.append(ConversationMessage.from_raw(entry, source_label=source_label, index=index))
-    return tuple(normalized)