osmosis-ai 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of osmosis-ai might be problematic.

Files changed (38)
  1. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/PKG-INFO +46 -35
  2. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/README.md +45 -34
  3. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/__init__.py +1 -8
  4. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/config.py +21 -18
  5. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/dataset.py +28 -82
  6. osmosis_ai-0.2.4/osmosis_ai/cli_services/engine.py +421 -0
  7. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/consts.py +1 -1
  8. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/rubric_eval.py +34 -176
  9. osmosis_ai-0.2.4/osmosis_ai/utils.py +315 -0
  10. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/pyproject.toml +1 -1
  11. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/tests/test_cli.py +43 -25
  12. osmosis_ai-0.2.4/tests/test_cli_services.py +400 -0
  13. osmosis_ai-0.2.4/tests/test_rubric_eval.py +40 -0
  14. osmosis_ai-0.2.3/osmosis_ai/cli_services/engine.py +0 -251
  15. osmosis_ai-0.2.3/osmosis_ai/utils.py +0 -450
  16. osmosis_ai-0.2.3/tests/test_cli_services.py +0 -193
  17. osmosis_ai-0.2.3/tests/test_rubric_eval.py +0 -127
  18. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/LICENSE +0 -0
  19. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/MANIFEST.in +0 -0
  20. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/__init__.py +0 -0
  21. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli.py +0 -0
  22. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_commands.py +0 -0
  23. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/errors.py +0 -0
  24. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/reporting.py +0 -0
  25. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/session.py +0 -0
  26. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/shared.py +0 -0
  27. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/__init__.py +0 -0
  28. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/anthropic_provider.py +0 -0
  29. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/base.py +0 -0
  30. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/gemini_provider.py +0 -0
  31. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/openai_family.py +0 -0
  32. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/providers/shared.py +0 -0
  33. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/rubric_types.py +0 -0
  34. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai.egg-info/SOURCES.txt +0 -0
  35. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/pytest.ini +0 -0
  36. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/requirements.txt +0 -0
  37. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/setup.cfg +0 -0
  38. {osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/setup_env.bat +0 -0
{osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: osmosis-ai
- Version: 0.2.3
+ Version: 0.2.4
  Summary: A Python library for reward function validation with strict type enforcement.
  Author-email: Osmosis AI <jake@osmosis.ai>
  License: MIT License
@@ -81,23 +81,12 @@ score = simple_reward("hello world", "hello world") # Returns 1.0
  ```python
  from osmosis_ai import evaluate_rubric

- messages = [
- {
- "type": "message",
- "role": "user",
- "content": [{"type": "input_text", "text": "What is the capital of France?"}],
- },
- {
- "type": "message",
- "role": "assistant",
- "content": [{"type": "output_text", "text": "The capital of France is Paris."}],
- },
- ]
+ solution = "The capital of France is Paris."

  # Export OPENAI_API_KEY in your shell before running this snippet.
  rubric_score = evaluate_rubric(
  rubric="Assistant must mention the verified capital city.",
- messages=messages,
+ solution_str=solution,
  model_info={
  "provider": "openai",
  "model": "gpt-5",
@@ -128,13 +117,15 @@ Credentials are resolved from environment variables by default:

  Override the environment variable name with `model_info={"api_key_env": "CUSTOM_ENV_NAME"}` when needed, or supply an inline secret with `model_info={"api_key": "sk-..."}` for ephemeral credentials. Missing API keys raise a `MissingAPIKeyError` that explains how to export the secret before trying again.

+ `api_key` and `api_key_env` are mutually exclusive ways to provide the same credential. When `api_key` is present and non-empty it is used directly, skipping any environment lookup. Otherwise the resolver falls back to `api_key_env` (or the provider default) and pulls the value from your local environment with `os.getenv`.
+
  `model_info` accepts additional rubric-specific knobs:

  - `score_min` / `score_max` – change the default `[0.0, 1.0]` scoring bounds.
- - `system_prompt` / `original_input` – override the helper’s transcript inference when those entries are absent.
+ - `system_prompt` / `original_input` – provide optional context strings that will be quoted in the judging prompt.
  - `timeout` – customise the provider timeout in seconds.

- Pass `extra_info={...}` to `evaluate_rubric` when you need structured context quoted in the judge prompt, and set `return_details=True` to receive the full `RewardRubricRunResult` payload (including the provider’s raw response).
+ Pass `metadata={...}` to `evaluate_rubric` when you need structured context quoted in the judge prompt, and set `return_details=True` to receive the full `RewardRubricRunResult` payload (including the provider’s raw response).

  Remote failures surface as `ProviderRequestError` instances, with `ModelNotFoundError` reserved for missing model identifiers so you can retry with a new snapshot.

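Putting the pieces above together, a 0.2.4 call that exercises these knobs looks roughly like the sketch below. This is illustrative only: the concrete values, the chosen environment variable, and the 30-second timeout are placeholders, not defaults taken from the package.

```python
from osmosis_ai import evaluate_rubric

result = evaluate_rubric(
    rubric="Assistant must mention the verified capital city.",
    solution_str="The capital of France is Paris.",
    model_info={
        "provider": "openai",
        "model": "gpt-5",
        "api_key_env": "OPENAI_API_KEY",   # or "api_key": "sk-..." for an inline secret
        "score_min": 0.0,                  # optional scoring bounds
        "score_max": 1.0,
        "system_prompt": "You are a strict grader.",           # optional judge context
        "original_input": "What is the capital of France?",
        "timeout": 30,                     # provider timeout in seconds
    },
    metadata={"conversation_id": "ticket-001"},  # structured context quoted in the judge prompt
    return_details=True,                   # full RewardRubricRunResult instead of a bare score
)
```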
@@ -172,24 +163,35 @@ The decorator will raise a `TypeError` if the function doesn't match this exact

  ## Rubric Function Signature

- Rubric functions decorated with `@osmosis_rubric` must accept the parameters:
+ Rubric functions decorated with `@osmosis_rubric` must match this signature:
+
+ ```python
+ @osmosis_rubric
+ def your_rubric(solution_str: str, ground_truth: str | None, extra_info: dict) -> float:
+ # Your rubric logic here
+ return float_score
+ ```
+
+ > The runtime forwards `None` for `ground_truth` when no reference answer exists. Annotate the parameter as `Optional[str]` (or handle `None` explicitly) if your rubric logic expects to run in that scenario.
+
+ ### Required `extra_info` fields

- - `model_info: dict`
- - `rubric: str`
- - `messages: list`
- - `ground_truth: Optional[str] = None`
- - `system_message: Optional[str] = None`
- - `extra_info: dict = None`
- - `score_min: float = 0.0` *(optional lower bound; must default to 0.0 and stay below `score_max`)*
- - `score_max: float = 1.0` *(optional upper bound; must default to 1.0 and stay above `score_min`)*
+ - **`provider`** – Non-empty string identifying the judge provider.
+ - **`model`** – Non-empty string naming the provider model to call.
+ - **`rubric`** – Natural-language rubric instructions for the judge model.
+ - **`api_key` / `api_key_env`** – Supply either the raw key or the environment variable name that exposes it.

- and must return a `float`. The decorator validates the signature and runtime payload (including message role validation and return type) before delegating to your custom logic.
+ ### Optional `extra_info` fields

- > Required fields: `model_info` must contain non-empty `provider` and `model` string entries.
+ - **`system_prompt`** Optional string prepended to the provider’s base system prompt when invoking the judge; include it inside `extra_info` rather than as a separate argument.
+ - **`score_min` / `score_max`** – Optional numeric overrides for the expected score range.
+ - **`model_info_overrides`** – Optional dict merged into the provider configuration passed to the judge.

- > Annotation quirk: `extra_info` must be annotated as a plain `dict` with a default of `None` to satisfy the validator.
+ Additional keys are passthrough and can be used for custom configuration. If you need to extend the provider payload (for example adding `api_key_env`), add a dict under `model_info_overrides` and it will be merged with the required `provider`/`model` pair before invoking `evaluate_rubric`. The decorator enforces the parameter names/annotations, validates the embedded configuration at call time, and ensures the wrapped function returns a `float`.

- > Tip: You can call `evaluate_rubric` from inside a rubric function (or any other orchestrator) to outsource judging to a hosted model while still benefiting from the decorator’s validation.
+ > Annotation quirk: `extra_info` must be annotated as `dict` **without** a default value, unlike `@osmosis_reward`.
+
+ > Tip: When delegating to `evaluate_rubric`, pass the raw `solution_str` directly and include any extra context inside the `metadata` payload.

  ## Examples

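Read together with the tip above, a delegating rubric might look like the sketch below. It is a sketch, not code from the package: it assumes `osmosis_rubric` is importable from the package root the same way `evaluate_rubric` is, and that passing a small `metadata` dict (possibly empty) is acceptable.

```python
from osmosis_ai import evaluate_rubric, osmosis_rubric


@osmosis_rubric
def hosted_judge_rubric(solution_str: str, ground_truth: str | None, extra_info: dict) -> float:
    # Build the judge configuration from the required extra_info entries, merging
    # model_info_overrides (e.g. {"api_key_env": "OPENAI_API_KEY"}) as described above.
    model_info = {
        "provider": extra_info["provider"],
        "model": extra_info["model"],
        **extra_info.get("model_info_overrides", {}),
    }
    for key in ("score_min", "score_max", "system_prompt"):
        if key in extra_info:
            model_info.setdefault(key, extra_info[key])

    # Extra context rides along in metadata; the raw solution_str is passed straight through.
    metadata = {}
    if ground_truth is not None:
        metadata["ground_truth"] = ground_truth
    if isinstance(extra_info.get("original_input"), str):
        metadata["original_input"] = extra_info["original_input"]

    return evaluate_rubric(
        rubric=extra_info["rubric"],
        solution_str=solution_str,
        model_info=model_info,
        metadata=metadata,
    )
```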
@@ -224,8 +226,8 @@ def numeric_tolerance(solution_str: str, ground_truth: str, extra_info: dict = N

  - `examples/rubric_functions.py` demonstrates `evaluate_rubric` with OpenAI, Anthropic, Gemini, and xAI using the schema-enforced SDK integrations.
  - `examples/reward_functions.py` keeps local reward helpers that showcase the decorator contract without external calls.
- - `examples/rubric_configs.yaml` bundles two rubric definitions, each with its own provider configuration and extra prompt context.
- - `examples/sample_data.jsonl` contains two conversation payloads mapped to those rubrics so you can trial dataset validation.
+ - `examples/rubric_configs.yaml` bundles two rubric definitions with provider configuration and scoring bounds.
+ - `examples/sample_data.jsonl` contains two rubric-aligned solution strings so you can trial dataset validation.

  ```yaml
  # examples/rubric_configs.yaml (excerpt)
@@ -239,8 +241,8 @@ rubrics:
  ```

  ```jsonl
- {"conversation_id": "ticket-001", "rubric_id": "support_followup", "...": "..."}
- {"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "...": "..."}
+ {"conversation_id": "ticket-001", "rubric_id": "support_followup", "original_input": "...", "solution_str": "..."}
+ {"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "original_input": "...", "solution_str": "..."}
  ```

  ## CLI Tools
@@ -253,7 +255,7 @@ Preview a rubric file and print every configuration discovered, including nested
  osmosis preview --path path/to/rubric.yaml
  ```

- Preview a dataset of chat transcripts stored as JSONL:
+ Preview a dataset of rubric-scored solutions stored as JSONL:

  ```bash
  osmosis preview --path path/to/data.jsonl
@@ -271,6 +273,9 @@ osmosis eval --rubric support_followup --data examples/sample_data.jsonl
  - Provide `--output path/to/dir` to create the directory (if needed) and emit `rubric_eval_result_<unix_timestamp>.json`, or supply a full file path (any extension) to control the filename; each file captures every run, provider payloads, timestamps, and aggregate statistics for downstream analysis.
  - Skip `--output` to collect results under `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the CLI writes this JSON whether the evaluation finishes cleanly or hits provider/runtime errors so you can inspect failures later (only a manual Ctrl+C interrupt leaves no file behind).
  - Dataset rows whose `rubric_id` does not match the requested rubric are skipped automatically.
+ - Each dataset record must provide a non-empty `solution_str`; optional fields such as `original_input`, `ground_truth`, and `extra_info` travel with the record and are forwarded to the evaluator when present.
+ - When delegating to a custom `@osmosis_rubric` function, the CLI enriches `extra_info` with the active `provider`, `model`, `rubric`, score bounds, any configured `system_prompt`, the resolved `original_input`, and the record’s metadata/extra fields so the decorator’s required entries are always present.
+ - Rubric configuration files intentionally reject `extra_info`; provide per-example context through the dataset instead.

  Both commands validate the file, echo a short summary (`Loaded <n> ...`), and pretty-print the parsed records so you can confirm that new rubrics or test fixtures look correct before committing them. Invalid files raise a descriptive error and exit with a non-zero status code.

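In practice that enrichment means a custom `@osmosis_rubric` function run through `osmosis eval` sees an `extra_info` payload shaped roughly like the sketch below; the exact key names for the score bounds and the passthrough record fields are assumptions drawn from the bullets above, and the values are placeholders.

```python
# Illustrative extra_info as assembled by `osmosis eval` for a custom rubric function.
extra_info = {
    "provider": "openai",                                          # from the active rubric config
    "model": "gpt-5",
    "rubric": "Assistant must mention the verified capital city.",
    "score_min": 0.0,                                              # configured score bounds
    "score_max": 1.0,
    "system_prompt": "You are a strict grader.",                   # only when configured
    "original_input": "What is the capital of France?",            # resolved from the record
    "conversation_id": "ticket-001",                               # record metadata / extra fields travel along
}
```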
@@ -283,7 +288,13 @@ PYTHONPATH=. python examples/rubric_functions.py # Uncomment the provider you n

  ## Testing

- Run `python -m pytest tests/test_rubric_eval.py` to exercise the guards that ensure rubric prompts ignore message metadata (for example `tests/test_rubric_eval.py::test_collect_text_skips_metadata_fields`) while still preserving nested tool output. Add additional tests under `tests/` as you extend the library.
+ Run `python -m pytest` (or any subset under `tests/`) to exercise the updated helpers:
+
+ - `tests/test_rubric_eval.py` covers prompt construction for `solution_str` evaluations.
+ - `tests/test_cli_services.py` validates dataset parsing, extra-info enrichment, and engine interactions.
+ - `tests/test_cli.py` ensures the CLI pathways surface the new fields end to end.
+
+ Add additional tests under `tests/` as you extend the library.

  ## License

{osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/README.md

@@ -36,23 +36,12 @@ score = simple_reward("hello world", "hello world") # Returns 1.0
  ```python
  from osmosis_ai import evaluate_rubric

- messages = [
- {
- "type": "message",
- "role": "user",
- "content": [{"type": "input_text", "text": "What is the capital of France?"}],
- },
- {
- "type": "message",
- "role": "assistant",
- "content": [{"type": "output_text", "text": "The capital of France is Paris."}],
- },
- ]
+ solution = "The capital of France is Paris."

  # Export OPENAI_API_KEY in your shell before running this snippet.
  rubric_score = evaluate_rubric(
  rubric="Assistant must mention the verified capital city.",
- messages=messages,
+ solution_str=solution,
  model_info={
  "provider": "openai",
  "model": "gpt-5",
@@ -83,13 +72,15 @@ Credentials are resolved from environment variables by default:

  Override the environment variable name with `model_info={"api_key_env": "CUSTOM_ENV_NAME"}` when needed, or supply an inline secret with `model_info={"api_key": "sk-..."}` for ephemeral credentials. Missing API keys raise a `MissingAPIKeyError` that explains how to export the secret before trying again.

+ `api_key` and `api_key_env` are mutually exclusive ways to provide the same credential. When `api_key` is present and non-empty it is used directly, skipping any environment lookup. Otherwise the resolver falls back to `api_key_env` (or the provider default) and pulls the value from your local environment with `os.getenv`.
+
  `model_info` accepts additional rubric-specific knobs:

  - `score_min` / `score_max` – change the default `[0.0, 1.0]` scoring bounds.
- - `system_prompt` / `original_input` – override the helper’s transcript inference when those entries are absent.
+ - `system_prompt` / `original_input` – provide optional context strings that will be quoted in the judging prompt.
  - `timeout` – customise the provider timeout in seconds.

- Pass `extra_info={...}` to `evaluate_rubric` when you need structured context quoted in the judge prompt, and set `return_details=True` to receive the full `RewardRubricRunResult` payload (including the provider’s raw response).
+ Pass `metadata={...}` to `evaluate_rubric` when you need structured context quoted in the judge prompt, and set `return_details=True` to receive the full `RewardRubricRunResult` payload (including the provider’s raw response).

  Remote failures surface as `ProviderRequestError` instances, with `ModelNotFoundError` reserved for missing model identifiers so you can retry with a new snapshot.

@@ -127,24 +118,35 @@ The decorator will raise a `TypeError` if the function doesn't match this exact

  ## Rubric Function Signature

- Rubric functions decorated with `@osmosis_rubric` must accept the parameters:
+ Rubric functions decorated with `@osmosis_rubric` must match this signature:
+
+ ```python
+ @osmosis_rubric
+ def your_rubric(solution_str: str, ground_truth: str | None, extra_info: dict) -> float:
+ # Your rubric logic here
+ return float_score
+ ```
+
+ > The runtime forwards `None` for `ground_truth` when no reference answer exists. Annotate the parameter as `Optional[str]` (or handle `None` explicitly) if your rubric logic expects to run in that scenario.
+
+ ### Required `extra_info` fields

- - `model_info: dict`
- - `rubric: str`
- - `messages: list`
- - `ground_truth: Optional[str] = None`
- - `system_message: Optional[str] = None`
- - `extra_info: dict = None`
- - `score_min: float = 0.0` *(optional lower bound; must default to 0.0 and stay below `score_max`)*
- - `score_max: float = 1.0` *(optional upper bound; must default to 1.0 and stay above `score_min`)*
+ - **`provider`** – Non-empty string identifying the judge provider.
+ - **`model`** – Non-empty string naming the provider model to call.
+ - **`rubric`** – Natural-language rubric instructions for the judge model.
+ - **`api_key` / `api_key_env`** – Supply either the raw key or the environment variable name that exposes it.

- and must return a `float`. The decorator validates the signature and runtime payload (including message role validation and return type) before delegating to your custom logic.
+ ### Optional `extra_info` fields

- > Required fields: `model_info` must contain non-empty `provider` and `model` string entries.
+ - **`system_prompt`** Optional string prepended to the provider’s base system prompt when invoking the judge; include it inside `extra_info` rather than as a separate argument.
+ - **`score_min` / `score_max`** – Optional numeric overrides for the expected score range.
+ - **`model_info_overrides`** – Optional dict merged into the provider configuration passed to the judge.

- > Annotation quirk: `extra_info` must be annotated as a plain `dict` with a default of `None` to satisfy the validator.
+ Additional keys are passthrough and can be used for custom configuration. If you need to extend the provider payload (for example adding `api_key_env`), add a dict under `model_info_overrides` and it will be merged with the required `provider`/`model` pair before invoking `evaluate_rubric`. The decorator enforces the parameter names/annotations, validates the embedded configuration at call time, and ensures the wrapped function returns a `float`.

- > Tip: You can call `evaluate_rubric` from inside a rubric function (or any other orchestrator) to outsource judging to a hosted model while still benefiting from the decorator’s validation.
+ > Annotation quirk: `extra_info` must be annotated as `dict` **without** a default value, unlike `@osmosis_reward`.
+
+ > Tip: When delegating to `evaluate_rubric`, pass the raw `solution_str` directly and include any extra context inside the `metadata` payload.

  ## Examples

@@ -179,8 +181,8 @@ def numeric_tolerance(solution_str: str, ground_truth: str, extra_info: dict = N

  - `examples/rubric_functions.py` demonstrates `evaluate_rubric` with OpenAI, Anthropic, Gemini, and xAI using the schema-enforced SDK integrations.
  - `examples/reward_functions.py` keeps local reward helpers that showcase the decorator contract without external calls.
- - `examples/rubric_configs.yaml` bundles two rubric definitions, each with its own provider configuration and extra prompt context.
- - `examples/sample_data.jsonl` contains two conversation payloads mapped to those rubrics so you can trial dataset validation.
+ - `examples/rubric_configs.yaml` bundles two rubric definitions with provider configuration and scoring bounds.
+ - `examples/sample_data.jsonl` contains two rubric-aligned solution strings so you can trial dataset validation.

  ```yaml
  # examples/rubric_configs.yaml (excerpt)
@@ -194,8 +196,8 @@ rubrics:
  ```

  ```jsonl
- {"conversation_id": "ticket-001", "rubric_id": "support_followup", "...": "..."}
- {"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "...": "..."}
+ {"conversation_id": "ticket-001", "rubric_id": "support_followup", "original_input": "...", "solution_str": "..."}
+ {"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "original_input": "...", "solution_str": "..."}
  ```

  ## CLI Tools
@@ -208,7 +210,7 @@ Preview a rubric file and print every configuration discovered, including nested
  osmosis preview --path path/to/rubric.yaml
  ```

- Preview a dataset of chat transcripts stored as JSONL:
+ Preview a dataset of rubric-scored solutions stored as JSONL:

  ```bash
  osmosis preview --path path/to/data.jsonl
@@ -226,6 +228,9 @@ osmosis eval --rubric support_followup --data examples/sample_data.jsonl
  - Provide `--output path/to/dir` to create the directory (if needed) and emit `rubric_eval_result_<unix_timestamp>.json`, or supply a full file path (any extension) to control the filename; each file captures every run, provider payloads, timestamps, and aggregate statistics for downstream analysis.
  - Skip `--output` to collect results under `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the CLI writes this JSON whether the evaluation finishes cleanly or hits provider/runtime errors so you can inspect failures later (only a manual Ctrl+C interrupt leaves no file behind).
  - Dataset rows whose `rubric_id` does not match the requested rubric are skipped automatically.
+ - Each dataset record must provide a non-empty `solution_str`; optional fields such as `original_input`, `ground_truth`, and `extra_info` travel with the record and are forwarded to the evaluator when present.
+ - When delegating to a custom `@osmosis_rubric` function, the CLI enriches `extra_info` with the active `provider`, `model`, `rubric`, score bounds, any configured `system_prompt`, the resolved `original_input`, and the record’s metadata/extra fields so the decorator’s required entries are always present.
+ - Rubric configuration files intentionally reject `extra_info`; provide per-example context through the dataset instead.

  Both commands validate the file, echo a short summary (`Loaded <n> ...`), and pretty-print the parsed records so you can confirm that new rubrics or test fixtures look correct before committing them. Invalid files raise a descriptive error and exit with a non-zero status code.

@@ -238,7 +243,13 @@ PYTHONPATH=. python examples/rubric_functions.py # Uncomment the provider you n

  ## Testing

- Run `python -m pytest tests/test_rubric_eval.py` to exercise the guards that ensure rubric prompts ignore message metadata (for example `tests/test_rubric_eval.py::test_collect_text_skips_metadata_fields`) while still preserving nested tool output. Add additional tests under `tests/` as you extend the library.
+ Run `python -m pytest` (or any subset under `tests/`) to exercise the updated helpers:
+
+ - `tests/test_rubric_eval.py` covers prompt construction for `solution_str` evaluations.
+ - `tests/test_cli_services.py` validates dataset parsing, extra-info enrichment, and engine interactions.
+ - `tests/test_cli.py` ensures the CLI pathways surface the new fields end to end.
+
+ Add additional tests under `tests/` as you extend the library.

  ## License

{osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/__init__.py

@@ -10,13 +10,7 @@ from .config import (
  load_rubric_suite,
  render_yaml_items,
  )
- from .dataset import (
- ConversationMessage,
- DatasetLoader,
- DatasetRecord,
- load_jsonl_records,
- render_json_records,
- )
+ from .dataset import DatasetLoader, DatasetRecord, load_jsonl_records, render_json_records
  from .engine import (
  EvaluationRecordResult,
  EvaluationReport,
@@ -40,7 +34,6 @@ __all__ = [
  "BaselineStatistics",
  "CLIError",
  "ConsoleReportRenderer",
- "ConversationMessage",
  "DatasetLoader",
  "DatasetRecord",
  "EvaluationSession",
{osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/config.py

@@ -25,8 +25,7 @@ class RubricConfig:
  model_info: dict[str, Any]
  score_min: Optional[float]
  score_max: Optional[float]
- system_message: Optional[str]
- extra_info: Optional[dict[str, Any]]
+ system_prompt: Optional[str]
  original_input: Optional[str]
  ground_truth: Optional[str]
  source_label: str
@@ -195,6 +194,13 @@ def _build_document_configs(
  parsed_items.append(ParsedItem(label=item.label, payload=payload))
  if not isinstance(payload, dict):
  continue
+ if "extra_info" in payload:
+ message = (
+ f"Rubric entry in '{path}' (document {doc_index + 1}) must not include 'extra_info'."
+ )
+ if strict:
+ raise CLIError(message)
+ continue

  rubric_key_raw = payload.get("id")
  if not isinstance(rubric_key_raw, str) or not rubric_key_raw.strip():
@@ -223,14 +229,6 @@
  )
  continue

- extra_info_value = payload.get("extra_info", defaults.get("extra_info"))
- if extra_info_value is not None and not isinstance(extra_info_value, dict):
- if strict:
- raise CLIError(
- f"'extra_info' for rubric '{rubric_key}' in '{path}' must be a mapping."
- )
- continue
-
  try:
  score_min = coerce_optional_float(
  payload.get("score_min", defaults.get("score_min")),
@@ -247,8 +245,12 @@
  raise
  continue

- system_message = payload.get("system_message", defaults.get("system_message"))
+ system_prompt = payload.get("system_prompt", defaults.get("system_prompt"))
+
  original_input = payload.get("original_input", defaults.get("original_input"))
+ if not isinstance(original_input, str):
+ original_input = None
+
  ground_truth = payload.get("ground_truth", defaults.get("ground_truth"))

  label = item.label or f"document[{doc_index}]"
@@ -260,9 +262,8 @@
  model_info=copy.deepcopy(model_info),
  score_min=score_min,
  score_max=score_max,
- system_message=system_message if isinstance(system_message, str) else None,
- extra_info=copy.deepcopy(extra_info_value) if isinstance(extra_info_value, dict) else None,
- original_input=original_input if isinstance(original_input, str) else None,
+ system_prompt=system_prompt if isinstance(system_prompt, str) else None,
+ original_input=original_input,
  ground_truth=ground_truth if isinstance(ground_truth, str) else None,
  source_label=source_label,
  )
@@ -347,10 +348,9 @@ def _extract_config_defaults(document: Any, path: Path, doc_index: int) -> dict[
  if not isinstance(document, dict):
  return {
  "model_info": None,
- "extra_info": None,
  "score_min": None,
  "score_max": None,
- "system_message": None,
+ "system_prompt": None,
  "original_input": None,
  "ground_truth": None,
  }
@@ -358,15 +358,18 @@
  source = f"document[{doc_index}] in {path}"

  defaults: dict[str, Any] = {}
+ if "default_extra_info" in document:
+ raise CLIError(
+ f"Rubric config document {doc_index + 1} in {path} must not include 'default_extra_info'; extra_info is no longer supported."
+ )
  defaults["model_info"] = document.get("default_model_info")
- defaults["extra_info"] = document.get("default_extra_info")
  defaults["score_min"] = coerce_optional_float(
  document.get("default_score_min"), "default_score_min", source
  )
  defaults["score_max"] = coerce_optional_float(
  document.get("default_score_max"), "default_score_max", source
  )
- defaults["system_message"] = document.get("default_system_message")
+ defaults["system_prompt"] = document.get("default_system_prompt")
  defaults["original_input"] = document.get("default_original_input")
  defaults["ground_truth"] = document.get("default_ground_truth")
  return defaults
{osmosis_ai-0.2.3 → osmosis_ai-0.2.4}/osmosis_ai/cli_services/dataset.py

@@ -7,48 +7,7 @@ from pathlib import Path
  from typing import Any, Optional, Sequence

  from .errors import CLIError
- from .shared import coerce_optional_float, gather_text_fragments
-
-
- @dataclass(frozen=True)
- class ConversationMessage:
- """Normalized conversation message with preserved raw payload fields."""
-
- role: str
- content: Any
- metadata: dict[str, Any]
-
- def to_payload(self) -> dict[str, Any]:
- payload: dict[str, Any] = copy.deepcopy(self.metadata)
- payload["role"] = self.role
- if self.content is None:
- payload.pop("content", None)
- else:
- payload["content"] = copy.deepcopy(self.content)
- return payload
-
- def text_fragments(self) -> list[str]:
- fragments: list[str] = []
- seen: set[int] = set()
- gather_text_fragments(self.content, fragments, allow_free_strings=True, seen=seen)
- for value in self.metadata.values():
- gather_text_fragments(value, fragments, seen=seen)
- return fragments
-
- @classmethod
- def from_raw(cls, raw: dict[str, Any], *, source_label: str, index: int) -> "ConversationMessage":
- role_value = raw.get("role")
- if not isinstance(role_value, str) or not role_value.strip():
- raise CLIError(
- f"Message {index} in {source_label} must include a non-empty string 'role'."
- )
- content_value = copy.deepcopy(raw.get("content"))
- metadata: dict[str, Any] = {}
- for key, value in raw.items():
- if key in {"role", "content"}:
- continue
- metadata[str(key)] = copy.deepcopy(value)
- return cls(role=role_value.strip().lower(), content=content_value, metadata=metadata)
+ from .shared import coerce_optional_float


  @dataclass(frozen=True)
@@ -57,23 +16,16 @@ class DatasetRecord:
  rubric_id: str
  conversation_id: Optional[str]
  record_id: Optional[str]
- messages: tuple[ConversationMessage, ...]
+ solution_str: str
  ground_truth: Optional[str]
- system_message: Optional[str]
  original_input: Optional[str]
  metadata: Optional[dict[str, Any]]
  extra_info: Optional[dict[str, Any]]
  score_min: Optional[float]
  score_max: Optional[float]

- def message_payloads(self) -> list[dict[str, Any]]:
- """Return messages as provider-ready payloads."""
- return [message.to_payload() for message in self.messages]
-
- def merged_extra_info(self, config_extra: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]:
+ def merged_extra_info(self) -> Optional[dict[str, Any]]:
  merged: dict[str, Any] = {}
- if isinstance(config_extra, dict):
- merged.update(copy.deepcopy(config_extra))
  if isinstance(self.extra_info, dict):
  merged.update(copy.deepcopy(self.extra_info))
  if isinstance(self.metadata, dict) and self.metadata:
@@ -81,19 +33,15 @@
  return merged or None

  def assistant_preview(self, *, max_length: int = 140) -> Optional[str]:
- for message in reversed(self.messages):
- if message.role != "assistant":
- continue
- fragments = message.text_fragments()
- if not fragments:
- continue
- preview = " ".join(" ".join(fragments).split())
- if not preview:
- continue
- if len(preview) > max_length:
- preview = preview[: max_length - 3].rstrip() + "..."
- return preview
- return None
+ text = self.solution_str.strip()
+ if not text:
+ return None
+ preview = " ".join(text.split())
+ if not preview:
+ return None
+ if len(preview) > max_length:
+ preview = preview[: max_length - 3].rstrip() + "..."
+ return preview

  def conversation_label(self, fallback_index: int) -> str:
  if isinstance(self.conversation_id, str) and self.conversation_id.strip():
@@ -162,17 +110,29 @@ class DatasetLoader:
  metadata = payload.get("metadata") if isinstance(payload.get("metadata"), dict) else None
  extra_info = payload.get("extra_info") if isinstance(payload.get("extra_info"), dict) else None
  record_label = conversation_id or record_id or rubric_id_str or "<record>"
- messages = _parse_messages(payload.get("messages"), source_label=record_label)
+ solution_raw = payload.get("solution_str")
+ if not isinstance(solution_raw, str) or not solution_raw.strip():
+ raise CLIError(f"Record '{record_label}' must include a non-empty 'solution_str' string.")
+
+ original_input_raw = payload.get("original_input")
+ if isinstance(original_input_raw, str):
+ original_input = original_input_raw
+ else:
+ original_input = None
+
+ if original_input is None and isinstance(extra_info, dict):
+ extra_original_input = extra_info.get("original_input")
+ if isinstance(extra_original_input, str):
+ original_input = extra_original_input

  return DatasetRecord(
  payload=payload,
  rubric_id=rubric_id_str,
  conversation_id=conversation_id,
  record_id=record_id,
- messages=messages,
+ solution_str=solution_raw,
  ground_truth=payload.get("ground_truth") if isinstance(payload.get("ground_truth"), str) else None,
- system_message=payload.get("system_message") if isinstance(payload.get("system_message"), str) else None,
- original_input=payload.get("original_input") if isinstance(payload.get("original_input"), str) else None,
+ original_input=original_input,
  metadata=metadata,
  extra_info=extra_info,
  score_min=score_min,
@@ -213,17 +173,3 @@ def render_json_records(records: Sequence[dict[str, Any]]) -> str:
  segments.append("\n".join(snippet))

  return "\n".join(segments)
-
-
- def _parse_messages(messages: Any, *, source_label: str) -> tuple[ConversationMessage, ...]:
- if not isinstance(messages, list) or not messages:
- raise CLIError(f"Record '{source_label}' must include a non-empty 'messages' list.")
-
- normalized: list[ConversationMessage] = []
- for index, entry in enumerate(messages):
- if not isinstance(entry, dict):
- raise CLIError(
- f"Message {index} in {source_label} must be an object, got {type(entry).__name__}."
- )
- normalized.append(ConversationMessage.from_raw(entry, source_label=source_label, index=index))
- return tuple(normalized)
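
With `_parse_messages` and `ConversationMessage` gone, a dataset row for the loader above reduces to a flat JSON object per line. The sketch below shows one plausible record: a non-empty `solution_str` is mandatory, `rubric_id` is what `osmosis eval` matches against, and the remaining keys are the optional fields the loader inspects (values here are invented for illustration).

```python
import json

record = {
    "conversation_id": "ticket-001",
    "rubric_id": "support_followup",
    "solution_str": "Thanks for waiting. Your refund was issued this morning.",
    "original_input": "Where is my refund?",   # optional; also honoured when nested in extra_info
    "ground_truth": "Refund issued",           # optional reference answer
    "extra_info": {"channel": "email"},        # optional mapping, merged first
    "metadata": {"locale": "en-US"},           # optional mapping, merged after extra_info
}
print(json.dumps(record))  # one JSON object per line of the .jsonl dataset
```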