osmosis-ai 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/PKG-INFO +64 -2
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/README.md +53 -0
- osmosis_ai-0.2.3/osmosis_ai/cli.py +50 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_commands.py +181 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/__init__.py +67 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/config.py +407 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/dataset.py +229 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/engine.py +251 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/errors.py +7 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/reporting.py +307 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/session.py +174 -0
- osmosis_ai-0.2.3/osmosis_ai/cli_services/shared.py +209 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/providers/gemini_provider.py +73 -28
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/rubric_eval.py +27 -66
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/utils.py +0 -4
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai.egg-info/SOURCES.txt +12 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/pyproject.toml +18 -3
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/requirements.txt +3 -1
- osmosis_ai-0.2.3/tests/test_cli.py +510 -0
- osmosis_ai-0.2.3/tests/test_cli_services.py +193 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/LICENSE +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/MANIFEST.in +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/__init__.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/consts.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/providers/__init__.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/providers/anthropic_provider.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/providers/base.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/providers/openai_family.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/providers/shared.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/osmosis_ai/rubric_types.py +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/pytest.ini +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/setup.cfg +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/setup_env.bat +0 -0
- {osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/tests/test_rubric_eval.py +0 -0
{osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: osmosis-ai
-Version: 0.2.2
+Version: 0.2.3
 Summary: A Python library for reward function validation with strict type enforcement.
 Author-email: Osmosis AI <jake@osmosis.ai>
 License: MIT License
@@ -29,9 +29,18 @@ Project-URL: Issues, https://github.com/Osmosis-AI/osmosis-sdk-python/issues
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: PyYAML<7.0,>=6.0
+Requires-Dist: python-dotenv<2.0.0,>=0.1.0
+Requires-Dist: requests<3.0.0,>=2.0.0
+Requires-Dist: xxhash<4.0.0,>=3.0.0
+Requires-Dist: anthropic<0.50.0,>=0.36.0
+Requires-Dist: openai>=2.0.0
+Requires-Dist: google-genai>=1.0.0
+Requires-Dist: xai-sdk>=1.2.0
+Requires-Dist: tqdm<5.0.0,>=4.0.0
 Dynamic: license-file
 
 # osmosis-ai
@@ -44,6 +53,10 @@ A Python library that provides reward and rubric validation helpers for LLM appl
 pip install osmosis-ai
 ```
 
+Requires Python 3.9 or newer.
+
+This installs the Osmosis CLI and pulls in the required provider SDKs (`openai`, `anthropic`, `google-genai`, `xai-sdk`) along with supporting utilities such as `PyYAML`, `python-dotenv`, `requests`, and `xxhash`.
+
 For development:
 ```bash
 git clone https://github.com/Osmosis-AI/osmosis-sdk-python
@@ -211,6 +224,55 @@ def numeric_tolerance(solution_str: str, ground_truth: str, extra_info: dict = N
 
 - `examples/rubric_functions.py` demonstrates `evaluate_rubric` with OpenAI, Anthropic, Gemini, and xAI using the schema-enforced SDK integrations.
 - `examples/reward_functions.py` keeps local reward helpers that showcase the decorator contract without external calls.
+- `examples/rubric_configs.yaml` bundles two rubric definitions, each with its own provider configuration and extra prompt context.
+- `examples/sample_data.jsonl` contains two conversation payloads mapped to those rubrics so you can trial dataset validation.
+
+```yaml
+# examples/rubric_configs.yaml (excerpt)
+version: 1
+rubrics:
+  - id: support_followup
+    model_info:
+      provider: openai
+      model: gpt-5-mini
+      api_key_env: OPENAI_API_KEY
+```
+
+```jsonl
+{"conversation_id": "ticket-001", "rubric_id": "support_followup", "...": "..."}
+{"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "...": "..."}
+```
+
+## CLI Tools
+
+Installing the SDK also provides a lightweight CLI available as `osmosis` (aliases: `osmosis_ai`, `osmosis-ai`) for inspecting rubric YAML files and JSONL test payloads.
+
+Preview a rubric file and print every configuration discovered, including nested entries:
+
+```bash
+osmosis preview --path path/to/rubric.yaml
+```
+
+Preview a dataset of chat transcripts stored as JSONL:
+
+```bash
+osmosis preview --path path/to/data.jsonl
+```
+
+Evaluate a dataset against a hosted rubric configuration and print the returned scores:
+
+```bash
+osmosis eval --rubric support_followup --data examples/sample_data.jsonl
+```
+
+- Supply the dataset with `-d`/`--data path/to/data.jsonl`; the path is resolved relative to the current working directory.
+- Use `--config path/to/rubric_configs.yaml` when the rubric definitions are not located alongside the dataset.
+- Pass `-n`/`--number` to sample the provider multiple times per record; the CLI prints every run along with aggregate statistics (average, variance, standard deviation, and min/max).
+- Provide `--output path/to/dir` to create the directory (if needed) and emit `rubric_eval_result_<unix_timestamp>.json`, or supply a full file path (any extension) to control the filename; each file captures every run, provider payloads, timestamps, and aggregate statistics for downstream analysis.
+- Skip `--output` to collect results under `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the CLI writes this JSON whether the evaluation finishes cleanly or hits provider/runtime errors so you can inspect failures later (only a manual Ctrl+C interrupt leaves no file behind).
+- Dataset rows whose `rubric_id` does not match the requested rubric are skipped automatically.
+
+Both commands validate the file, echo a short summary (`Loaded <n> ...`), and pretty-print the parsed records so you can confirm that new rubrics or test fixtures look correct before committing them. Invalid files raise a descriptive error and exit with a non-zero status code.
 
 ## Running Examples
 
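The cached-result path documented above follows a predictable layout, so results can be picked up from scripts. Below is a minimal Python sketch that assumes only the documented pattern `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the JSON schema itself is not shown in this diff, so the loaded object is returned as-is.

```python
# Sketch: find and load the newest cached evaluation result for a rubric id.
# Only the directory layout documented above is assumed; the JSON structure
# is not specified in this diff.
import json
from pathlib import Path


def latest_cached_result(rubric_id: str) -> dict:
    cache_dir = Path.home() / ".cache" / "osmosis" / "eval_result" / rubric_id
    candidates = sorted(
        cache_dir.glob("rubric_eval_result_*.json"),
        key=lambda path: path.stat().st_mtime,
    )
    if not candidates:
        raise FileNotFoundError(f"No cached results found for rubric '{rubric_id}'")
    return json.loads(candidates[-1].read_text())


if __name__ == "__main__":
    print(latest_cached_result("support_followup"))
```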
{osmosis_ai-0.2.2 → osmosis_ai-0.2.3}/README.md

@@ -8,6 +8,10 @@ A Python library that provides reward and rubric validation helpers for LLM appl
 pip install osmosis-ai
 ```
 
+Requires Python 3.9 or newer.
+
+This installs the Osmosis CLI and pulls in the required provider SDKs (`openai`, `anthropic`, `google-genai`, `xai-sdk`) along with supporting utilities such as `PyYAML`, `python-dotenv`, `requests`, and `xxhash`.
+
 For development:
 ```bash
 git clone https://github.com/Osmosis-AI/osmosis-sdk-python
@@ -175,6 +179,55 @@ def numeric_tolerance(solution_str: str, ground_truth: str, extra_info: dict = N
 
 - `examples/rubric_functions.py` demonstrates `evaluate_rubric` with OpenAI, Anthropic, Gemini, and xAI using the schema-enforced SDK integrations.
 - `examples/reward_functions.py` keeps local reward helpers that showcase the decorator contract without external calls.
+- `examples/rubric_configs.yaml` bundles two rubric definitions, each with its own provider configuration and extra prompt context.
+- `examples/sample_data.jsonl` contains two conversation payloads mapped to those rubrics so you can trial dataset validation.
+
+```yaml
+# examples/rubric_configs.yaml (excerpt)
+version: 1
+rubrics:
+  - id: support_followup
+    model_info:
+      provider: openai
+      model: gpt-5-mini
+      api_key_env: OPENAI_API_KEY
+```
+
+```jsonl
+{"conversation_id": "ticket-001", "rubric_id": "support_followup", "...": "..."}
+{"conversation_id": "ticket-047", "rubric_id": "policy_grounding", "...": "..."}
+```
+
+## CLI Tools
+
+Installing the SDK also provides a lightweight CLI available as `osmosis` (aliases: `osmosis_ai`, `osmosis-ai`) for inspecting rubric YAML files and JSONL test payloads.
+
+Preview a rubric file and print every configuration discovered, including nested entries:
+
+```bash
+osmosis preview --path path/to/rubric.yaml
+```
+
+Preview a dataset of chat transcripts stored as JSONL:
+
+```bash
+osmosis preview --path path/to/data.jsonl
+```
+
+Evaluate a dataset against a hosted rubric configuration and print the returned scores:
+
+```bash
+osmosis eval --rubric support_followup --data examples/sample_data.jsonl
+```
+
+- Supply the dataset with `-d`/`--data path/to/data.jsonl`; the path is resolved relative to the current working directory.
+- Use `--config path/to/rubric_configs.yaml` when the rubric definitions are not located alongside the dataset.
+- Pass `-n`/`--number` to sample the provider multiple times per record; the CLI prints every run along with aggregate statistics (average, variance, standard deviation, and min/max).
+- Provide `--output path/to/dir` to create the directory (if needed) and emit `rubric_eval_result_<unix_timestamp>.json`, or supply a full file path (any extension) to control the filename; each file captures every run, provider payloads, timestamps, and aggregate statistics for downstream analysis.
+- Skip `--output` to collect results under `~/.cache/osmosis/eval_result/<rubric_id>/rubric_eval_result_<identifier>.json`; the CLI writes this JSON whether the evaluation finishes cleanly or hits provider/runtime errors so you can inspect failures later (only a manual Ctrl+C interrupt leaves no file behind).
+- Dataset rows whose `rubric_id` does not match the requested rubric are skipped automatically.
+
+Both commands validate the file, echo a short summary (`Loaded <n> ...`), and pretty-print the parsed records so you can confirm that new rubrics or test fixtures look correct before committing them. Invalid files raise a descriptive error and exit with a non-zero status code.
 
 ## Running Examples
 
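The same CLI documentation lands in the README. For the `-n`/`--number` sampling mode it names four aggregates (average, variance, standard deviation, min/max); the following standard-library sketch reproduces those aggregates over hypothetical per-run scores and is not the package's own implementation, which may differ (for example, sample rather than population variance).

```python
# Sketch: the aggregates reported for repeated runs (-n/--number), computed
# with the standard library over hypothetical per-run scores for one record.
import statistics

scores = [0.70, 0.85, 0.80]  # hypothetical scores from three runs

summary = {
    "average": statistics.mean(scores),
    "variance": statistics.pvariance(scores),  # population variance (assumption)
    "std_dev": statistics.pstdev(scores),
    "min": min(scores),
    "max": max(scores),
}
print(summary)
```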
osmosis_ai-0.2.3/osmosis_ai/cli.py

@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+import argparse
+import sys
+from typing import Optional
+
+from .cli_commands import EvalCommand, PreviewCommand
+from .cli_services import CLIError
+
+
+def main(argv: Optional[list[str]] = None) -> int:
+    """Entry point for the osmosis CLI."""
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+
+    handler = getattr(args, "handler", None)
+    if handler is None:
+        parser.print_help()
+        return 1
+
+    try:
+        return handler(args)
+    except CLIError as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        return 1
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="osmosis", description="Utilities for inspecting Osmosis rubric and test data files."
+    )
+    subparsers = parser.add_subparsers(dest="command")
+
+    preview_parser = subparsers.add_parser(
+        "preview",
+        help="Preview a rubric YAML file or test JSONL file and print its parsed contents.",
+    )
+    PreviewCommand().configure_parser(preview_parser)
+
+    eval_parser = subparsers.add_parser(
+        "eval",
+        help="Evaluate JSONL conversations against a rubric using remote providers.",
+    )
+    EvalCommand().configure_parser(eval_parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    sys.exit(main())
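Because `main()` takes an explicit `argv` list and returns an exit code, the CLI can be driven in-process as well as through the `osmosis` console script. A small usage sketch; the example path assumes a repository checkout with the bundled `examples/` directory.

```python
# Invoke the CLI programmatically instead of via the `osmosis` entry point.
from osmosis_ai.cli import main

# Equivalent to: osmosis preview --path examples/rubric_configs.yaml
exit_code = main(["preview", "--path", "examples/rubric_configs.yaml"])
print("exit code:", exit_code)  # 0 on success, 1 when a CLIError is raised
```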
osmosis_ai-0.2.3/osmosis_ai/cli_commands.py

@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+import argparse
+import time
+from pathlib import Path
+from typing import Any, Callable, Optional
+
+from .cli_services import (
+    CLIError,
+    ParsedItem,
+    BaselineComparator,
+    ConsoleReportRenderer,
+    DatasetLoader,
+    EvaluationSession,
+    EvaluationSessionRequest,
+    JsonReportWriter,
+    RubricEvaluationEngine,
+    RubricSuite,
+    discover_rubric_config_path,
+    load_jsonl_records,
+    load_rubric_configs,
+    load_rubric_suite,
+    render_json_records,
+    render_yaml_items,
+)
+
+
+class PreviewCommand:
+    """Handler for `osmosis preview`."""
+
+    def __init__(
+        self,
+        *,
+        yaml_loader: Callable[[Path], list[ParsedItem]] = load_rubric_configs,
+        json_loader: Callable[[Path], list[dict[str, Any]]] = load_jsonl_records,
+    ):
+        self._yaml_loader = yaml_loader
+        self._json_loader = json_loader
+
+    def configure_parser(self, parser: argparse.ArgumentParser) -> None:
+        parser.set_defaults(handler=self.run)
+        parser.add_argument(
+            "-p",
+            "--path",
+            dest="path",
+            required=True,
+            help="Path to the YAML or JSONL file to inspect.",
+        )
+
+    def run(self, args: argparse.Namespace) -> int:
+        path = Path(args.path).expanduser()
+        if not path.exists():
+            raise CLIError(f"Path '{path}' does not exist.")
+        if path.is_dir():
+            raise CLIError(f"Expected a file path but got directory '{path}'.")
+
+        suffix = path.suffix.lower()
+        if suffix in {".yaml", ".yml"}:
+            items = self._yaml_loader(path)
+            print(f"Loaded {len(items)} rubric config(s) from {path}")
+            print(render_yaml_items(items, label="Rubric config"))
+        elif suffix == ".jsonl":
+            records = self._json_loader(path)
+            print(f"Loaded {len(records)} JSONL record(s) from {path}")
+            print(render_json_records(records))
+        else:
+            raise CLIError(f"Unsupported file extension '{suffix}'. Expected .yaml, .yml, or .jsonl.")
+
+        return 0
+
+
+class EvalCommand:
+    """Handler for `osmosis eval`."""
+
+    def __init__(
+        self,
+        *,
+        session: Optional[EvaluationSession] = None,
+        config_locator: Callable[[Optional[str], Path], Path] = discover_rubric_config_path,
+        suite_loader: Callable[[Path], RubricSuite] = load_rubric_suite,
+        dataset_loader: Optional[DatasetLoader] = None,
+        engine: Optional[RubricEvaluationEngine] = None,
+        renderer: Optional[ConsoleReportRenderer] = None,
+        report_writer: Optional[JsonReportWriter] = None,
+        baseline_comparator: Optional[BaselineComparator] = None,
+    ):
+        self._renderer = renderer or ConsoleReportRenderer()
+        if session is not None:
+            self._session = session
+        else:
+            self._session = EvaluationSession(
+                config_locator=config_locator,
+                suite_loader=suite_loader,
+                dataset_loader=dataset_loader,
+                engine=engine,
+                baseline_comparator=baseline_comparator,
+                report_writer=report_writer,
+                identifier_factory=self._generate_output_identifier,
+            )
+
+    def configure_parser(self, parser: argparse.ArgumentParser) -> None:
+        parser.set_defaults(handler=self.run)
+        parser.add_argument(
+            "-r",
+            "--rubric",
+            dest="rubric_id",
+            required=True,
+            help="Rubric identifier declared in the rubric config file.",
+        )
+        parser.add_argument(
+            "-d",
+            "--data",
+            dest="data_path",
+            required=True,
+            help="Path to the JSONL file containing evaluation records.",
+        )
+        parser.add_argument(
+            "-n",
+            "--number",
+            dest="number",
+            type=int,
+            default=1,
+            help="Run the evaluation multiple times to sample provider variance (default: 1).",
+        )
+        parser.add_argument(
+            "-c",
+            "--config",
+            dest="config_path",
+            help="Path to the rubric config YAML (defaults to searching near the data file).",
+        )
+        parser.add_argument(
+            "-o",
+            "--output",
+            dest="output_path",
+            help="Optional path to write evaluation results as JSON.",
+        )
+        parser.add_argument(
+            "-b",
+            "--baseline",
+            dest="baseline_path",
+            help="Optional path to a prior evaluation JSON to compare against.",
+        )
+
+    def run(self, args: argparse.Namespace) -> int:
+        rubric_id_raw = getattr(args, "rubric_id", "")
+        rubric_id = str(rubric_id_raw).strip()
+        if not rubric_id:
+            raise CLIError("Rubric identifier cannot be empty.")
+
+        data_path = Path(args.data_path).expanduser()
+        config_path_value = getattr(args, "config_path", None)
+        output_path_value = getattr(args, "output_path", None)
+        baseline_path_value = getattr(args, "baseline_path", None)
+
+        number_value = getattr(args, "number", None)
+        number = int(number_value) if number_value is not None else 1
+
+        request = EvaluationSessionRequest(
+            rubric_id=rubric_id,
+            data_path=data_path,
+            number=number,
+            config_path=Path(config_path_value).expanduser() if config_path_value else None,
+            output_path=Path(output_path_value).expanduser() if output_path_value else None,
+            baseline_path=Path(baseline_path_value).expanduser() if baseline_path_value else None,
+        )
+
+        try:
+            result = self._session.execute(request)
+        except KeyboardInterrupt:
+            print("Evaluation cancelled by user.")
+            return 1
+        self._renderer.render(result.report, result.baseline)
+
+        if result.written_path is not None:
+            print(f"Wrote evaluation results to {result.written_path}")
+
+        return 0
+
+    @staticmethod
+    def _generate_output_identifier() -> str:
+        return str(int(time.time()))
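Each command object only needs an `argparse` parser to attach itself to: `configure_parser` registers the flags and sets `handler` to the command's `run` method, which is how `cli.py` dispatches. A small sketch wiring `PreviewCommand` into a standalone parser, again assuming the bundled `examples/rubric_configs.yaml` is present:

```python
# Wire PreviewCommand into a standalone parser and dispatch it directly.
import argparse

from osmosis_ai.cli_commands import PreviewCommand

parser = argparse.ArgumentParser(prog="preview-only")
PreviewCommand().configure_parser(parser)  # adds -p/--path and sets handler=run

args = parser.parse_args(["--path", "examples/rubric_configs.yaml"])
exit_code = args.handler(args)  # calls PreviewCommand.run(args)
print("exit code:", exit_code)
```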
osmosis_ai-0.2.3/osmosis_ai/cli_services/__init__.py

@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from .config import (
+    ParsedItem,
+    RubricConfig,
+    RubricConfigParser,
+    RubricSuite,
+    discover_rubric_config_path,
+    load_rubric_configs,
+    load_rubric_suite,
+    render_yaml_items,
+)
+from .dataset import (
+    ConversationMessage,
+    DatasetLoader,
+    DatasetRecord,
+    load_jsonl_records,
+    render_json_records,
+)
+from .engine import (
+    EvaluationRecordResult,
+    EvaluationReport,
+    EvaluationRun,
+    RubricEvaluationEngine,
+    RubricEvaluator,
+)
+from .errors import CLIError
+from .reporting import (
+    BaselineComparator,
+    BaselineStatistics,
+    ConsoleReportRenderer,
+    JsonReportFormatter,
+    JsonReportWriter,
+    TextReportFormatter,
+)
+from .session import EvaluationSession, EvaluationSessionRequest, EvaluationSessionResult
+
+__all__ = [
+    "BaselineComparator",
+    "BaselineStatistics",
+    "CLIError",
+    "ConsoleReportRenderer",
+    "ConversationMessage",
+    "DatasetLoader",
+    "DatasetRecord",
+    "EvaluationSession",
+    "EvaluationSessionRequest",
+    "EvaluationSessionResult",
+    "EvaluationRecordResult",
+    "EvaluationReport",
+    "EvaluationRun",
+    "JsonReportFormatter",
+    "JsonReportWriter",
+    "ParsedItem",
+    "RubricConfig",
+    "RubricConfigParser",
+    "RubricEvaluationEngine",
+    "RubricEvaluator",
+    "RubricSuite",
+    "TextReportFormatter",
+    "discover_rubric_config_path",
+    "load_jsonl_records",
+    "load_rubric_configs",
+    "load_rubric_suite",
+    "render_json_records",
+    "render_yaml_items",
+]
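The package re-exports its loaders and error type, so the validation that `osmosis preview` performs is also available from Python. A minimal sketch using only names exported above; the example paths are the files shipped in the repository's `examples/` directory.

```python
# Use the cli_services helpers directly instead of going through the CLI.
from pathlib import Path

from osmosis_ai.cli_services import CLIError, load_jsonl_records, load_rubric_configs

try:
    rubrics = load_rubric_configs(Path("examples/rubric_configs.yaml"))
    records = load_jsonl_records(Path("examples/sample_data.jsonl"))
    print(f"Loaded {len(rubrics)} rubric config(s) and {len(records)} record(s)")
except CLIError as exc:
    # Invalid files surface as CLIError, matching the CLI's error handling.
    print(f"Error: {exc}")
```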