osmosis-ai 0.2.2__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only; it reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of osmosis-ai might be problematic. Click here for more details.
- osmosis_ai/cli.py +50 -0
- osmosis_ai/cli_commands.py +181 -0
- osmosis_ai/cli_services/__init__.py +67 -0
- osmosis_ai/cli_services/config.py +407 -0
- osmosis_ai/cli_services/dataset.py +229 -0
- osmosis_ai/cli_services/engine.py +251 -0
- osmosis_ai/cli_services/errors.py +7 -0
- osmosis_ai/cli_services/reporting.py +307 -0
- osmosis_ai/cli_services/session.py +174 -0
- osmosis_ai/cli_services/shared.py +209 -0
- osmosis_ai/providers/gemini_provider.py +73 -28
- osmosis_ai/rubric_eval.py +27 -66
- osmosis_ai/utils.py +0 -4
- {osmosis_ai-0.2.2.dist-info → osmosis_ai-0.2.3.dist-info}/METADATA +64 -2
- osmosis_ai-0.2.3.dist-info/RECORD +27 -0
- osmosis_ai-0.2.3.dist-info/entry_points.txt +4 -0
- osmosis_ai-0.2.2.dist-info/RECORD +0 -16
- {osmosis_ai-0.2.2.dist-info → osmosis_ai-0.2.3.dist-info}/WHEEL +0 -0
- {osmosis_ai-0.2.2.dist-info → osmosis_ai-0.2.3.dist-info}/licenses/LICENSE +0 -0
- {osmosis_ai-0.2.2.dist-info → osmosis_ai-0.2.3.dist-info}/top_level.txt +0 -0
osmosis_ai/cli.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from .cli_commands import EvalCommand, PreviewCommand
|
|
8
|
+
from .cli_services import CLIError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main(argv: Optional[list[str]] = None) -> int:
    """Run the osmosis CLI and return a process exit code.

    Parses *argv* (``sys.argv[1:]`` when ``None``), dispatches to the
    selected subcommand handler, and converts ``CLIError`` into a
    message on stderr plus a non-zero exit code.
    """
    parser = _build_parser()
    namespace = parser.parse_args(argv)

    # Subparsers register their entry point via ``set_defaults(handler=...)``;
    # a missing attribute means no subcommand was given.
    command = getattr(namespace, "handler", None)
    if command is None:
        parser.print_help()
        return 1

    try:
        exit_code = command(namespace)
    except CLIError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    return exit_code
def _build_parser() -> argparse.ArgumentParser:
    """Construct the top-level ``osmosis`` parser and its subcommands."""
    parser = argparse.ArgumentParser(
        prog="osmosis",
        description="Utilities for inspecting Osmosis rubric and test data files.",
    )
    subparsers = parser.add_subparsers(dest="command")

    # Each command object wires its own flags and handler into its subparser.
    registrations = (
        (
            "preview",
            "Preview a rubric YAML file or test JSONL file and print its parsed contents.",
            PreviewCommand(),
        ),
        (
            "eval",
            "Evaluate JSONL conversations against a rubric using remote providers.",
            EvalCommand(),
        ),
    )
    for name, help_text, command in registrations:
        command.configure_parser(subparsers.add_parser(name, help=help_text))

    return parser
if __name__ == "__main__":
    # Allow running this module directly (e.g. `python -m osmosis_ai.cli`).
    sys.exit(main())
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Callable, Optional
|
|
7
|
+
|
|
8
|
+
from .cli_services import (
|
|
9
|
+
CLIError,
|
|
10
|
+
ParsedItem,
|
|
11
|
+
BaselineComparator,
|
|
12
|
+
ConsoleReportRenderer,
|
|
13
|
+
DatasetLoader,
|
|
14
|
+
EvaluationSession,
|
|
15
|
+
EvaluationSessionRequest,
|
|
16
|
+
JsonReportWriter,
|
|
17
|
+
RubricEvaluationEngine,
|
|
18
|
+
RubricSuite,
|
|
19
|
+
discover_rubric_config_path,
|
|
20
|
+
load_jsonl_records,
|
|
21
|
+
load_rubric_configs,
|
|
22
|
+
load_rubric_suite,
|
|
23
|
+
render_json_records,
|
|
24
|
+
render_yaml_items,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class PreviewCommand:
    """Handler for `osmosis preview`.

    Loads a rubric YAML file or a JSONL test file and prints its parsed
    contents to stdout. Both loader callables are injectable so tests can
    substitute fakes.
    """

    def __init__(
        self,
        *,
        yaml_loader: Callable[[Path], list[ParsedItem]] = load_rubric_configs,
        json_loader: Callable[[Path], list[dict[str, Any]]] = load_jsonl_records,
    ):
        self._yaml_loader = yaml_loader
        self._json_loader = json_loader

    def configure_parser(self, parser: argparse.ArgumentParser) -> None:
        """Attach the preview flags and run-handler to *parser*."""
        parser.set_defaults(handler=self.run)
        parser.add_argument(
            "-p",
            "--path",
            dest="path",
            required=True,
            help="Path to the YAML or JSONL file to inspect.",
        )

    def run(self, args: argparse.Namespace) -> int:
        """Validate the target path, render its contents, and return 0.

        Raises CLIError for missing paths, directories, or unsupported
        file extensions.
        """
        target = Path(args.path).expanduser()
        if not target.exists():
            raise CLIError(f"Path '{target}' does not exist.")
        if target.is_dir():
            raise CLIError(f"Expected a file path but got directory '{target}'.")

        extension = target.suffix.lower()
        if extension in {".yaml", ".yml"}:
            configs = self._yaml_loader(target)
            print(f"Loaded {len(configs)} rubric config(s) from {target}")
            print(render_yaml_items(configs, label="Rubric config"))
            return 0
        if extension == ".jsonl":
            rows = self._json_loader(target)
            print(f"Loaded {len(rows)} JSONL record(s) from {target}")
            print(render_json_records(rows))
            return 0
        raise CLIError(f"Unsupported file extension '{extension}'. Expected .yaml, .yml, or .jsonl.")
class EvalCommand:
    """Handler for `osmosis eval`.

    Wires CLI arguments into an EvaluationSessionRequest, executes it via
    an EvaluationSession, and renders the resulting report. Every
    collaborator is injectable via keyword-only constructor arguments.
    """

    def __init__(
        self,
        *,
        session: Optional[EvaluationSession] = None,
        config_locator: Callable[[Optional[str], Path], Path] = discover_rubric_config_path,
        suite_loader: Callable[[Path], RubricSuite] = load_rubric_suite,
        dataset_loader: Optional[DatasetLoader] = None,
        engine: Optional[RubricEvaluationEngine] = None,
        renderer: Optional[ConsoleReportRenderer] = None,
        report_writer: Optional[JsonReportWriter] = None,
        baseline_comparator: Optional[BaselineComparator] = None,
    ):
        self._renderer = renderer or ConsoleReportRenderer()
        # A pre-built session wins; otherwise assemble one from the parts.
        # NOTE: when `session` is given, the other collaborator arguments
        # (except `renderer`) are ignored.
        if session is not None:
            self._session = session
        else:
            self._session = EvaluationSession(
                config_locator=config_locator,
                suite_loader=suite_loader,
                dataset_loader=dataset_loader,
                engine=engine,
                baseline_comparator=baseline_comparator,
                report_writer=report_writer,
                identifier_factory=self._generate_output_identifier,
            )

    def configure_parser(self, parser: argparse.ArgumentParser) -> None:
        """Register the eval flags and the run-handler on *parser*."""
        parser.set_defaults(handler=self.run)
        parser.add_argument(
            "-r",
            "--rubric",
            dest="rubric_id",
            required=True,
            help="Rubric identifier declared in the rubric config file.",
        )
        parser.add_argument(
            "-d",
            "--data",
            dest="data_path",
            required=True,
            help="Path to the JSONL file containing evaluation records.",
        )
        parser.add_argument(
            "-n",
            "--number",
            dest="number",
            type=int,
            default=1,
            help="Run the evaluation multiple times to sample provider variance (default: 1).",
        )
        parser.add_argument(
            "-c",
            "--config",
            dest="config_path",
            help="Path to the rubric config YAML (defaults to searching near the data file).",
        )
        parser.add_argument(
            "-o",
            "--output",
            dest="output_path",
            help="Optional path to write evaluation results as JSON.",
        )
        parser.add_argument(
            "-b",
            "--baseline",
            dest="baseline_path",
            help="Optional path to a prior evaluation JSON to compare against.",
        )

    def run(self, args: argparse.Namespace) -> int:
        """Build the session request from *args*, execute it, and render.

        Returns 0 on success, 1 when the user interrupts the run.
        Raises CLIError for an empty rubric identifier (and propagates
        CLIError from the session itself).
        """
        rubric_id_raw = getattr(args, "rubric_id", "")
        rubric_id = str(rubric_id_raw).strip()
        if not rubric_id:
            raise CLIError("Rubric identifier cannot be empty.")

        data_path = Path(args.data_path).expanduser()
        config_path_value = getattr(args, "config_path", None)
        output_path_value = getattr(args, "output_path", None)
        baseline_path_value = getattr(args, "baseline_path", None)

        # NOTE(review): `--number` is not validated to be >= 1 here; a zero
        # or negative value is passed through to the session as-is — confirm
        # the session rejects it.
        number_value = getattr(args, "number", None)
        number = int(number_value) if number_value is not None else 1

        request = EvaluationSessionRequest(
            rubric_id=rubric_id,
            data_path=data_path,
            number=number,
            config_path=Path(config_path_value).expanduser() if config_path_value else None,
            output_path=Path(output_path_value).expanduser() if output_path_value else None,
            baseline_path=Path(baseline_path_value).expanduser() if baseline_path_value else None,
        )

        try:
            result = self._session.execute(request)
        except KeyboardInterrupt:
            # Ctrl-C mid-run: report cancellation instead of a traceback.
            print("Evaluation cancelled by user.")
            return 1
        self._renderer.render(result.report, result.baseline)

        if result.written_path is not None:
            print(f"Wrote evaluation results to {result.written_path}")

        return 0

    @staticmethod
    def _generate_output_identifier() -> str:
        # Unix-epoch seconds as a string; used by the session to name outputs.
        return str(int(time.time()))
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .config import (
|
|
4
|
+
ParsedItem,
|
|
5
|
+
RubricConfig,
|
|
6
|
+
RubricConfigParser,
|
|
7
|
+
RubricSuite,
|
|
8
|
+
discover_rubric_config_path,
|
|
9
|
+
load_rubric_configs,
|
|
10
|
+
load_rubric_suite,
|
|
11
|
+
render_yaml_items,
|
|
12
|
+
)
|
|
13
|
+
from .dataset import (
|
|
14
|
+
ConversationMessage,
|
|
15
|
+
DatasetLoader,
|
|
16
|
+
DatasetRecord,
|
|
17
|
+
load_jsonl_records,
|
|
18
|
+
render_json_records,
|
|
19
|
+
)
|
|
20
|
+
from .engine import (
|
|
21
|
+
EvaluationRecordResult,
|
|
22
|
+
EvaluationReport,
|
|
23
|
+
EvaluationRun,
|
|
24
|
+
RubricEvaluationEngine,
|
|
25
|
+
RubricEvaluator,
|
|
26
|
+
)
|
|
27
|
+
from .errors import CLIError
|
|
28
|
+
from .reporting import (
|
|
29
|
+
BaselineComparator,
|
|
30
|
+
BaselineStatistics,
|
|
31
|
+
ConsoleReportRenderer,
|
|
32
|
+
JsonReportFormatter,
|
|
33
|
+
JsonReportWriter,
|
|
34
|
+
TextReportFormatter,
|
|
35
|
+
)
|
|
36
|
+
from .session import EvaluationSession, EvaluationSessionRequest, EvaluationSessionResult
|
|
37
|
+
|
|
38
|
+
# Names re-exported as the public API of the cli_services package.
__all__ = [
    "BaselineComparator",
    "BaselineStatistics",
    "CLIError",
    "ConsoleReportRenderer",
    "ConversationMessage",
    "DatasetLoader",
    "DatasetRecord",
    "EvaluationSession",
    "EvaluationSessionRequest",
    "EvaluationSessionResult",
    "EvaluationRecordResult",
    "EvaluationReport",
    "EvaluationRun",
    "JsonReportFormatter",
    "JsonReportWriter",
    "ParsedItem",
    "RubricConfig",
    "RubricConfigParser",
    "RubricEvaluationEngine",
    "RubricEvaluator",
    "RubricSuite",
    "TextReportFormatter",
    "discover_rubric_config_path",
    "load_jsonl_records",
    "load_rubric_configs",
    "load_rubric_suite",
    "render_json_records",
    "render_yaml_items",
]
@@ -0,0 +1,407 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Optional, Sequence
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
from yaml.representer import SafeRepresenter
|
|
10
|
+
|
|
11
|
+
from .errors import CLIError
|
|
12
|
+
from .shared import coerce_optional_float
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class ParsedItem:
    """A single entry extracted from a YAML config document.

    ``label`` is a human-readable locator (the nearest mapping key or a
    ``document[i]`` fallback) and ``payload`` is the raw parsed YAML node.
    """

    label: Optional[str]
    payload: Any
@dataclass(frozen=True)
class RubricConfig:
    """A fully-resolved rubric entry, merged with its document defaults."""

    rubric_id: str  # unique id declared in the config file
    rubric_text: str  # the rubric prompt text itself
    model_info: dict[str, Any]  # model configuration mapping (contents not validated here)
    score_min: Optional[float]  # coerced float, or None when absent
    score_max: Optional[float]  # coerced float, or None when absent
    system_message: Optional[str]
    extra_info: Optional[dict[str, Any]]
    original_input: Optional[str]
    ground_truth: Optional[str]
    source_label: str  # "<path>:<label>" locator used in diagnostics
@dataclass(frozen=True)
class RubricSuite:
    """An immutable collection of rubric configs loaded from one file."""

    source_path: Path
    version: Optional[int]
    configs: dict[str, RubricConfig]

    def get(self, rubric_id: str) -> RubricConfig:
        """Return the config for *rubric_id* or raise CLIError listing valid IDs."""
        config = self.configs.get(rubric_id)
        if config is None:
            available = ", ".join(self.available_ids()) or "none"
            raise CLIError(
                f"Rubric '{rubric_id}' not found in '{self.source_path}'. Available IDs: {available}"
            )
        return config

    def available_ids(self) -> list[str]:
        """Return every rubric id in this suite, sorted alphabetically."""
        return sorted(self.configs.keys())
@dataclass(frozen=True)
class RubricConfigDocumentResult:
    """Outcome of parsing one YAML document.

    ``configs`` maps rubric id to the built RubricConfig; ``items`` keeps
    every raw entry encountered, including ones skipped in lenient mode.
    """

    configs: dict[str, RubricConfig]
    items: list[ParsedItem]
class RubricConfigDocumentSchema:
    """Base interface for schema-specific rubric config parsing."""

    # The config-file version this schema handles; None means "unversioned".
    version: Optional[int] = None

    def parse_document(
        self,
        document: Any,
        *,
        path: Path,
        doc_index: int,
        strict: bool,
    ) -> RubricConfigDocumentResult:
        """Parse one YAML document into configs and raw items.

        Subclasses must override; ``strict`` controls whether malformed
        entries raise or are skipped.
        """
        raise NotImplementedError
class LegacyRubricConfigSchema(RubricConfigDocumentSchema):
    """Schema handling documents without an explicit version."""

    version = None

    def parse_document(
        self,
        document: Any,
        *,
        path: Path,
        doc_index: int,
        strict: bool,
    ) -> RubricConfigDocumentResult:
        """Extract rubric entries from *document* and build their configs."""
        # Document-level default_* values seed every entry in this document.
        defaults = _extract_config_defaults(document, path, doc_index)
        # Recursively walk the document for anything with a 'rubric' string.
        entries = _extract_rubric_items(document, context=None, doc_index=doc_index)
        return _build_document_configs(entries, defaults, path=path, doc_index=doc_index, strict=strict)
class Version1RubricConfigSchema(LegacyRubricConfigSchema):
    """Schema for version 1 documents (currently aligned with legacy layout)."""

    # Inherits parse_document unchanged; only the declared version differs.
    version = 1
class RubricConfigParser:
    """Parses rubric configuration files and produces typed suites."""

    def __init__(self, *, schemas: Optional[dict[Optional[int], RubricConfigDocumentSchema]] = None):
        # Map of version number -> schema; key None is the fallback for
        # unversioned documents and must always be present.
        self._schemas = schemas or {
            None: LegacyRubricConfigSchema(),
            1: Version1RubricConfigSchema(),
        }
        if None not in self._schemas:
            raise ValueError("At least one default schema (key=None) must be provided.")

    def parse(self, path: Path, *, strict: bool = True) -> tuple[RubricSuite, list[ParsedItem]]:
        """Parse every YAML document in *path* into a RubricSuite.

        Two passes: the first scans all documents to detect a single,
        consistent version number; the second parses each document with
        the schema selected for that version. Returns the suite plus the
        raw parsed items (useful for previewing).
        """
        documents = _load_yaml_documents(path)
        configs: dict[str, RubricConfig] = {}
        parsed_items: list[ParsedItem] = []
        detected_version: Optional[int] = None
        document_indices: list[int] = []

        for doc_index, document in enumerate(documents):
            # Remember non-empty documents for the second pass.
            if document:
                document_indices.append(doc_index)
            if not document:
                continue

            doc_version = self._coerce_optional_version(document, path, doc_index)
            if doc_version is not None:
                if detected_version is None:
                    detected_version = doc_version
                elif detected_version != doc_version:
                    raise CLIError(
                        f"Rubric config '{path}' mixes different version numbers across documents."
                    )

        # One schema is chosen for the whole file, based on the detected version.
        schema = self._select_schema(detected_version)

        for doc_index in document_indices:
            document = documents[doc_index]
            # NOTE(review): document_indices already excludes falsy documents,
            # so this check is redundant but harmless.
            if not document:
                continue

            result = schema.parse_document(
                document,
                path=path,
                doc_index=doc_index,
                strict=strict,
            )
            parsed_items.extend(result.items)
            for rubric_id, config in result.configs.items():
                # Ids must be unique across documents, not just within one.
                if rubric_id in configs:
                    raise CLIError(f"Duplicate rubric id '{rubric_id}' detected in '{path}'.")
                configs[rubric_id] = config

        if strict and not configs:
            raise CLIError(f"No rubric entries found in '{path}'.")

        suite = RubricSuite(source_path=path, version=detected_version, configs=configs)
        return suite, parsed_items

    def _select_schema(self, version: Optional[int]) -> RubricConfigDocumentSchema:
        """Return the schema registered for *version* or raise CLIError."""
        if version in self._schemas:
            return self._schemas[version]
        # NOTE(review): unreachable in practice — __init__ guarantees a None
        # key, so the first branch already covers version is None.
        if version is None:
            return self._schemas[None]
        raise CLIError(f"Unsupported rubric config version '{version}'.")

    @staticmethod
    def _coerce_optional_version(document: Any, path: Path, doc_index: int) -> Optional[int]:
        """Read an optional non-negative integer 'version' field from *document*.

        Returns None for non-mapping documents or a missing field; raises
        CLIError for a negative or non-integer value.
        """
        if not isinstance(document, dict):
            return None
        version_value = document.get("version")
        if version_value is None:
            return None
        if isinstance(version_value, int):
            if version_value < 0:
                raise CLIError(
                    f"Version number in '{path}' document {doc_index} must be non-negative."
                )
            return version_value
        raise CLIError(
            f"Version field in '{path}' document {doc_index} must be an integer."
        )
def _build_document_configs(
    entries: Sequence[ParsedItem],
    defaults: dict[str, Any],
    *,
    path: Path,
    doc_index: int,
    strict: bool,
) -> RubricConfigDocumentResult:
    """Validate raw rubric entries and build RubricConfig objects.

    Every entry (valid or not) is echoed back in ``items``. In strict
    mode a malformed entry raises CLIError; otherwise it is silently
    skipped. Duplicate ids raise in both modes.
    """
    configs: dict[str, RubricConfig] = {}
    parsed_items: list[ParsedItem] = []

    for item in entries:
        payload = item.payload
        # Record every entry for preview output, even ones we skip below.
        parsed_items.append(ParsedItem(label=item.label, payload=payload))
        if not isinstance(payload, dict):
            continue

        rubric_key_raw = payload.get("id")
        if not isinstance(rubric_key_raw, str) or not rubric_key_raw.strip():
            if strict:
                raise CLIError(
                    f"Rubric entry in '{path}' (document {doc_index}) is missing a non-empty 'id'."
                )
            continue
        rubric_key = rubric_key_raw.strip()
        # Duplicate ids are an error even in lenient mode.
        if rubric_key in configs:
            raise CLIError(f"Duplicate rubric id '{rubric_key}' detected in '{path}'.")

        rubric_text = payload.get("rubric")
        if not isinstance(rubric_text, str) or not rubric_text.strip():
            if strict:
                raise CLIError(
                    f"Rubric '{rubric_key}' in '{path}' must include a non-empty 'rubric' string."
                )
            continue

        # Entry-level fields fall back to the document-level defaults.
        model_info = payload.get("model_info", defaults.get("model_info"))
        if not isinstance(model_info, dict):
            if strict:
                raise CLIError(
                    f"Rubric '{rubric_key}' in '{path}' must include a 'model_info' mapping."
                )
            continue

        extra_info_value = payload.get("extra_info", defaults.get("extra_info"))
        if extra_info_value is not None and not isinstance(extra_info_value, dict):
            if strict:
                raise CLIError(
                    f"'extra_info' for rubric '{rubric_key}' in '{path}' must be a mapping."
                )
            continue

        try:
            score_min = coerce_optional_float(
                payload.get("score_min", defaults.get("score_min")),
                "score_min",
                f"rubric '{rubric_key}' in {path}",
            )
            score_max = coerce_optional_float(
                payload.get("score_max", defaults.get("score_max")),
                "score_max",
                f"rubric '{rubric_key}' in {path}",
            )
        except CLIError:
            # Lenient mode drops entries with bad score bounds.
            if strict:
                raise
            continue

        system_message = payload.get("system_message", defaults.get("system_message"))
        original_input = payload.get("original_input", defaults.get("original_input"))
        ground_truth = payload.get("ground_truth", defaults.get("ground_truth"))

        label = item.label or f"document[{doc_index}]"
        source_label = f"{path}:{label}"

        # Deep-copy mutable mappings so the frozen config cannot be mutated
        # through the original YAML payload; non-string optional fields are
        # normalized to None.
        configs[rubric_key] = RubricConfig(
            rubric_id=rubric_key,
            rubric_text=rubric_text,
            model_info=copy.deepcopy(model_info),
            score_min=score_min,
            score_max=score_max,
            system_message=system_message if isinstance(system_message, str) else None,
            extra_info=copy.deepcopy(extra_info_value) if isinstance(extra_info_value, dict) else None,
            original_input=original_input if isinstance(original_input, str) else None,
            ground_truth=ground_truth if isinstance(ground_truth, str) else None,
            source_label=source_label,
        )

    return RubricConfigDocumentResult(configs=configs, items=parsed_items)
def discover_rubric_config_path(config_arg: Optional[str], data_path: Path) -> Path:
    """Resolve the rubric config file to use.

    An explicit *config_arg* wins and must name an existing file.
    Otherwise ``rubric_configs.yaml`` is searched for next to the data
    file, in the current directory, and under ``./examples`` — in that
    order. Raises CLIError when nothing is found.
    """
    if config_arg:
        explicit = Path(config_arg).expanduser()
        if not explicit.exists():
            raise CLIError(f"Rubric config path '{explicit}' does not exist.")
        if explicit.is_dir():
            raise CLIError(f"Rubric config path '{explicit}' is a directory.")
        return explicit

    search_order = [
        data_path.parent / "rubric_configs.yaml",
        Path.cwd() / "rubric_configs.yaml",
        Path.cwd() / "examples" / "rubric_configs.yaml",
    ]

    checked: list[Path] = []
    # dict.fromkeys de-duplicates while preserving search order.
    for location in dict.fromkeys(search_order):
        checked.append(location)
        if location.exists() and location.is_file():
            return location

    searched = ", ".join(str(path) for path in checked)
    raise CLIError(
        "Unable to locate a rubric config file. Provide --config explicitly. "
        f"Paths checked: {searched}"
    )
def load_rubric_configs(path: Path) -> list[ParsedItem]:
    """Leniently parse *path* and return every rubric entry found."""
    _, entries = RubricConfigParser().parse(path, strict=False)
    return entries
def load_rubric_suite(path: Path) -> RubricSuite:
    """Strictly parse *path* and return the resulting rubric suite."""
    suite, _ = RubricConfigParser().parse(path)
    return suite
def render_yaml_items(items: Sequence[ParsedItem], label: str) -> str:
    """Render parsed rubric entries as human-readable YAML blocks.

    Each entry gets a ``<label> #<n>`` header (plus the entry's own label
    when known) followed by its payload dumped as YAML; blocks are
    separated by a blank line.
    """
    blocks: list[str] = []
    for position, item in enumerate(items, start=1):
        title = f"{label} #{position}"
        if item.label:
            title = f"{title} ({item.label})"
        body = yaml.dump(
            item.payload,
            Dumper=_LiteralSafeDumper,
            sort_keys=False,
            indent=2,
            allow_unicode=True,
        ).rstrip()
        blocks.append(f"{title}\n{body}")
    return "\n\n".join(blocks)
def _load_yaml_documents(path: Path) -> list[Any]:
    """Load every YAML document in *path*, wrapping failures in CLIError.

    Both malformed YAML and file-system errors surface as CLIError with
    the original exception chained.
    """
    try:
        with path.open("r", encoding="utf-8") as fh:
            # safe_load_all handles multi-document files ("---" separators).
            return list(yaml.safe_load_all(fh))
    except yaml.YAMLError as exc:
        raise CLIError(f"Failed to parse YAML in '{path}': {exc}") from exc
    except OSError as exc:
        raise CLIError(f"Unable to read rubric config '{path}': {exc}") from exc
def _extract_config_defaults(document: Any, path: Path, doc_index: int) -> dict[str, Any]:
|
|
347
|
+
if not isinstance(document, dict):
|
|
348
|
+
return {
|
|
349
|
+
"model_info": None,
|
|
350
|
+
"extra_info": None,
|
|
351
|
+
"score_min": None,
|
|
352
|
+
"score_max": None,
|
|
353
|
+
"system_message": None,
|
|
354
|
+
"original_input": None,
|
|
355
|
+
"ground_truth": None,
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
source = f"document[{doc_index}] in {path}"
|
|
359
|
+
|
|
360
|
+
defaults: dict[str, Any] = {}
|
|
361
|
+
defaults["model_info"] = document.get("default_model_info")
|
|
362
|
+
defaults["extra_info"] = document.get("default_extra_info")
|
|
363
|
+
defaults["score_min"] = coerce_optional_float(
|
|
364
|
+
document.get("default_score_min"), "default_score_min", source
|
|
365
|
+
)
|
|
366
|
+
defaults["score_max"] = coerce_optional_float(
|
|
367
|
+
document.get("default_score_max"), "default_score_max", source
|
|
368
|
+
)
|
|
369
|
+
defaults["system_message"] = document.get("default_system_message")
|
|
370
|
+
defaults["original_input"] = document.get("default_original_input")
|
|
371
|
+
defaults["ground_truth"] = document.get("default_ground_truth")
|
|
372
|
+
return defaults
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _extract_rubric_items(node: Any, context: Optional[str], doc_index: int) -> list[ParsedItem]:
|
|
376
|
+
items: list[ParsedItem] = []
|
|
377
|
+
|
|
378
|
+
if node is None:
|
|
379
|
+
return items
|
|
380
|
+
|
|
381
|
+
if isinstance(node, dict):
|
|
382
|
+
if "rubric" in node and isinstance(node["rubric"], str):
|
|
383
|
+
label = context or f"document[{doc_index}]"
|
|
384
|
+
items.append(ParsedItem(label=label, payload=node))
|
|
385
|
+
else:
|
|
386
|
+
for key, value in node.items():
|
|
387
|
+
next_context = str(key) if isinstance(key, str) else context
|
|
388
|
+
items.extend(_extract_rubric_items(value, context=next_context, doc_index=doc_index))
|
|
389
|
+
elif isinstance(node, list):
|
|
390
|
+
for index, value in enumerate(node):
|
|
391
|
+
idx_context = f"{context}[{index}]" if context else None
|
|
392
|
+
items.extend(_extract_rubric_items(value, context=idx_context, doc_index=doc_index))
|
|
393
|
+
|
|
394
|
+
return items
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
class _LiteralSafeDumper(yaml.SafeDumper):
    """YAML dumper that preserves multiline strings with literal blocks."""


def _represent_str(dumper: yaml.Dumper, data: str):
    """Represent multiline strings with the literal block style ("|")."""
    # Keeps rubric text readable in dumped output; single-line strings
    # fall back to the default string representation.
    if "\n" in data:
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return SafeRepresenter.represent_str(dumper, data)


# Register once at import time so every dump through _LiteralSafeDumper
# uses the literal-block behavior for strings.
_LiteralSafeDumper.add_representer(str, _represent_str)