loopgym 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopgym/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """LoopGym — OpenAI Gym equivalent for LSS-defined agent loops."""
2
+
3
+ from loopgym.envs.base import LoopEnv, Observation
4
+ from loopgym.registry import list_envs, make
5
+
6
+ __version__ = "0.1.0"
7
+ __all__ = ["LoopEnv", "Observation", "list_envs", "make"]
loopgym/cli.py ADDED
@@ -0,0 +1,47 @@
1
+ """LoopGym CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+
8
+ import loopgym as lg
9
+
10
+
11
def main(argv: list[str] | None = None) -> None:
    """Parse command-line arguments and dispatch to the selected sub-command.

    Two sub-commands are registered: ``list`` (handled by ``_cmd_list``) and
    ``run`` (handled by ``_cmd_run``). A sub-command is mandatory.

    Args:
        argv: Arguments to parse instead of the process arguments. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which is the
            original behaviour, so existing entry points are unaffected.
    """
    parser = argparse.ArgumentParser(description="LoopGym — run loop environments")
    sub = parser.add_subparsers(dest="command", required=True)

    list_cmd = sub.add_parser("list", help="List registered environments")
    list_cmd.set_defaults(func=_cmd_list)

    run_cmd = sub.add_parser("run", help="Run an environment episode")
    run_cmd.add_argument("env_id", help="Environment ID")
    run_cmd.add_argument("--task-id", default="default")
    run_cmd.add_argument("--seed", type=int, default=0)
    run_cmd.add_argument("--spec-path", default=None)
    run_cmd.set_defaults(func=_cmd_run)

    # Each sub-parser stored its handler under ``func``; call whichever won.
    args = parser.parse_args(argv)
    args.func(args)
27
+
28
+
29
def _cmd_list(_args: argparse.Namespace) -> None:
    """Print each registered environment ID on its own line.

    The parsed arguments are accepted only to match the handler signature
    used by ``main``; they are not read.
    """
    registered = lg.list_envs()
    for name in registered:
        print(name)
32
+
33
+
34
def _cmd_run(args: argparse.Namespace) -> None:
    """Run one episode of the requested environment and print the outcome.

    Environments that expose ``run_episode`` are run in one call and the
    result is printed as indented JSON. Any other environment is stepped
    manually, printing one line per step.

    Args:
        args: Parsed ``run`` arguments (``env_id``, ``task_id``, ``seed``,
            ``spec_path``).
    """
    env = lg.make(args.env_id, spec_path=args.spec_path, seed=args.seed)
    try:
        if hasattr(env, "run_episode"):
            result = env.run_episode(task_id=args.task_id, seed=args.seed)
            print(json.dumps(result, indent=2))
            return

        env.reset(task_id=args.task_id, seed=args.seed)
        # Honour the flag returned by step() as well as env.done: a wrapper
        # that forgets to mirror its inner state would otherwise loop forever.
        done = env.done
        while not done:
            obs, reward, step_done, _info = env.step()
            print(f"step reward={reward:.3f} quality={obs.quality_score:.3f} done={step_done}")
            done = step_done or env.done
    finally:
        # Release environment resources even if the episode raised.
        close = getattr(env, "close", None)
        if callable(close):
            close()
44
+
45
+
46
+ if __name__ == "__main__":
47
+ main()
@@ -0,0 +1,8 @@
1
+ """Loop environment implementations."""
2
+
3
+ from loopgym.envs.base import LoopEnv, Observation
4
+ from loopgym.envs.live import LiveEnv
5
+ from loopgym.envs.replay import ReplayEnv
6
+ from loopgym.envs.sim import SimEnv
7
+
8
+ __all__ = ["LoopEnv", "LiveEnv", "Observation", "ReplayEnv", "SimEnv"]
loopgym/envs/base.py ADDED
@@ -0,0 +1,63 @@
1
+ """Loop environment base classes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+
10
@dataclass
class Observation:
    """Snapshot of a loop episode as handed back by ``reset`` and ``step``."""

    task_id: str
    iteration: int
    output: str
    quality_score: float
    objective: str
    done: bool
    # Free-form extras; each instance gets its own dict by default.
    info: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name.

        ``info`` is placed in the result by reference, not copied.
        """
        return dict(
            task_id=self.task_id,
            iteration=self.iteration,
            output=self.output,
            quality_score=self.quality_score,
            objective=self.objective,
            done=self.done,
            info=self.info,
        )
32
+
33
+
34
class LoopEnv(ABC):
    """Abstract base for loop environments exposing a Gym-style reset/step API."""

    def __init__(self, env_id: str) -> None:
        """Store the environment ID and start with no active episode."""
        self.env_id = env_id
        self._task_id = ""
        self._obs: Observation | None = None
        self._done = False

    @property
    def done(self) -> bool:
        """Whether the current episode has finished."""
        return self._done

    @property
    def observation(self) -> Observation | None:
        """The most recent observation, or ``None`` if none has been stored."""
        return self._obs

    @abstractmethod
    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Start a new episode and return its first observation."""

    @abstractmethod
    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance one loop iteration and return ``(obs, reward, done, info)``."""

    def close(self) -> None:
        """Release resources. The base implementation does nothing."""

    def _reward(self, quality_score: float, success: bool) -> float:
        """Return the quality score, reduced by 0.5 when the step did not succeed."""
        if success:
            return quality_score
        return quality_score - 0.5
loopgym/envs/live.py ADDED
@@ -0,0 +1,80 @@
1
+ """LiveEnv — real LLM APIs (optional, user-provided keys)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from typing import Any
7
+
8
+ from loopgym.envs.base import LoopEnv, Observation
9
+ from loopgym.envs.sim import SimEnv
10
+ from loopgym.runtime.loop_runtime import LLMClient
11
+
12
+
13
class _OpenAILiveLLM:
    """Minimal client for OpenAI-compatible chat APIs.

    ``openai`` is an optional dependency and is imported only when an
    instance is created. ``tokens_used`` accumulates the total token count
    reported by each response.
    """

    def __init__(self, model: str = "gpt-4.1-mini", api_key: str | None = None) -> None:
        """Resolve the API key and build the underlying client.

        Raises:
            ValueError: No key was passed and ``OPENAI_API_KEY`` is unset or empty.
            ImportError: The ``openai`` package is not installed.
        """
        self.model = model
        self.tokens_used = 0
        # An explicit argument wins; otherwise fall back to the environment.
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
        if not self.api_key:
            raise ValueError(
                "LiveEnv requires OPENAI_API_KEY or api_key= parameter. "
                "Use SimEnv for keyless testing."
            )
        try:
            import openai  # noqa: PLC0415
        except ImportError as exc:
            raise ImportError("Install openai: pip install openai") from exc
        self._client = openai.OpenAI(api_key=self.api_key)

    def complete(self, prompt: str, role: str = "default") -> str:
        """Send ``prompt`` as the user message and return the first choice's text.

        ``role`` is interpolated into the system message. An empty string is
        returned when the response carries no content.
        """
        conversation = [
            {"role": "system", "content": f"You are a {role} agent in a loop."},
            {"role": "user", "content": prompt},
        ]
        response = self._client.chat.completions.create(
            model=self.model,
            messages=conversation,
            temperature=0.2,
        )
        text = response.choices[0].message.content or ""
        usage = response.usage
        if usage:
            self.tokens_used += usage.total_tokens
        return text
44
+
45
+
46
class LiveEnv(LoopEnv):
    """Live environment: a wrapped ``SimEnv`` whose runtime uses a real LLM client.

    All episode logic is delegated to the inner ``SimEnv``. When an ``llm`` is
    supplied it replaces the runtime's LLM after every ``reset``. The
    wrapper mirrors the inner environment's ``done`` flag and last
    observation, so the ``done`` and ``observation`` properties inherited
    from ``LoopEnv`` report the real episode state.
    """

    def __init__(
        self,
        env_id: str,
        spec: dict[str, Any],
        llm: LLMClient | None = None,
        **sim_kwargs: Any,
    ) -> None:
        """Build the inner ``SimEnv`` and remember the optional LLM client.

        Args:
            env_id: Environment identifier.
            spec: Loop spec forwarded to ``SimEnv``.
            llm: Client injected into the runtime on each ``reset``; ``None``
                leaves the runtime created by ``SimEnv`` untouched.
            **sim_kwargs: Extra keyword arguments forwarded to ``SimEnv``.
        """
        super().__init__(env_id)
        self._sim = SimEnv(env_id, spec, **sim_kwargs)
        self._llm = llm
        if llm is not None:
            self._sim._runtime = None  # will be set on reset with custom llm

    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Reset the inner environment, inject the custom LLM, and mirror its state."""
        obs = self._sim.reset(task_id=task_id, seed=seed, **kwargs)
        runtime = self._sim._runtime
        # Compare against None rather than truthiness so a client object that
        # happens to be falsy is still installed.
        if self._llm is not None and runtime is not None:
            runtime.llm = self._llm
        self._task_id = obs.task_id
        self._obs = obs
        self._done = self._sim.done
        return obs

    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance the inner environment one iteration and mirror its state.

        Without the mirroring, ``done`` on this wrapper would stay ``False``
        forever and any ``while not env.done`` driver would never terminate.
        """
        obs, reward, done, info = self._sim.step(action)
        self._obs = obs
        self._done = done
        return obs, reward, done, info

    def close(self) -> None:
        """Release the inner environment's resources."""
        self._sim.close()

    @classmethod
    def with_openai(
        cls,
        env_id: str,
        spec: dict[str, Any],
        model: str = "gpt-4.1-mini",
        api_key: str | None = None,
        **sim_kwargs: Any,
    ) -> LiveEnv:
        """Alternate constructor that wires in an ``_OpenAILiveLLM`` client."""
        return cls(env_id, spec, llm=_OpenAILiveLLM(model=model, api_key=api_key), **sim_kwargs)
loopgym/envs/replay.py ADDED
@@ -0,0 +1,197 @@
1
+ """ReplayEnv — replay LoopNet trajectories from ln/record-v1 JSONL."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import random
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from loopgym.envs.base import LoopEnv, Observation
12
+
13
+
14
def _default_loopnet_seed_path() -> Path | None:
    """Return the first existing LoopNet seed corpus path, or ``None``.

    Search order: the ``LOOPNET_SEED_PATH`` environment variable (when set
    and non-empty), then ``04-loopnet`` and ``loopnet`` next to the
    directory two levels above this file, then ``deps/loopnet`` inside it.
    """
    loopgym_root = Path(__file__).resolve().parents[2]
    seed_tail = Path("data") / "seed" / "records.jsonl"

    search_order: list[Path] = []
    override = os.environ.get("LOOPNET_SEED_PATH")
    if override:
        search_order.append(Path(override))
    search_order += [
        loopgym_root.parent / "04-loopnet" / seed_tail,
        loopgym_root.parent / "loopnet" / seed_tail,
        loopgym_root / "deps" / "loopnet" / seed_tail,
    ]
    return next((path for path in search_order if path.exists()), None)
32
+
33
+
34
def load_loopnet_records(path: Path) -> list[dict[str, Any]]:
    """Load LoopNet records from a JSONL file, one JSON object per line.

    Blank lines are skipped.

    Args:
        path: JSONL file to read (decoded as UTF-8).

    Returns:
        The parsed records in file order.

    Raises:
        ValueError: A line is not valid JSON, or parses to something other
            than a JSON object. The message carries ``path:line_no``.
    """
    records: list[dict[str, Any]] = []
    with path.open(encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            text = raw.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_no}: invalid JSON — {exc}") from exc
            # Callers use records as dicts (``record.get(...)``); reject other
            # JSON values here, where the offending line is still known.
            if not isinstance(record, dict):
                raise ValueError(
                    f"{path}:{line_no}: expected a JSON object, got {type(record).__name__}"
                )
            records.append(record)
    return records
47
+
48
+
49
def record_to_trajectory(record: dict[str, Any]) -> list[dict[str, Any]]:
    """Map the trajectory of an ln/record-v1 record to ReplayEnv step dicts.

    For every source step:

    * ``quality_score`` is ``goal_score``, else ``primary_quality``, else 0.0.
    * ``iteration`` is the step's own value, else its 1-based position.
    * ``output`` is a synthetic line built from the first 80 characters of
      the record objective, the iteration, and the quality score.

    A key that is present but ``None`` is treated like a missing key, so a
    record with explicit nulls no longer raises ``TypeError``.

    Args:
        record: A LoopNet record; a missing or ``None`` ``trajectory`` yields
            an empty list.

    Returns:
        One dict per source step, in order.
    """
    objective = str(record.get("objective", ""))
    steps: list[dict[str, Any]] = []
    for position, step in enumerate(record.get("trajectory") or [], start=1):
        raw_goal = step.get("goal_score")
        if raw_goal is None:
            raw_goal = step.get("primary_quality")
        goal = 0.0 if raw_goal is None else float(raw_goal)

        raw_iteration = step.get("iteration")
        iteration = position if raw_iteration is None else int(raw_iteration)

        steps.append(
            {
                "iteration": iteration,
                # Report the resolved iteration so the text never says
                # "iter=None" while the dict carries a real number.
                "output": (
                    f"[loopnet replay] {objective[:80]}… "
                    f"iter={iteration} goal={goal:.3f}"
                ),
                "quality_score": goal,
                "cost_usd": step.get("cost_usd"),
                "latency_seconds": step.get("latency_seconds"),
                "failure_codes": list(step.get("failure_codes") or []),
            }
        )
    return steps
69
+
70
+
71
class ReplayEnv(LoopEnv):
    """Replay recorded trajectories from LoopNet ln/record-v1 records.

    Steps come from one of three sources, tried in this order on ``reset``:

    1. ``trajectory_path`` — a JSON file with a ``steps`` or ``history`` list.
    2. A record from the JSONL corpus at ``records_path``.
    3. A built-in two-step fallback trajectory when neither yields steps.
    """

    # Objective reported when no LoopNet record backs the episode.
    _DEFAULT_OBJECTIVE = "Replay LoopNet trajectory"

    def __init__(
        self,
        env_id: str,
        trajectory_path: str | Path | None = None,
        records_path: str | Path | None = None,
    ) -> None:
        """Remember the replay sources; nothing is read until ``reset``.

        Args:
            env_id: Environment identifier.
            trajectory_path: Optional JSON trajectory file.
            records_path: Optional JSONL records corpus. When it is omitted
                or does not exist, the default seed corpus location is
                used instead (which may itself be ``None``).
        """
        super().__init__(env_id)
        self.trajectory_path = Path(trajectory_path) if trajectory_path else None
        resolved_records = Path(records_path) if records_path else None
        if resolved_records and resolved_records.exists():
            self.records_path = resolved_records
        else:
            self.records_path = _default_loopnet_seed_path()
        self._records: list[dict[str, Any]] = []
        self._record: dict[str, Any] | None = None
        self._trajectory: list[dict[str, Any]] = []
        self._index = 0

    def _load_records_corpus(self) -> None:
        """Load the records corpus once; do nothing if loaded or unavailable."""
        if self._records or not self.records_path or not self.records_path.exists():
            return
        self._records = load_loopnet_records(self.records_path)

    def _objective(self) -> str:
        """Return the active record's objective, or the default when there is none."""
        return str((self._record or {}).get("objective", self._DEFAULT_OBJECTIVE))

    def _select_record(self, task_id: str, seed: int | None) -> dict[str, Any] | None:
        """Pick a record for ``task_id``, falling back to a seeded random choice.

        A ``task_id`` starting with ``ln-`` is first matched against
        ``record_id``; any non-default ``task_id`` is then matched against
        ``loop_name``. If nothing matches, a record is drawn with
        ``random.Random(seed)`` (seed 0 when ``seed`` is ``None``).
        Returns ``None`` when the corpus is empty.
        """
        self._load_records_corpus()
        if not self._records:
            return None

        if task_id.startswith("ln-"):
            for record in self._records:
                if record.get("record_id") == task_id:
                    return record

        if task_id and task_id != "default":
            for record in self._records:
                if record.get("loop_name") == task_id:
                    return record

        rng = random.Random(seed if seed is not None else 0)
        return rng.choice(self._records)

    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Load a trajectory and return the observation for its first step.

        Args:
            task_id: Record ID or loop name used to select a record;
                empty means ``"default"``.
            seed: Seed for the random record choice when nothing matches.
            **kwargs: ``record_id`` selects a record by exact ID and takes
                precedence over ``task_id``.

        Returns:
            The first observation, with ``done`` set to ``False``.
        """
        record_id = kwargs.get("record_id")
        self._task_id = task_id or "default"
        self._index = 0
        self._trajectory = []
        self._done = False
        self._record = None

        if self.trajectory_path and self.trajectory_path.exists():
            with self.trajectory_path.open(encoding="utf-8") as handle:
                data = json.load(handle)
            self._trajectory = list(data.get("steps") or data.get("history") or [])
        else:
            if record_id:
                self._load_records_corpus()
                self._record = next(
                    (r for r in self._records if r.get("record_id") == record_id),
                    None,
                )
            else:
                self._record = self._select_record(self._task_id, seed)

            if self._record:
                self._trajectory = record_to_trajectory(self._record)
                self._task_id = str(self._record.get("record_id", self._task_id))

        if not self._trajectory:
            # No usable source: replay a tiny synthetic episode instead.
            self._trajectory = [
                {"iteration": 1, "output": "replay fallback step 1", "quality_score": 0.5},
                {"iteration": 2, "output": "replay fallback step 2", "quality_score": 0.85},
            ]

        step = self._trajectory[0]
        record = self._record or {}
        self._obs = Observation(
            task_id=self._task_id,
            iteration=int(step.get("iteration", 1)),
            output=str(step.get("output", "")),
            quality_score=float(step.get("quality_score", 0.0)),
            objective=self._objective(),
            done=False,
            info={
                "mode": "replay",
                "total_steps": len(self._trajectory),
                "record_id": record.get("record_id"),
                "outcome": record.get("outcome"),
            },
        )
        return self._obs

    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance to the next recorded step; ``action`` is ignored.

        Returns:
            ``(obs, reward, done, info)`` where the reward is the recorded
            quality score. Once the trajectory is exhausted the last
            observation is returned again with reward 0.0, ``done=True``
            and ``info == {"reason": "trajectory_exhausted"}``.
        """
        self._index += 1
        if self._index >= len(self._trajectory):
            self._done = True
            obs = self._obs
            if obs is None:
                obs = Observation(
                    task_id=self._task_id,
                    iteration=0,
                    output="",
                    quality_score=0.0,
                    objective=self._DEFAULT_OBJECTIVE,
                    done=True,
                )
            else:
                # A single-step trajectory leaves the reset observation with
                # done=False; keep it consistent with the returned flag.
                obs.done = True
            return obs, 0.0, True, {"reason": "trajectory_exhausted"}

        step = self._trajectory[self._index]
        quality = float(step.get("quality_score", 0.0))
        # The episode ends on the last recorded step, not one call later.
        self._done = self._index >= len(self._trajectory) - 1
        self._obs = Observation(
            task_id=self._task_id,
            iteration=int(step.get("iteration", self._index + 1)),
            output=str(step.get("output", "")),
            quality_score=quality,
            objective=self._objective(),
            done=self._done,
            info={
                "mode": "replay",
                "step_index": self._index,
                "record_id": (self._record or {}).get("record_id"),
                "failure_codes": step.get("failure_codes", []),
            },
        )
        return self._obs, quality, self._done, {"step_index": self._index}
loopgym/envs/sim.py ADDED
@@ -0,0 +1,183 @@
1
+ """SimEnv — mock LLM + mock oracles (no API keys)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from loopgym.envs.base import LoopEnv, Observation
10
+ from loopgym.runtime.compiler import compile_lss
11
+ from loopgym.runtime.loop_runtime import LoopRuntime, LoopState, MockLLM, load_lss_spec
12
+
13
+
14
def _task_input(spec: dict[str, Any], task_id: str, tasks_path: Path | None) -> str:
    """Resolve the input text for ``task_id``.

    Lookup order:

    1. The matching entry in the tasks file (its ``input`` value, or the
       whole entry when ``input`` is missing or falsy).
    2. The first item of ``spec["inputs"]["examples"]``.
    3. The literal ``task:<task_id>`` (``task:default`` for an empty ID).

    Dict payloads are serialised with ``json.dumps``; anything else goes
    through ``str``.
    """
    if tasks_path and tasks_path.exists():
        with tasks_path.open(encoding="utf-8") as fh:
            tasks = json.load(fh)
        for task in tasks.get("tasks", []):
            if task.get("id") != task_id:
                continue
            payload = task.get("input") or task
            if isinstance(payload, dict):
                return json.dumps(payload)
            return str(payload)

    examples = (spec.get("inputs") or {}).get("examples") or []
    if not examples:
        return f"task:{task_id or 'default'}"
    first = examples[0]
    return json.dumps(first) if isinstance(first, dict) else str(first)
31
+
32
+
33
def _mock_llm_seed(env_seed: int, task_id: str, loop_name: str) -> str:
    """Build the seed string ``<env_seed>:<task_id>:<loop_name>``.

    The same three inputs always give the same string.
    """
    parts = (env_seed, task_id, loop_name)
    return ":".join(f"{part}" for part in parts)
36
+
37
+
38
class SimEnv(LoopEnv):
    """Simulated loop environment driven by ``MockLLM`` — no API keys required."""

    def __init__(
        self,
        env_id: str,
        spec: dict[str, Any],
        spec_path: Path | None = None,
        tasks_path: Path | None = None,
        seed: int = 0,
    ) -> None:
        """Compile ``spec`` and store the configuration.

        The runtime and loop state are created by ``reset``, not here.
        """
        super().__init__(env_id)
        self.spec = spec
        self.spec_path = spec_path
        self.tasks_path = tasks_path
        self.seed = seed
        self._graph = compile_lss(spec)
        self._runtime: LoopRuntime | None = None
        self._state: LoopState | None = None
        self._user_input = ""

    @classmethod
    def from_spec_file(
        cls,
        env_id: str,
        spec_path: str | Path,
        tasks_path: str | Path | None = None,
        seed: int = 0,
    ) -> SimEnv:
        """Build an environment from a spec file on disk.

        When ``tasks_path`` is omitted, ``tasks.json`` next to the spec is
        used; a tasks file that does not exist is passed on as ``None``.
        """
        path = Path(spec_path)
        tasks = Path(tasks_path) if tasks_path else path.parent / "tasks.json"
        spec = load_lss_spec(path)
        existing_tasks = tasks if tasks.exists() else None
        return cls(
            env_id=env_id,
            spec=spec,
            spec_path=path,
            tasks_path=existing_tasks,
            seed=seed,
        )

    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Start a fresh episode and return an empty iteration-0 observation.

        A non-``None`` ``seed`` replaces the stored seed. A new ``MockLLM``,
        runtime and loop state are created on every call.
        """
        if seed is not None:
            self.seed = seed
        self._task_id = task_id or "default"
        self._user_input = _task_input(self.spec, self._task_id, self.tasks_path)

        loop_name = self._graph.loop_name
        llm = MockLLM(seed=_mock_llm_seed(self.seed, self._task_id, loop_name))
        self._runtime = LoopRuntime(self.spec, llm=llm)
        self._state = LoopState()
        self._done = False

        details = {
            "env_id": self.env_id,
            "seed": self.seed,
            "loop_name": self._graph.loop_name,
            "user_input": self._user_input,
        }
        self._obs = Observation(
            task_id=self._task_id,
            iteration=0,
            output="",
            quality_score=0.0,
            objective=self._graph.objective,
            done=False,
            info=details,
        )
        return self._obs

    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance one loop iteration and return ``(obs, reward, done, info)``.

        A dict ``action`` with a truthy ``output`` overrides the generated
        output: it is evaluated, recorded in the history with
        ``agent_override`` set, and checked for termination. Any other
        action lets the runtime take one step on its own.

        Raises:
            RuntimeError: ``reset`` has not been called yet.
        """
        runtime = self._runtime
        state = self._state
        if runtime is None or state is None:
            raise RuntimeError("Call reset() before step()")

        if self._done:
            # Stepping a finished episode repeats the last observation.
            finished = self._obs or Observation(
                task_id=self._task_id,
                iteration=0,
                output="",
                quality_score=0.0,
                objective=self._graph.objective,
                done=True,
            )
            return finished, 0.0, True, {"reason": "already_done"}

        if isinstance(action, dict) and action.get("output"):
            state.output = str(action["output"])
            state.quality_score, feedback = runtime._evaluate(state, self._user_input)
            state.iteration += 1
            entry = {
                "iteration": state.iteration,
                "output": state.output,
                "quality_score": state.quality_score,
                "feedback": feedback,
                "agent_override": True,
            }
            state.history.append(entry)
            runtime._check_termination(state)
        else:
            state = runtime.step_once(state, self._user_input)
            self._state = state

        self._done = state.terminated
        success = state.quality_score >= runtime.quality_threshold
        reward = self._reward(state.quality_score, success)

        info = {
            "iteration": state.iteration,
            "termination_reason": state.termination_reason,
            "success": success,
            "history_len": len(state.history),
        }
        if state.history:
            info["last_feedback"] = state.history[-1].get("feedback", "")

        self._obs = Observation(
            task_id=self._task_id,
            iteration=state.iteration,
            output=state.output,
            quality_score=state.quality_score,
            objective=self._graph.objective,
            done=self._done,
            info=info,
        )
        return self._obs, reward, self._done, info

    def run_episode(self, task_id: str = "", seed: int | None = None) -> dict[str, Any]:
        """Reset, step until done, and return a summary of the episode.

        The summary holds the task ID, seed, step count, summed reward, the
        final step's success flag, the final quality score and the
        per-iteration trajectory taken from the loop state's history.
        """
        self.reset(task_id=task_id, seed=seed)
        total_reward = 0.0
        steps = 0
        while not self.done:
            _, reward, _, info = self.step()
            total_reward += reward
            steps += 1

        history = self._state.history if self._state else []
        kept_keys = ("iteration", "output", "quality_score")
        trajectory = [{key: entry[key] for key in kept_keys} for entry in history]
        final_quality = self._obs.quality_score if self._obs else 0.0
        return {
            "task_id": self._task_id,
            "seed": self.seed,
            "steps": steps,
            "total_reward": total_reward,
            "success": info.get("success", False),
            "quality_score": final_quality,
            "trajectory": trajectory,
        }
@@ -0,0 +1,6 @@
1
+ """Evaluator plugins for loop execution."""
2
+
3
+ from loopgym.evaluators.deterministic import run_deterministic
4
+ from loopgym.evaluators.rubric import run_rubric
5
+
6
+ __all__ = ["run_deterministic", "run_rubric"]
@@ -0,0 +1,41 @@
1
+ """Deterministic evaluator implementations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+
9
def run_deterministic(implementation: str, output: str, context: dict[str, Any] | None = None) -> dict[str, Any]:
    """Run a deterministic evaluator selected by its implementation reference.

    Supported references and the ``context`` keys they read:

    * ``evaluators.word_count_max`` — ``max_words`` (default 100); passes
      when ``output`` has at most that many whitespace-separated words.
    * ``evaluators.test_pass_rate`` — ``mock_pass_rate`` (default 0.0) and
      ``threshold`` (default 1.0); passes when the rate meets the threshold.
    * ``evaluators.citation_count_min`` — ``min_citations`` (default 3);
      counts ``[n]`` and ``(yyyy)`` patterns in ``output``.

    Any other reference is treated as an unconditional pass.

    Args:
        implementation: Evaluator reference string.
        output: Text being evaluated.
        context: Evaluator parameters; ``None`` means no parameters.

    Returns:
        A dict with ``passed``, ``score`` and ``failure_codes``, plus one
        evaluator-specific metric. For the word-count and citation
        evaluators a passing result always scores 1.0.
    """
    ctx = context or {}

    if implementation == "evaluators.word_count_max":
        max_words = int(ctx.get("max_words", 100))
        count = len(output.split())
        passed = count <= max_words
        # Passing always scores 1.0; this also covers max_words == 0 with
        # empty output, which the plain ratio would score as 0.0.
        score = 1.0 if passed else min(1.0, max_words / max(count, 1))
        return {
            "passed": passed,
            "score": score,
            "word_count": count,
            "failure_codes": [] if passed else ["fail.false_fail"],
        }

    if implementation == "evaluators.test_pass_rate":
        rate = float(ctx.get("mock_pass_rate", 0.0))
        passed = rate >= float(ctx.get("threshold", 1.0))
        return {
            "passed": passed,
            "score": rate,
            "test_pass_rate": rate,
            "failure_codes": [] if passed else ["fail.false_fail"],
        }

    if implementation == "evaluators.citation_count_min":
        citations = len(re.findall(r"\[[\d]+\]|\(\d{4}\)", output))
        min_citations = int(ctx.get("min_citations", 3))
        passed = citations >= min_citations
        # Divide only on failure: failing implies min_citations > citations
        # >= 0, so the divisor is positive and min_citations == 0 can no
        # longer raise ZeroDivisionError.
        score = 1.0 if passed else min(1.0, citations / min_citations)
        return {
            "passed": passed,
            "score": score,
            "citation_count": citations,
            "failure_codes": [] if passed else ["fail.false_fail"],
        }

    return {"passed": True, "score": 1.0, "failure_codes": []}
@@ -0,0 +1,45 @@
1
+ """LLM rubric evaluator helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Protocol
6
+
7
+
8
class RubricLLM(Protocol):
    """Structural type for an LLM backend accepted by the rubric evaluator."""

    def complete(self, prompt: str, role: str = "default") -> str:
        """Return the backend's text reply to ``prompt`` for the given ``role``."""
        ...
10
+
11
+
12
def run_rubric(
    llm: RubricLLM,
    output: str,
    objective: str,
    rubric: dict[str, Any],
    role: str = "evaluator",
) -> dict[str, Any]:
    """Score ``output`` against an LSS rubric using an LLM backend.

    The backend is asked once to evaluate the output on the rubric's
    dimension names. The score is the last whitespace-separated token of
    the reply that parses as a float in [0, 1] (a trailing ``.`` is
    ignored), or 0.0 when no token qualifies. Every dimension receives that
    same score.

    Args:
        llm: Backend providing ``complete(prompt, role=...)``.
        output: Text being evaluated.
        objective: Objective quoted in the evaluation prompt.
        rubric: Rubric dict; reads ``pass_threshold`` (default 0.8) and
            ``dimensions`` (a list of dicts with an optional ``name``).
        role: Role passed to the backend.

    Returns:
        A dict with ``passed``, ``score``, ``feedback``, ``failure_codes``
        and ``dimension_scores``.
    """
    threshold = float(rubric.get("pass_threshold", 0.8))
    dimensions = rubric.get("dimensions") or []
    names = [dimension.get("name", "quality") for dimension in dimensions]
    dim_names = ", ".join(names) or "quality"

    prompt = f"Evaluate ({dim_names}) against objective '{objective}':\n{output}"
    feedback = llm.complete(prompt, role=role)

    score = 0.0
    for token in feedback.split():
        try:
            candidate = float(token.rstrip("."))
        except ValueError:
            # Not a number — ordinary prose in the reply.
            continue
        if 0.0 <= candidate <= 1.0:
            # Later in-range numbers override earlier ones.
            score = candidate

    passed = score >= threshold
    failure_codes = [] if passed else ["fail.self_grade"]
    return {
        "passed": passed,
        "score": score,
        "feedback": feedback,
        "failure_codes": failure_codes,
        "dimension_scores": dict.fromkeys(names, score),
    }