loopgym 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopgym/__init__.py +7 -0
- loopgym/cli.py +47 -0
- loopgym/envs/__init__.py +8 -0
- loopgym/envs/base.py +63 -0
- loopgym/envs/live.py +80 -0
- loopgym/envs/replay.py +197 -0
- loopgym/envs/sim.py +183 -0
- loopgym/evaluators/__init__.py +6 -0
- loopgym/evaluators/deterministic.py +41 -0
- loopgym/evaluators/rubric.py +45 -0
- loopgym/registry.py +122 -0
- loopgym/runtime/__init__.py +19 -0
- loopgym/runtime/compiler.py +100 -0
- loopgym/runtime/loop_runtime.py +292 -0
- loopgym-0.1.0.dist-info/METADATA +188 -0
- loopgym-0.1.0.dist-info/RECORD +19 -0
- loopgym-0.1.0.dist-info/WHEEL +4 -0
- loopgym-0.1.0.dist-info/entry_points.txt +2 -0
- loopgym-0.1.0.dist-info/licenses/LICENSE +21 -0
loopgym/__init__.py
ADDED
loopgym/cli.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""LoopGym CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
import loopgym as lg
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main() -> None:
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Two subcommands are registered, ``list`` and ``run``. Each one stores its
    handler on the parsed namespace under ``func``; that handler is then called
    with the namespace itself.
    """
    cli = argparse.ArgumentParser(description="LoopGym — run loop environments")
    commands = cli.add_subparsers(dest="command", required=True)

    # ``list`` has no options of its own.
    lister = commands.add_parser("list", help="List registered environments")
    lister.set_defaults(func=_cmd_list)

    # ``run`` takes a positional environment ID plus optional settings.
    runner = commands.add_parser("run", help="Run an environment episode")
    runner.add_argument("env_id", help="Environment ID")
    optional_flags = (
        ("--task-id", {"default": "default"}),
        ("--seed", {"type": int, "default": 0}),
        ("--spec-path", {"default": None}),
    )
    for flag, options in optional_flags:
        runner.add_argument(flag, **options)
    runner.set_defaults(func=_cmd_run)

    parsed = cli.parse_args()
    parsed.func(parsed)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _cmd_list(_args: argparse.Namespace) -> None:
    """Print each environment ID returned by ``lg.list_envs()`` on its own line.

    The parsed arguments are accepted for handler-signature uniformity and are
    not used.
    """
    registered = lg.list_envs()
    for identifier in registered:
        print(identifier)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _cmd_run(args: argparse.Namespace) -> None:
    """Create the requested environment and run one episode, printing results.

    If the environment offers ``run_episode`` the whole episode is delegated to
    it and the returned mapping is printed as indented JSON. Otherwise the
    episode is driven manually with ``reset``/``step`` and one line is printed
    per step.

    Args:
        args: Parsed CLI namespace providing ``env_id``, ``task_id``, ``seed``
            and ``spec_path``.
    """
    env = lg.make(args.env_id, spec_path=args.spec_path, seed=args.seed)
    try:
        if hasattr(env, "run_episode"):
            result = env.run_episode(task_id=args.task_id, seed=args.seed)
            print(json.dumps(result, indent=2))
            return

        env.reset(task_id=args.task_id, seed=args.seed)
        while not env.done:
            obs, reward, done, _info = env.step()
            print(f"step reward={reward:.3f} quality={obs.quality_score:.3f} done={done}")
            # Trust the flag returned by step() as well: an environment whose
            # ``done`` property lags behind would otherwise loop forever.
            if done:
                break
    finally:
        # Always release the environment, even when the episode raises.
        close = getattr(env, "close", None)
        if callable(close):
            close()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
loopgym/envs/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Loop environment implementations."""
|
|
2
|
+
|
|
3
|
+
from loopgym.envs.base import LoopEnv, Observation
|
|
4
|
+
from loopgym.envs.live import LiveEnv
|
|
5
|
+
from loopgym.envs.replay import ReplayEnv
|
|
6
|
+
from loopgym.envs.sim import SimEnv
|
|
7
|
+
|
|
8
|
+
__all__ = ["LoopEnv", "LiveEnv", "Observation", "ReplayEnv", "SimEnv"]
|
loopgym/envs/base.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Loop environment base classes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Observation:
    """Snapshot of loop state handed back by ``reset`` and ``step``."""

    task_id: str
    iteration: int
    output: str
    quality_score: float
    objective: str
    done: bool
    # Each instance gets its own dict unless one is passed in explicitly.
    info: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, in declaration order.

        Values are taken as-is; in particular ``info`` is the same dict object
        held by the observation, not a copy.
        """
        names = (
            "task_id",
            "iteration",
            "output",
            "quality_score",
            "objective",
            "done",
            "info",
        )
        return {name: getattr(self, name) for name in names}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class LoopEnv(ABC):
    """Base class for loop environments exposing a Gym-like reset/step API."""

    def __init__(self, env_id: str) -> None:
        """Store the environment ID and start with no episode in progress."""
        self.env_id = env_id
        self._task_id = ""
        self._obs: Observation | None = None
        self._done = False

    @property
    def done(self) -> bool:
        """Current value of the episode-finished flag."""
        return self._done

    @property
    def observation(self) -> Observation | None:
        """Most recently stored observation, or ``None`` if there is none."""
        return self._obs

    @abstractmethod
    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Begin a new episode and return its first observation."""

    @abstractmethod
    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Run one loop iteration and return ``(obs, reward, done, info)``."""

    def close(self) -> None:
        """Release resources; the base implementation does nothing."""

    def _reward(self, quality_score: float, success: bool) -> float:
        """Return the quality score, reduced by 0.5 when *success* is false."""
        if success:
            return quality_score
        return quality_score - 0.5
|
loopgym/envs/live.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""LiveEnv — real LLM APIs (optional, user-provided keys)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from loopgym.envs.base import LoopEnv, Observation
|
|
9
|
+
from loopgym.envs.sim import SimEnv
|
|
10
|
+
from loopgym.runtime.loop_runtime import LLMClient
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _OpenAILiveLLM:
|
|
14
|
+
"""Thin wrapper for OpenAI-compatible APIs (optional dependency)."""
|
|
15
|
+
|
|
16
|
+
def __init__(self, model: str = "gpt-4.1-mini", api_key: str | None = None) -> None:
|
|
17
|
+
self.model = model
|
|
18
|
+
self.api_key = api_key or os.environ.get("OPENAI_API_KEY", "")
|
|
19
|
+
self.tokens_used = 0
|
|
20
|
+
if not self.api_key:
|
|
21
|
+
raise ValueError(
|
|
22
|
+
"LiveEnv requires OPENAI_API_KEY or api_key= parameter. "
|
|
23
|
+
"Use SimEnv for keyless testing."
|
|
24
|
+
)
|
|
25
|
+
try:
|
|
26
|
+
import openai # noqa: PLC0415
|
|
27
|
+
except ImportError as exc:
|
|
28
|
+
raise ImportError("Install openai: pip install openai") from exc
|
|
29
|
+
self._client = openai.OpenAI(api_key=self.api_key)
|
|
30
|
+
|
|
31
|
+
def complete(self, prompt: str, role: str = "default") -> str:
|
|
32
|
+
response = self._client.chat.completions.create(
|
|
33
|
+
model=self.model,
|
|
34
|
+
messages=[
|
|
35
|
+
{"role": "system", "content": f"You are a {role} agent in a loop."},
|
|
36
|
+
{"role": "user", "content": prompt},
|
|
37
|
+
],
|
|
38
|
+
temperature=0.2,
|
|
39
|
+
)
|
|
40
|
+
text = response.choices[0].message.content or ""
|
|
41
|
+
if response.usage:
|
|
42
|
+
self.tokens_used += response.usage.total_tokens
|
|
43
|
+
return text
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class LiveEnv(LoopEnv):
    """Environment that drives a wrapped ``SimEnv`` with a caller-supplied LLM.

    All episode logic is delegated to an internal ``SimEnv``. When an ``llm``
    client is given, it is assigned to the sim's runtime after every reset.
    The ``done`` and ``observation`` properties inherited from ``LoopEnv`` are
    kept in sync with the wrapped environment.
    """

    def __init__(
        self,
        env_id: str,
        spec: dict[str, Any],
        llm: LLMClient | None = None,
        **sim_kwargs: Any,
    ) -> None:
        """Build the wrapped ``SimEnv`` and remember the optional LLM client.

        Args:
            env_id: Environment identifier, also passed to the ``SimEnv``.
            spec: Loop specification passed to the ``SimEnv``.
            llm: Client to install on the runtime after each reset; ``None``
                leaves the sim's own LLM in place.
            **sim_kwargs: Extra keyword arguments forwarded to ``SimEnv``.
        """
        super().__init__(env_id)
        self._sim = SimEnv(env_id, spec, **sim_kwargs)
        self._llm = llm
        if llm is not None:
            self._sim._runtime = None  # will be set on reset with custom llm

    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Reset the wrapped sim, install the custom LLM, and return its observation."""
        obs = self._sim.reset(task_id=task_id, seed=seed, **kwargs)
        # Identity checks: a client object that happens to be falsy must still
        # be installed.
        if self._llm is not None and self._sim._runtime is not None:
            self._sim._runtime.llm = self._llm
        # Mirror the sim's state so the inherited properties are accurate.
        self._obs = obs
        self._task_id = obs.task_id
        self._done = self._sim.done
        return obs

    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance the wrapped sim one step and return ``(obs, reward, done, info)``."""
        obs, reward, done, info = self._sim.step(action)
        # Without this, ``done`` would stay False forever and any
        # ``while not env.done`` driver would never terminate.
        self._obs = obs
        self._done = done
        return obs, reward, done, info

    def close(self) -> None:
        """Release the wrapped environment's resources."""
        self._sim.close()

    @classmethod
    def with_openai(
        cls,
        env_id: str,
        spec: dict[str, Any],
        model: str = "gpt-4.1-mini",
        api_key: str | None = None,
        **sim_kwargs: Any,
    ) -> LiveEnv:
        """Alternate constructor that builds an ``_OpenAILiveLLM`` client.

        ``model`` and ``api_key`` go to the client; everything else is passed
        through to the regular constructor.
        """
        client = _OpenAILiveLLM(model=model, api_key=api_key)
        return cls(env_id, spec, llm=client, **sim_kwargs)
|
loopgym/envs/replay.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""ReplayEnv — replay LoopNet trajectories from ln/record-v1 JSONL."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import random
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from loopgym.envs.base import LoopEnv, Observation
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _default_loopnet_seed_path() -> Path | None:
|
|
15
|
+
"""Resolve LoopNet seed corpus from env, sibling clone, or CI deps layout."""
|
|
16
|
+
loopgym_root = Path(__file__).resolve().parents[2]
|
|
17
|
+
candidates: list[Path] = []
|
|
18
|
+
env_path = os.environ.get("LOOPNET_SEED_PATH")
|
|
19
|
+
if env_path:
|
|
20
|
+
candidates.append(Path(env_path))
|
|
21
|
+
candidates.extend(
|
|
22
|
+
[
|
|
23
|
+
loopgym_root.parent / "04-loopnet" / "data" / "seed" / "records.jsonl",
|
|
24
|
+
loopgym_root.parent / "loopnet" / "data" / "seed" / "records.jsonl",
|
|
25
|
+
loopgym_root / "deps" / "loopnet" / "data" / "seed" / "records.jsonl",
|
|
26
|
+
]
|
|
27
|
+
)
|
|
28
|
+
for candidate in candidates:
|
|
29
|
+
if candidate.exists():
|
|
30
|
+
return candidate
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def load_loopnet_records(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL file and return one parsed value per non-blank line.

    Args:
        path: File to read; it is opened as UTF-8 text.

    Raises:
        ValueError: If a line is not valid JSON. The message names the file
            and the 1-based line number.
    """
    parsed: list[dict[str, Any]] = []
    with path.open(encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    parsed.append(json.loads(text))
                except json.JSONDecodeError as exc:
                    raise ValueError(f"{path}:{line_no}: invalid JSON — {exc}") from exc
    return parsed
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def record_to_trajectory(record: dict[str, Any]) -> list[dict[str, Any]]:
    """Convert a record's ``trajectory`` entries into ReplayEnv step dicts.

    Each produced step has the keys ``iteration``, ``output``,
    ``quality_score``, ``cost_usd``, ``latency_seconds`` and ``failure_codes``.

    * The quality score comes from ``goal_score``, falling back to
      ``primary_quality`` and then ``0.0``. An explicit ``None`` is treated
      the same as a missing key instead of crashing ``float()``.
    * The iteration comes from the step's ``iteration``, falling back to its
      1-based position. The same resolved value is used in the ``output``
      text, so the text never reads ``iter=None``.

    Args:
        record: Mapping with optional ``objective`` and ``trajectory`` keys.

    Returns:
        One step dict per trajectory entry; empty when there is no trajectory.
    """
    objective = str(record.get("objective", ""))
    steps: list[dict[str, Any]] = []
    for position, step in enumerate(record.get("trajectory") or [], start=1):
        raw_goal = step.get("goal_score")
        if raw_goal is None:
            raw_goal = step.get("primary_quality")
        goal = 0.0 if raw_goal is None else float(raw_goal)

        raw_iteration = step.get("iteration")
        iteration = position if raw_iteration is None else int(raw_iteration)

        steps.append(
            {
                "iteration": iteration,
                "output": (
                    f"[loopnet replay] {objective[:80]}… "
                    f"iter={iteration} goal={goal:.3f}"
                ),
                "quality_score": goal,
                "cost_usd": step.get("cost_usd"),
                "latency_seconds": step.get("latency_seconds"),
                # Copy so later mutation does not touch the source record.
                "failure_codes": list(step.get("failure_codes") or []),
            }
        )
    return steps
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class ReplayEnv(LoopEnv):
    """Replay recorded trajectories from LoopNet ln/record-v1 records.

    A trajectory comes from one of three places, in priority order: an
    explicit trajectory JSON file, a record chosen from the JSONL corpus, or a
    built-in two-step fallback. ``reset`` returns the first recorded step and
    each ``step`` call advances to the next one.
    """

    def __init__(
        self,
        env_id: str,
        trajectory_path: str | Path | None = None,
        records_path: str | Path | None = None,
    ) -> None:
        """Remember where to load data from; nothing is read until ``reset``.

        Args:
            env_id: Environment identifier.
            trajectory_path: Optional JSON file holding ``steps`` or
                ``history``.
            records_path: Optional JSONL corpus. When missing or nonexistent,
                the default seed-path lookup is used instead.
        """
        super().__init__(env_id)
        self.trajectory_path = Path(trajectory_path) if trajectory_path else None
        resolved_records = Path(records_path) if records_path else None
        if resolved_records and resolved_records.exists():
            self.records_path = resolved_records
        else:
            self.records_path = _default_loopnet_seed_path()
        self._records: list[dict[str, Any]] = []
        self._record: dict[str, Any] | None = None
        self._trajectory: list[dict[str, Any]] = []
        self._index = 0

    def _load_records_corpus(self) -> None:
        """Load the corpus once; do nothing if loaded or no file is available."""
        if self._records or not self.records_path or not self.records_path.exists():
            return
        self._records = load_loopnet_records(self.records_path)

    def _objective(self) -> str:
        """Return the selected record's objective, or a generic placeholder."""
        return str((self._record or {}).get("objective", "Replay LoopNet trajectory"))

    def _select_record(self, task_id: str, seed: int | None) -> dict[str, Any] | None:
        """Pick a record for *task_id*, falling back to a seeded random choice.

        Matching order: ``record_id`` (only for IDs starting with ``ln-``),
        then ``loop_name`` (for any non-default ID), then a random record
        drawn with ``random.Random(seed)``, where a ``None`` seed is treated
        as 0. Returns ``None`` when the corpus is empty.
        """
        self._load_records_corpus()
        if not self._records:
            return None

        if task_id.startswith("ln-"):
            for record in self._records:
                if record.get("record_id") == task_id:
                    return record

        if task_id and task_id != "default":
            for record in self._records:
                if record.get("loop_name") == task_id:
                    return record

        rng = random.Random(seed if seed is not None else 0)
        return rng.choice(self._records)

    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Load a trajectory and return an observation for its first step.

        Args:
            task_id: Record ID or loop name to replay; empty means
                ``"default"``.
            seed: Seed for the random record choice when nothing matches.
            **kwargs: ``record_id`` may be given to select a record directly.

        Returns:
            The first step's observation, with ``done=False``.
        """
        record_id = kwargs.get("record_id")
        self._task_id = task_id or "default"
        self._index = 0
        self._trajectory = []
        self._done = False
        self._record = None

        if self.trajectory_path and self.trajectory_path.exists():
            with self.trajectory_path.open(encoding="utf-8") as handle:
                data = json.load(handle)
            self._trajectory = list(data.get("steps") or data.get("history") or [])
        else:
            if record_id:
                self._load_records_corpus()
                for record in self._records:
                    if record.get("record_id") == record_id:
                        self._record = record
                        break
            else:
                self._record = self._select_record(self._task_id, seed)

            if self._record:
                self._trajectory = record_to_trajectory(self._record)
                self._task_id = str(self._record.get("record_id", self._task_id))

        if not self._trajectory:
            # Nothing could be loaded: use a tiny canned trajectory so the
            # environment stays usable.
            self._trajectory = [
                {"iteration": 1, "output": "replay fallback step 1", "quality_score": 0.5},
                {"iteration": 2, "output": "replay fallback step 2", "quality_score": 0.85},
            ]

        step = self._trajectory[0]
        self._obs = Observation(
            task_id=self._task_id,
            iteration=int(step.get("iteration", 1)),
            output=str(step.get("output", "")),
            quality_score=float(step.get("quality_score", 0.0)),
            objective=self._objective(),
            done=False,
            info={
                "mode": "replay",
                "total_steps": len(self._trajectory),
                "record_id": (self._record or {}).get("record_id"),
                "outcome": (self._record or {}).get("outcome"),
            },
        )
        return self._obs

    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance to the next recorded step; *action* is ignored.

        Returns:
            ``(obs, reward, done, info)`` where the reward is the step's
            quality score. Once the trajectory is exhausted the reward is 0.0,
            ``done`` is True, and ``info`` carries
            ``{"reason": "trajectory_exhausted"}``.
        """
        self._index += 1
        if self._index >= len(self._trajectory):
            self._done = True
            previous = self._obs
            if previous is None:
                # step() was called without a prior reset().
                terminal = Observation(
                    task_id=self._task_id,
                    iteration=0,
                    output="",
                    quality_score=0.0,
                    objective="Replay LoopNet trajectory",
                    done=True,
                )
            elif previous.done:
                terminal = previous
            else:
                # The stored observation predates termination (for example a
                # single-step trajectory). Re-issue it with done=True so the
                # observation agrees with the returned flag.
                terminal = Observation(
                    task_id=previous.task_id,
                    iteration=previous.iteration,
                    output=previous.output,
                    quality_score=previous.quality_score,
                    objective=previous.objective,
                    done=True,
                    info=dict(previous.info),
                )
                self._obs = terminal
            return terminal, 0.0, True, {"reason": "trajectory_exhausted"}

        step = self._trajectory[self._index]
        quality = float(step.get("quality_score", 0.0))
        # The last recorded step ends the episode.
        self._done = self._index >= len(self._trajectory) - 1
        self._obs = Observation(
            task_id=self._task_id,
            iteration=int(step.get("iteration", self._index + 1)),
            output=str(step.get("output", "")),
            quality_score=quality,
            objective=self._objective(),
            done=self._done,
            info={
                "mode": "replay",
                "step_index": self._index,
                "record_id": (self._record or {}).get("record_id"),
                "failure_codes": step.get("failure_codes", []),
            },
        )
        return self._obs, quality, self._done, {"step_index": self._index}
|
loopgym/envs/sim.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""SimEnv — mock LLM + mock oracles (no API keys)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from loopgym.envs.base import LoopEnv, Observation
|
|
10
|
+
from loopgym.runtime.compiler import compile_lss
|
|
11
|
+
from loopgym.runtime.loop_runtime import LoopRuntime, LoopState, MockLLM, load_lss_spec
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _task_input(spec: dict[str, Any], task_id: str, tasks_path: Path | None) -> str:
|
|
15
|
+
"""Resolve task input from tasks.json or spec examples."""
|
|
16
|
+
if tasks_path and tasks_path.exists():
|
|
17
|
+
with tasks_path.open(encoding="utf-8") as fh:
|
|
18
|
+
tasks = json.load(fh)
|
|
19
|
+
for task in tasks.get("tasks", []):
|
|
20
|
+
if task.get("id") == task_id:
|
|
21
|
+
payload = task.get("input") or task
|
|
22
|
+
return json.dumps(payload) if isinstance(payload, dict) else str(payload)
|
|
23
|
+
inputs = spec.get("inputs") or {}
|
|
24
|
+
examples = inputs.get("examples") or []
|
|
25
|
+
if examples:
|
|
26
|
+
first = examples[0]
|
|
27
|
+
if isinstance(first, dict):
|
|
28
|
+
return json.dumps(first)
|
|
29
|
+
return str(first)
|
|
30
|
+
return f"task:{task_id or 'default'}"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _mock_llm_seed(env_seed: int, task_id: str, loop_name: str) -> str:
|
|
34
|
+
"""Deterministic seed for reproducible trajectories."""
|
|
35
|
+
return f"{env_seed}:{task_id}:{loop_name}"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class SimEnv(LoopEnv):
    """Keyless simulation environment backed by ``MockLLM``."""

    def __init__(
        self,
        env_id: str,
        spec: dict[str, Any],
        spec_path: Path | None = None,
        tasks_path: Path | None = None,
        seed: int = 0,
    ) -> None:
        """Store the configuration and compile the spec.

        The runtime and loop state are created later, by ``reset``.
        """
        super().__init__(env_id)
        self.spec = spec
        self.spec_path = spec_path
        self.tasks_path = tasks_path
        self.seed = seed
        self._graph = compile_lss(spec)
        self._runtime: LoopRuntime | None = None
        self._state: LoopState | None = None
        self._user_input = ""

    @classmethod
    def from_spec_file(
        cls,
        env_id: str,
        spec_path: str | Path,
        tasks_path: str | Path | None = None,
        seed: int = 0,
    ) -> SimEnv:
        """Build an environment from a spec file on disk.

        When *tasks_path* is not given, ``tasks.json`` next to the spec file
        is used. A tasks file that does not exist is passed on as ``None``.
        """
        spec_file = Path(spec_path)
        if tasks_path:
            tasks_file = Path(tasks_path)
        else:
            tasks_file = spec_file.parent / "tasks.json"
        loaded_spec = load_lss_spec(spec_file)
        return cls(
            env_id=env_id,
            spec=loaded_spec,
            spec_path=spec_file,
            tasks_path=tasks_file if tasks_file.exists() else None,
            seed=seed,
        )

    def reset(self, task_id: str = "", seed: int | None = None, **kwargs: Any) -> Observation:
        """Start a new episode with a fresh runtime and loop state.

        A non-``None`` *seed* replaces the stored seed. The returned
        observation is at iteration 0 with empty output.
        """
        if seed is not None:
            self.seed = seed
        self._task_id = task_id or "default"
        self._user_input = _task_input(self.spec, self._task_id, self.tasks_path)

        # The mock LLM is seeded from (seed, task, loop name).
        mock = MockLLM(seed=_mock_llm_seed(self.seed, self._task_id, self._graph.loop_name))
        self._runtime = LoopRuntime(self.spec, llm=mock)
        self._state = LoopState()
        self._done = False

        details = {
            "env_id": self.env_id,
            "seed": self.seed,
            "loop_name": self._graph.loop_name,
            "user_input": self._user_input,
        }
        self._obs = Observation(
            task_id=self._task_id,
            iteration=0,
            output="",
            quality_score=0.0,
            objective=self._graph.objective,
            done=False,
            info=details,
        )
        return self._obs

    def step(self, action: Any = None) -> tuple[Observation, float, bool, dict[str, Any]]:
        """Advance the loop by one iteration.

        When *action* is a dict with a truthy ``output``, that text replaces
        the loop's output and is evaluated directly. Otherwise the runtime
        performs a normal step.

        Returns:
            ``(obs, reward, done, info)``. After the episode has ended, the
            last observation is returned with reward 0.0 and
            ``{"reason": "already_done"}``.

        Raises:
            RuntimeError: If called before ``reset``.
        """
        runtime = self._runtime
        state = self._state
        if runtime is None or state is None:
            raise RuntimeError("Call reset() before step()")

        if self._done:
            final = self._obs
            if final is None:
                final = Observation(
                    task_id=self._task_id,
                    iteration=0,
                    output="",
                    quality_score=0.0,
                    objective=self._graph.objective,
                    done=True,
                )
            return final, 0.0, True, {"reason": "already_done"}

        if isinstance(action, dict) and action.get("output"):
            # Agent-supplied output: score it and record it in history.
            state.output = str(action["output"])
            state.quality_score, feedback = runtime._evaluate(state, self._user_input)
            state.iteration += 1
            entry = {
                "iteration": state.iteration,
                "output": state.output,
                "quality_score": state.quality_score,
                "feedback": feedback,
                "agent_override": True,
            }
            state.history.append(entry)
            runtime._check_termination(state)
        else:
            state = runtime.step_once(state, self._user_input)
            self._state = state

        self._done = state.terminated
        success = state.quality_score >= runtime.quality_threshold
        reward = self._reward(state.quality_score, success)

        info = {
            "iteration": state.iteration,
            "termination_reason": state.termination_reason,
            "success": success,
            "history_len": len(state.history),
        }
        if state.history:
            info["last_feedback"] = state.history[-1].get("feedback", "")

        self._obs = Observation(
            task_id=self._task_id,
            iteration=state.iteration,
            output=state.output,
            quality_score=state.quality_score,
            objective=self._graph.objective,
            done=self._done,
            info=info,
        )
        return self._obs, reward, self._done, info

    def run_episode(self, task_id: str = "", seed: int | None = None) -> dict[str, Any]:
        """Reset, step until done, and return a summary of the episode.

        The summary holds the task ID, seed, step count, summed reward, the
        last step's success flag, the final quality score, and a trajectory
        of ``iteration``/``output``/``quality_score`` entries from the loop
        history.
        """
        self.reset(task_id=task_id, seed=seed)
        steps = 0
        total_reward = 0.0
        while not self.done:
            _obs, reward, _done, info = self.step()
            steps += 1
            total_reward += reward

        trajectory = []
        for h in (self._state.history if self._state else []):
            trajectory.append(
                {
                    "iteration": h["iteration"],
                    "output": h["output"],
                    "quality_score": h["quality_score"],
                }
            )
        return {
            "task_id": self._task_id,
            "seed": self.seed,
            "steps": steps,
            "total_reward": total_reward,
            "success": info.get("success", False),
            "quality_score": self._obs.quality_score if self._obs else 0.0,
            "trajectory": trajectory,
        }
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Deterministic evaluator implementations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def run_deterministic(implementation: str, output: str, context: dict[str, Any] | None = None) -> dict[str, Any]:
    """Run a deterministic evaluator selected by its implementation reference.

    Supported references:

    * ``evaluators.word_count_max`` passes when the whitespace-separated word
      count is at most ``max_words`` (default 100).
    * ``evaluators.test_pass_rate`` passes when ``mock_pass_rate`` (default
      0.0) reaches ``threshold`` (default 1.0).
    * ``evaluators.citation_count_min`` passes when at least
      ``min_citations`` (default 3) markers like ``[12]`` or ``(2020)``
      appear in the output.

    Any other reference is treated as passing with a score of 1.0.

    Args:
        implementation: Evaluator reference string.
        output: Text to evaluate.
        context: Optional evaluator parameters; ``None`` means no parameters.

    Returns:
        A dict with ``passed``, ``score`` and ``failure_codes``, plus one
        evaluator-specific metric where applicable. A passing word-count or
        citation check always scores 1.0.
    """
    ctx = context or {}

    if implementation == "evaluators.word_count_max":
        max_words = int(ctx.get("max_words", 100))
        count = len(output.split())
        passed = count <= max_words
        # max(count, 1) guards the division when the output is empty.
        score = 1.0 if passed else min(1.0, max_words / max(count, 1))
        return {
            "passed": passed,
            "score": score,
            "word_count": count,
            "failure_codes": [] if passed else ["fail.false_fail"],
        }

    if implementation == "evaluators.test_pass_rate":
        rate = float(ctx.get("mock_pass_rate", 0.0))
        passed = rate >= float(ctx.get("threshold", 1.0))
        return {
            "passed": passed,
            "score": rate,
            "test_pass_rate": rate,
            "failure_codes": [] if passed else ["fail.false_fail"],
        }

    if implementation == "evaluators.citation_count_min":
        citations = len(re.findall(r"\[[\d]+\]|\(\d{4}\)", output))
        min_citations = int(ctx.get("min_citations", 3))
        passed = citations >= min_citations
        # A failing check implies min_citations > citations >= 0, so the
        # division below can never hit a zero (or negative) denominator.
        score = 1.0 if passed else min(1.0, citations / min_citations)
        return {
            "passed": passed,
            "score": score,
            "citation_count": citations,
            "failure_codes": [] if passed else ["fail.false_fail"],
        }

    return {"passed": True, "score": 1.0, "failure_codes": []}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""LLM rubric evaluator helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Protocol
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RubricLLM(Protocol):
    """Structural type for an LLM backend usable by the rubric evaluator."""

    def complete(self, prompt: str, role: str = "default") -> str:
        """Return the backend's text response to *prompt* for *role*."""
        ...
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run_rubric(
    llm: RubricLLM,
    output: str,
    objective: str,
    rubric: dict[str, Any],
    role: str = "evaluator",
) -> dict[str, Any]:
    """Ask an LLM to grade *output* against a rubric and parse its score.

    The prompt lists the rubric's dimension names (``quality`` when there are
    none). The score is the last whitespace-separated token of the reply
    that, after removing trailing periods, parses as a float in ``[0, 1]``.
    It is 0.0 when no token qualifies.

    Args:
        llm: Backend providing ``complete(prompt, role=...)``.
        output: Text being graded.
        objective: Objective quoted in the prompt.
        rubric: Mapping with optional ``pass_threshold`` (default 0.8) and
            ``dimensions`` (list of dicts with an optional ``name``).
        role: Role passed through to the backend.

    Returns:
        A dict with ``passed``, ``score``, ``feedback``, ``failure_codes`` and
        ``dimension_scores``; every dimension receives the same score.
    """
    dimensions = rubric.get("dimensions") or []
    threshold = float(rubric.get("pass_threshold", 0.8))
    names = [d.get("name", "quality") for d in dimensions]
    dim_names = ", ".join(names) or "quality"

    prompt = f"Evaluate ({dim_names}) against objective '{objective}':\n{output}"
    feedback = llm.complete(prompt, role=role)

    score = 0.0
    for token in feedback.split():
        try:
            val = float(token.rstrip("."))
        except ValueError:
            continue
        # Later in-range numbers override earlier ones.
        if 0.0 <= val <= 1.0:
            score = val

    passed = score >= threshold
    failure_codes = [] if passed else ["fail.self_grade"]
    return {
        "passed": passed,
        "score": score,
        "feedback": feedback,
        "failure_codes": failure_codes,
        "dimension_scores": dict.fromkeys(names, score),
    }
|