verifiers 0.1.10.dev0__tar.gz → 0.1.10.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/.gitignore +4 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/PKG-INFO +10 -1
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/pyproject.toml +11 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/conftest.py +145 -0
- verifiers-0.1.10.dev2/tests/test_browser_env.py +562 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_cli_agent_env.py +0 -16
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_env_group.py +22 -40
- verifiers-0.1.10.dev2/tests/test_environment.py +831 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_environment_extra.py +52 -110
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_envs.py +9 -0
- verifiers-0.1.10.dev2/tests/test_eval_cli.py +461 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_eval_utils.py +33 -69
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_gym_env.py +21 -13
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_imports.py +7 -4
- verifiers-0.1.10.dev2/tests/test_install_utils.py +161 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_math_rubric.py +9 -16
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_multiturn_env.py +28 -82
- verifiers-0.1.10.dev2/tests/test_rlm_env.py +1442 -0
- verifiers-0.1.10.dev2/tests/test_rlm_env_sandbox.py +258 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_rubric.py +9 -19
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_rubric_group.py +7 -15
- verifiers-0.1.10.dev2/tests/test_save_utils.py +196 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_singleturn_env.py +79 -162
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_stateful_tool_env.py +8 -20
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_tool_env.py +191 -26
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_trajectory_processing.py +6 -8
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/AGENTS.md +1 -1
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/__init__.py +9 -1
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/AGENTS.md +16 -2
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/env_group.py +12 -11
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/environment.py +347 -259
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/README.md +9 -1
- verifiers-0.1.10.dev2/verifiers/envs/experimental/cli_agent_env.py +820 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/harbor_env.py +5 -1
- verifiers-0.1.10.dev2/verifiers/envs/experimental/rlm_env.py +4125 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/README.md +131 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/__init__.py +75 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/browser_env.py +203 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/__init__.py +26 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/base.py +42 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/cua_mode.py +1183 -0
- verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/dom_mode.py +271 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/multiturn_env.py +22 -16
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/sandbox_env.py +3 -1
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/tool_env.py +3 -2
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/errors.py +7 -1
- verifiers-0.1.10.dev2/verifiers/gepa/__init__.py +12 -0
- verifiers-0.1.10.dev2/verifiers/gepa/adapter.py +204 -0
- verifiers-0.1.10.dev2/verifiers/gepa/config.py +42 -0
- verifiers-0.1.10.dev2/verifiers/gepa/display.py +493 -0
- verifiers-0.1.10.dev2/verifiers/gepa/gepa_utils.py +112 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/orchestrator.py +27 -14
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/trainer.py +5 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/rubric.py +61 -76
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/rubric_group.py +5 -5
- verifiers-0.1.10.dev2/verifiers/scripts/eval.py +437 -0
- verifiers-0.1.10.dev2/verifiers/scripts/gepa.py +386 -0
- verifiers-0.1.10.dev2/verifiers/scripts/install.py +76 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/tui.py +303 -7
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/types.py +75 -29
- verifiers-0.1.10.dev2/verifiers/utils/async_utils.py +198 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/client_utils.py +2 -6
- verifiers-0.1.10.dev2/verifiers/utils/config_utils.py +31 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/data_utils.py +13 -6
- verifiers-0.1.10.dev2/verifiers/utils/display_utils.py +407 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/error_utils.py +3 -3
- verifiers-0.1.10.dev2/verifiers/utils/eval_display.py +699 -0
- verifiers-0.1.10.dev2/verifiers/utils/eval_utils.py +554 -0
- verifiers-0.1.10.dev2/verifiers/utils/install_utils.py +249 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/logging_utils.py +41 -68
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/message_utils.py +52 -1
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/path_utils.py +25 -1
- verifiers-0.1.10.dev2/verifiers/utils/sandbox_exec_utils.py +103 -0
- verifiers-0.1.10.dev2/verifiers/utils/save_utils.py +385 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/tool_utils.py +17 -0
- verifiers-0.1.10.dev2/verifiers/utils/worker_utils.py +40 -0
- verifiers-0.1.10.dev2/verifiers/workers/__init__.py +27 -0
- verifiers-0.1.10.dev2/verifiers/workers/client/env_client.py +96 -0
- verifiers-0.1.10.dev2/verifiers/workers/client/zmq_env_client.py +190 -0
- verifiers-0.1.10.dev2/verifiers/workers/server/env_server.py +135 -0
- verifiers-0.1.10.dev2/verifiers/workers/server/zmq_env_server.py +150 -0
- verifiers-0.1.10.dev2/verifiers/workers/types.py +74 -0
- verifiers-0.1.10.dev0/tests/test_environment.py +0 -552
- verifiers-0.1.10.dev0/tests/test_eval_cli.py +0 -130
- verifiers-0.1.10.dev0/tests/test_rlm_env.py +0 -1984
- verifiers-0.1.10.dev0/verifiers/envs/experimental/cli_agent_env.py +0 -655
- verifiers-0.1.10.dev0/verifiers/envs/experimental/rlm_env.py +0 -2694
- verifiers-0.1.10.dev0/verifiers/envs/integrations/README.md +0 -17
- verifiers-0.1.10.dev0/verifiers/scripts/eval.py +0 -362
- verifiers-0.1.10.dev0/verifiers/scripts/install.py +0 -70
- verifiers-0.1.10.dev0/verifiers/utils/async_utils.py +0 -87
- verifiers-0.1.10.dev0/verifiers/utils/eval_utils.py +0 -365
- verifiers-0.1.10.dev0/verifiers/utils/rlm_data_serialization_utils.py +0 -630
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/LICENSE +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/README.md +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/AGENTS.md +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/README.md +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/mock_client_guide.md +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/mock_openai_client.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_decorator_ranks.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_environment_audio_modality.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_error_chain.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_logging.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_maybe_think_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_message_utils_audio.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_sandbox_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_think_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_tool_utils.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_xml_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/decorators.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/gym_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/mcp_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/integrations/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/integrations/textarena_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/python_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/singleturn_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/stateful_tool_env.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/maybe_think_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/think_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/xml_parser.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/README.md +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/inference/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/inference/client.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/inference/server.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/config.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/utils.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/judge_rubric.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/math_rubric.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/init.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/prime_rl.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/rl.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/setup.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/train.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/__init__.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/env_utils.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/response_utils.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/thread_utils.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/token_utils.py +0 -0
- {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/tunnel_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: verifiers
|
|
3
|
-
Version: 0.1.10.dev0
|
|
3
|
+
Version: 0.1.10.dev2
|
|
4
4
|
Summary: Verifiers: Environments for LLM Reinforcement Learning
|
|
5
5
|
Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
|
|
6
6
|
Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
|
|
@@ -23,14 +23,19 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
23
23
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
24
|
Requires-Python: <3.14,>=3.10
|
|
25
25
|
Requires-Dist: datasets>=3.0.0
|
|
26
|
+
Requires-Dist: gepa
|
|
26
27
|
Requires-Dist: jinja2>=3.1.6
|
|
27
28
|
Requires-Dist: math-verify>=0.8.0
|
|
28
29
|
Requires-Dist: mcp>=1.14.1
|
|
30
|
+
Requires-Dist: msgpack>=1.1.2
|
|
29
31
|
Requires-Dist: nest-asyncio>=1.6.0
|
|
32
|
+
Requires-Dist: numpy
|
|
30
33
|
Requires-Dist: openai-agents>=0.0.7
|
|
31
34
|
Requires-Dist: openai>=1.108.1
|
|
32
35
|
Requires-Dist: prime-sandboxes>=0.2.9
|
|
36
|
+
Requires-Dist: prime-tunnel
|
|
33
37
|
Requires-Dist: pydantic>=2.11.9
|
|
38
|
+
Requires-Dist: pyzmq>=27.1.0
|
|
34
39
|
Requires-Dist: requests
|
|
35
40
|
Requires-Dist: rich
|
|
36
41
|
Requires-Dist: tenacity>=8.5.0
|
|
@@ -38,6 +43,10 @@ Requires-Dist: textual
|
|
|
38
43
|
Requires-Dist: tomli; python_version < '3.11'
|
|
39
44
|
Requires-Dist: typing-extensions; python_version < '3.12'
|
|
40
45
|
Requires-Dist: wget>=3.2
|
|
46
|
+
Provides-Extra: browser
|
|
47
|
+
Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
|
|
48
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
|
|
49
|
+
Requires-Dist: stagehand>=3.0.0; extra == 'browser'
|
|
41
50
|
Provides-Extra: rg
|
|
42
51
|
Requires-Dist: reasoning-gym; extra == 'rg'
|
|
43
52
|
Provides-Extra: rl
|
|
@@ -30,11 +30,13 @@ classifiers = [
|
|
|
30
30
|
dependencies = [
|
|
31
31
|
"datasets>=3.0.0",
|
|
32
32
|
"jinja2>=3.1.6",
|
|
33
|
+
"numpy",
|
|
33
34
|
"math-verify>=0.8.0",
|
|
34
35
|
"mcp>=1.14.1",
|
|
35
36
|
"nest-asyncio>=1.6.0", # for jupyter notebooks
|
|
36
37
|
"openai>=1.108.1",
|
|
37
38
|
"openai-agents>=0.0.7",
|
|
39
|
+
"prime-tunnel",
|
|
38
40
|
"prime-sandboxes>=0.2.9",
|
|
39
41
|
"pydantic>=2.11.9",
|
|
40
42
|
"requests",
|
|
@@ -44,6 +46,9 @@ dependencies = [
|
|
|
44
46
|
"tomli; python_version < '3.11'",
|
|
45
47
|
"typing_extensions; python_version < '3.12'",
|
|
46
48
|
"wget>=3.2",
|
|
49
|
+
"gepa",
|
|
50
|
+
"pyzmq>=27.1.0",
|
|
51
|
+
"msgpack>=1.1.2",
|
|
47
52
|
]
|
|
48
53
|
|
|
49
54
|
[dependency-groups]
|
|
@@ -70,6 +75,11 @@ ta = [
|
|
|
70
75
|
"textarena",
|
|
71
76
|
"nltk",
|
|
72
77
|
]
|
|
78
|
+
browser = [
|
|
79
|
+
"stagehand>=3.0.0",
|
|
80
|
+
"aiohttp>=3.9.0",
|
|
81
|
+
"python-dotenv>=1.0.0",
|
|
82
|
+
]
|
|
73
83
|
rl = [
|
|
74
84
|
"torch>=2.8.0,<2.9.0",
|
|
75
85
|
"transformers>=4.56.2",
|
|
@@ -91,6 +101,7 @@ flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
|
|
|
91
101
|
|
|
92
102
|
[project.scripts]
|
|
93
103
|
vf-eval = "verifiers.scripts.eval:main"
|
|
104
|
+
vf-gepa = "verifiers.scripts.gepa:main"
|
|
94
105
|
vf-init = "verifiers.scripts.init:main"
|
|
95
106
|
vf-install = "verifiers.scripts.install:main"
|
|
96
107
|
vf-setup = "verifiers.scripts.setup:main"
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
"""Pytest configuration and fixtures for verifiers tests."""
|
|
2
2
|
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable
|
|
3
5
|
from unittest.mock import AsyncMock, MagicMock
|
|
4
6
|
|
|
5
7
|
import pytest
|
|
6
8
|
from datasets import Dataset
|
|
9
|
+
from openai.types.chat import ChatCompletionToolParam
|
|
7
10
|
|
|
8
11
|
from verifiers import (
|
|
9
12
|
MaybeThinkParser,
|
|
@@ -20,6 +23,16 @@ from verifiers import (
|
|
|
20
23
|
XMLParser,
|
|
21
24
|
stop,
|
|
22
25
|
)
|
|
26
|
+
from verifiers.types import (
|
|
27
|
+
GenerateMetadata,
|
|
28
|
+
Info,
|
|
29
|
+
RolloutInput,
|
|
30
|
+
RolloutOutput,
|
|
31
|
+
RolloutTiming,
|
|
32
|
+
SamplingArgs,
|
|
33
|
+
TrajectoryStep,
|
|
34
|
+
)
|
|
35
|
+
from verifiers.utils.save_utils import state_to_output
|
|
23
36
|
|
|
24
37
|
|
|
25
38
|
@pytest.fixture
|
|
@@ -408,3 +421,135 @@ def mock_stateful_tool_env(mock_openai_client, sample_chat_dataset):
|
|
|
408
421
|
parser=Parser(),
|
|
409
422
|
rubric=Rubric(),
|
|
410
423
|
)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
DEFAULT_PROMPT: Messages = [{"role": "user", "content": "What is 2+2?"}]
|
|
427
|
+
DEFAULT_COMPLETION: Messages = [{"role": "assistant", "content": "4"}]
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
@pytest.fixture
|
|
431
|
+
def make_input() -> Callable[..., RolloutInput]:
|
|
432
|
+
"""Fixture to make RolloutInput objects for testing."""
|
|
433
|
+
|
|
434
|
+
def _make_input(
|
|
435
|
+
example_id: int = 0,
|
|
436
|
+
task: str = "default",
|
|
437
|
+
prompt: Messages = DEFAULT_PROMPT,
|
|
438
|
+
info: Info = {},
|
|
439
|
+
answer: str = "4",
|
|
440
|
+
) -> RolloutInput:
|
|
441
|
+
return RolloutInput(
|
|
442
|
+
example_id=example_id, task=task, prompt=prompt, answer=answer, info=info
|
|
443
|
+
)
|
|
444
|
+
|
|
445
|
+
return _make_input
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
@pytest.fixture
|
|
449
|
+
def make_state() -> Callable[..., State]:
|
|
450
|
+
"""Fixture to make State objects for testing."""
|
|
451
|
+
|
|
452
|
+
def _make_state(
|
|
453
|
+
example_id: int = 0,
|
|
454
|
+
task: str = "default",
|
|
455
|
+
prompt: Messages = DEFAULT_PROMPT,
|
|
456
|
+
answer: str = "4",
|
|
457
|
+
info: Info = {},
|
|
458
|
+
completion: Messages = DEFAULT_COMPLETION,
|
|
459
|
+
reward: float = 0.0,
|
|
460
|
+
metrics: dict[str, float] = {"accuracy": 0.0},
|
|
461
|
+
is_completed: bool = True,
|
|
462
|
+
is_truncated: bool = False,
|
|
463
|
+
stop_condition: str | None = "max_turns_reached",
|
|
464
|
+
oai_tools: list[ChatCompletionToolParam] | None = None,
|
|
465
|
+
trajectory: list[TrajectoryStep] = [],
|
|
466
|
+
timing=RolloutTiming(
|
|
467
|
+
generation_ms=0.0,
|
|
468
|
+
scoring_ms=0.0,
|
|
469
|
+
total_ms=0.0,
|
|
470
|
+
),
|
|
471
|
+
foo: str = "bar", # custom field
|
|
472
|
+
**kwargs,
|
|
473
|
+
) -> State:
|
|
474
|
+
return State(
|
|
475
|
+
example_id=example_id,
|
|
476
|
+
task=task,
|
|
477
|
+
prompt=prompt,
|
|
478
|
+
answer=answer,
|
|
479
|
+
info=info,
|
|
480
|
+
completion=completion,
|
|
481
|
+
reward=reward,
|
|
482
|
+
metrics=metrics,
|
|
483
|
+
is_completed=is_completed,
|
|
484
|
+
is_truncated=is_truncated,
|
|
485
|
+
stop_condition=stop_condition,
|
|
486
|
+
oai_tools=oai_tools,
|
|
487
|
+
trajectory=trajectory,
|
|
488
|
+
timing=timing,
|
|
489
|
+
error=None,
|
|
490
|
+
foo=foo,
|
|
491
|
+
**kwargs,
|
|
492
|
+
)
|
|
493
|
+
|
|
494
|
+
return _make_state
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
@pytest.fixture
|
|
498
|
+
def make_output(make_state) -> Callable[..., RolloutOutput]:
|
|
499
|
+
"""Fixture to make RolloutOutput objects for testing.
|
|
500
|
+
|
|
501
|
+
This creates a State first, then converts it to a RolloutOutput using
|
|
502
|
+
state_to_output(). This ensures the output matches the serialized format
|
|
503
|
+
used in GenerateOutputs.
|
|
504
|
+
"""
|
|
505
|
+
|
|
506
|
+
def _make_output(
|
|
507
|
+
state_columns: list[str] = ["foo"],
|
|
508
|
+
**kwargs,
|
|
509
|
+
) -> RolloutOutput:
|
|
510
|
+
state = make_state(**kwargs)
|
|
511
|
+
return state_to_output(state, state_columns)
|
|
512
|
+
|
|
513
|
+
return _make_output
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
@pytest.fixture
|
|
517
|
+
def make_metadata() -> Callable[..., GenerateMetadata]:
|
|
518
|
+
"""Fixture to make GenerateMetadata objects for testing."""
|
|
519
|
+
|
|
520
|
+
def _make_metadata(
|
|
521
|
+
env_id: str = "test-env",
|
|
522
|
+
env_args: dict = {},
|
|
523
|
+
model: str = "test-model",
|
|
524
|
+
base_url: str = "http://localhost:8000/v1",
|
|
525
|
+
num_examples: int = 1,
|
|
526
|
+
rollouts_per_example: int = 1,
|
|
527
|
+
sampling_args: SamplingArgs = {},
|
|
528
|
+
date: str = "1970-01-01",
|
|
529
|
+
time_ms: float = 0.0,
|
|
530
|
+
avg_reward: float = 0.0,
|
|
531
|
+
avg_metrics: dict[str, float] = {},
|
|
532
|
+
usage: dict[str, float] | None = None,
|
|
533
|
+
state_columns: list[str] = ["foo"],
|
|
534
|
+
path_to_save: Path = Path("test.jsonl"),
|
|
535
|
+
tools: list[ChatCompletionToolParam] | None = None,
|
|
536
|
+
) -> GenerateMetadata:
|
|
537
|
+
return GenerateMetadata(
|
|
538
|
+
env_id=env_id,
|
|
539
|
+
env_args=env_args,
|
|
540
|
+
model=model,
|
|
541
|
+
base_url=base_url,
|
|
542
|
+
num_examples=num_examples,
|
|
543
|
+
rollouts_per_example=rollouts_per_example,
|
|
544
|
+
sampling_args=sampling_args,
|
|
545
|
+
date=date,
|
|
546
|
+
time_ms=time_ms,
|
|
547
|
+
avg_reward=avg_reward,
|
|
548
|
+
avg_metrics=avg_metrics,
|
|
549
|
+
usage=usage,
|
|
550
|
+
state_columns=state_columns,
|
|
551
|
+
path_to_save=path_to_save,
|
|
552
|
+
tools=tools,
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
return _make_metadata
|