verifiers 0.1.10.dev0__tar.gz → 0.1.10.dev2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (149)
  1. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/.gitignore +4 -0
  2. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/PKG-INFO +10 -1
  3. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/pyproject.toml +11 -0
  4. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/conftest.py +145 -0
  5. verifiers-0.1.10.dev2/tests/test_browser_env.py +562 -0
  6. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_cli_agent_env.py +0 -16
  7. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_env_group.py +22 -40
  8. verifiers-0.1.10.dev2/tests/test_environment.py +831 -0
  9. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_environment_extra.py +52 -110
  10. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_envs.py +9 -0
  11. verifiers-0.1.10.dev2/tests/test_eval_cli.py +461 -0
  12. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_eval_utils.py +33 -69
  13. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_gym_env.py +21 -13
  14. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_imports.py +7 -4
  15. verifiers-0.1.10.dev2/tests/test_install_utils.py +161 -0
  16. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_math_rubric.py +9 -16
  17. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_multiturn_env.py +28 -82
  18. verifiers-0.1.10.dev2/tests/test_rlm_env.py +1442 -0
  19. verifiers-0.1.10.dev2/tests/test_rlm_env_sandbox.py +258 -0
  20. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_rubric.py +9 -19
  21. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_rubric_group.py +7 -15
  22. verifiers-0.1.10.dev2/tests/test_save_utils.py +196 -0
  23. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_singleturn_env.py +79 -162
  24. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_stateful_tool_env.py +8 -20
  25. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_tool_env.py +191 -26
  26. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_trajectory_processing.py +6 -8
  27. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/AGENTS.md +1 -1
  28. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/__init__.py +9 -1
  29. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/AGENTS.md +16 -2
  30. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/env_group.py +12 -11
  31. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/environment.py +347 -259
  32. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/README.md +9 -1
  33. verifiers-0.1.10.dev2/verifiers/envs/experimental/cli_agent_env.py +820 -0
  34. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/harbor_env.py +5 -1
  35. verifiers-0.1.10.dev2/verifiers/envs/experimental/rlm_env.py +4125 -0
  36. verifiers-0.1.10.dev2/verifiers/envs/integrations/README.md +131 -0
  37. verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/__init__.py +75 -0
  38. verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/browser_env.py +203 -0
  39. verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/__init__.py +26 -0
  40. verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/base.py +42 -0
  41. verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/cua_mode.py +1183 -0
  42. verifiers-0.1.10.dev2/verifiers/envs/integrations/browser_env/modes/dom_mode.py +271 -0
  43. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/multiturn_env.py +22 -16
  44. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/sandbox_env.py +3 -1
  45. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/tool_env.py +3 -2
  46. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/errors.py +7 -1
  47. verifiers-0.1.10.dev2/verifiers/gepa/__init__.py +12 -0
  48. verifiers-0.1.10.dev2/verifiers/gepa/adapter.py +204 -0
  49. verifiers-0.1.10.dev2/verifiers/gepa/config.py +42 -0
  50. verifiers-0.1.10.dev2/verifiers/gepa/display.py +493 -0
  51. verifiers-0.1.10.dev2/verifiers/gepa/gepa_utils.py +112 -0
  52. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/orchestrator.py +27 -14
  53. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/trainer.py +5 -0
  54. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/rubric.py +61 -76
  55. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/rubric_group.py +5 -5
  56. verifiers-0.1.10.dev2/verifiers/scripts/eval.py +437 -0
  57. verifiers-0.1.10.dev2/verifiers/scripts/gepa.py +386 -0
  58. verifiers-0.1.10.dev2/verifiers/scripts/install.py +76 -0
  59. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/tui.py +303 -7
  60. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/types.py +75 -29
  61. verifiers-0.1.10.dev2/verifiers/utils/async_utils.py +198 -0
  62. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/client_utils.py +2 -6
  63. verifiers-0.1.10.dev2/verifiers/utils/config_utils.py +31 -0
  64. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/data_utils.py +13 -6
  65. verifiers-0.1.10.dev2/verifiers/utils/display_utils.py +407 -0
  66. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/error_utils.py +3 -3
  67. verifiers-0.1.10.dev2/verifiers/utils/eval_display.py +699 -0
  68. verifiers-0.1.10.dev2/verifiers/utils/eval_utils.py +554 -0
  69. verifiers-0.1.10.dev2/verifiers/utils/install_utils.py +249 -0
  70. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/logging_utils.py +41 -68
  71. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/message_utils.py +52 -1
  72. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/path_utils.py +25 -1
  73. verifiers-0.1.10.dev2/verifiers/utils/sandbox_exec_utils.py +103 -0
  74. verifiers-0.1.10.dev2/verifiers/utils/save_utils.py +385 -0
  75. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/tool_utils.py +17 -0
  76. verifiers-0.1.10.dev2/verifiers/utils/worker_utils.py +40 -0
  77. verifiers-0.1.10.dev2/verifiers/workers/__init__.py +27 -0
  78. verifiers-0.1.10.dev2/verifiers/workers/client/env_client.py +96 -0
  79. verifiers-0.1.10.dev2/verifiers/workers/client/zmq_env_client.py +190 -0
  80. verifiers-0.1.10.dev2/verifiers/workers/server/env_server.py +135 -0
  81. verifiers-0.1.10.dev2/verifiers/workers/server/zmq_env_server.py +150 -0
  82. verifiers-0.1.10.dev2/verifiers/workers/types.py +74 -0
  83. verifiers-0.1.10.dev0/tests/test_environment.py +0 -552
  84. verifiers-0.1.10.dev0/tests/test_eval_cli.py +0 -130
  85. verifiers-0.1.10.dev0/tests/test_rlm_env.py +0 -1984
  86. verifiers-0.1.10.dev0/verifiers/envs/experimental/cli_agent_env.py +0 -655
  87. verifiers-0.1.10.dev0/verifiers/envs/experimental/rlm_env.py +0 -2694
  88. verifiers-0.1.10.dev0/verifiers/envs/integrations/README.md +0 -17
  89. verifiers-0.1.10.dev0/verifiers/scripts/eval.py +0 -362
  90. verifiers-0.1.10.dev0/verifiers/scripts/install.py +0 -70
  91. verifiers-0.1.10.dev0/verifiers/utils/async_utils.py +0 -87
  92. verifiers-0.1.10.dev0/verifiers/utils/eval_utils.py +0 -365
  93. verifiers-0.1.10.dev0/verifiers/utils/rlm_data_serialization_utils.py +0 -630
  94. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/LICENSE +0 -0
  95. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/README.md +0 -0
  96. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/AGENTS.md +0 -0
  97. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/README.md +0 -0
  98. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/__init__.py +0 -0
  99. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/mock_client_guide.md +0 -0
  100. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/mock_openai_client.py +0 -0
  101. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_decorator_ranks.py +0 -0
  102. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_environment_audio_modality.py +0 -0
  103. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_error_chain.py +0 -0
  104. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_logging.py +0 -0
  105. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_maybe_think_parser.py +0 -0
  106. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_message_utils_audio.py +0 -0
  107. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_parser.py +0 -0
  108. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_sandbox_env.py +0 -0
  109. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_think_parser.py +0 -0
  110. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_tool_utils.py +0 -0
  111. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/tests/test_xml_parser.py +0 -0
  112. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/decorators.py +0 -0
  113. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/__init__.py +0 -0
  114. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/__init__.py +0 -0
  115. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/gym_env.py +0 -0
  116. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/experimental/mcp_env.py +0 -0
  117. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/integrations/__init__.py +0 -0
  118. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/integrations/reasoninggym_env.py +0 -0
  119. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/integrations/textarena_env.py +0 -0
  120. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/python_env.py +0 -0
  121. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/singleturn_env.py +0 -0
  122. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/envs/stateful_tool_env.py +0 -0
  123. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/__init__.py +0 -0
  124. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/maybe_think_parser.py +0 -0
  125. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/parser.py +0 -0
  126. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/think_parser.py +0 -0
  127. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/parsers/xml_parser.py +0 -0
  128. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/README.md +0 -0
  129. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/inference/__init__.py +0 -0
  130. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/inference/client.py +0 -0
  131. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/inference/server.py +0 -0
  132. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/__init__.py +0 -0
  133. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/config.py +0 -0
  134. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rl/trainer/utils.py +0 -0
  135. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/__init__.py +0 -0
  136. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/judge_rubric.py +0 -0
  137. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/rubrics/math_rubric.py +0 -0
  138. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/__init__.py +0 -0
  139. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/init.py +0 -0
  140. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/prime_rl.py +0 -0
  141. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/rl.py +0 -0
  142. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/setup.py +0 -0
  143. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/scripts/train.py +0 -0
  144. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/__init__.py +0 -0
  145. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/env_utils.py +0 -0
  146. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/response_utils.py +0 -0
  147. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/thread_utils.py +0 -0
  148. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/token_utils.py +0 -0
  149. {verifiers-0.1.10.dev0 → verifiers-0.1.10.dev2}/verifiers/utils/tunnel_utils.py +0 -0
@@ -46,3 +46,7 @@ scratch/
46
46
  .vscode/
47
47
  *.swp
48
48
  .DS_Store
49
+
50
+ # CUA server (local dev artifacts)
51
+ assets/templates/browserbase/cua/node_modules/
52
+ assets/templates/browserbase/cua/pnpm-lock.yaml
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: verifiers
3
- Version: 0.1.10.dev0
3
+ Version: 0.1.10.dev2
4
4
  Summary: Verifiers: Environments for LLM Reinforcement Learning
5
5
  Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
6
6
  Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -23,14 +23,19 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
24
  Requires-Python: <3.14,>=3.10
25
25
  Requires-Dist: datasets>=3.0.0
26
+ Requires-Dist: gepa
26
27
  Requires-Dist: jinja2>=3.1.6
27
28
  Requires-Dist: math-verify>=0.8.0
28
29
  Requires-Dist: mcp>=1.14.1
30
+ Requires-Dist: msgpack>=1.1.2
29
31
  Requires-Dist: nest-asyncio>=1.6.0
32
+ Requires-Dist: numpy
30
33
  Requires-Dist: openai-agents>=0.0.7
31
34
  Requires-Dist: openai>=1.108.1
32
35
  Requires-Dist: prime-sandboxes>=0.2.9
36
+ Requires-Dist: prime-tunnel
33
37
  Requires-Dist: pydantic>=2.11.9
38
+ Requires-Dist: pyzmq>=27.1.0
34
39
  Requires-Dist: requests
35
40
  Requires-Dist: rich
36
41
  Requires-Dist: tenacity>=8.5.0
@@ -38,6 +43,10 @@ Requires-Dist: textual
38
43
  Requires-Dist: tomli; python_version < '3.11'
39
44
  Requires-Dist: typing-extensions; python_version < '3.12'
40
45
  Requires-Dist: wget>=3.2
46
+ Provides-Extra: browser
47
+ Requires-Dist: aiohttp>=3.9.0; extra == 'browser'
48
+ Requires-Dist: python-dotenv>=1.0.0; extra == 'browser'
49
+ Requires-Dist: stagehand>=3.0.0; extra == 'browser'
41
50
  Provides-Extra: rg
42
51
  Requires-Dist: reasoning-gym; extra == 'rg'
43
52
  Provides-Extra: rl
@@ -30,11 +30,13 @@ classifiers = [
30
30
  dependencies = [
31
31
  "datasets>=3.0.0",
32
32
  "jinja2>=3.1.6",
33
+ "numpy",
33
34
  "math-verify>=0.8.0",
34
35
  "mcp>=1.14.1",
35
36
  "nest-asyncio>=1.6.0", # for jupyter notebooks
36
37
  "openai>=1.108.1",
37
38
  "openai-agents>=0.0.7",
39
+ "prime-tunnel",
38
40
  "prime-sandboxes>=0.2.9",
39
41
  "pydantic>=2.11.9",
40
42
  "requests",
@@ -44,6 +46,9 @@ dependencies = [
44
46
  "tomli; python_version < '3.11'",
45
47
  "typing_extensions; python_version < '3.12'",
46
48
  "wget>=3.2",
49
+ "gepa",
50
+ "pyzmq>=27.1.0",
51
+ "msgpack>=1.1.2",
47
52
  ]
48
53
 
49
54
  [dependency-groups]
@@ -70,6 +75,11 @@ ta = [
70
75
  "textarena",
71
76
  "nltk",
72
77
  ]
78
+ browser = [
79
+ "stagehand>=3.0.0",
80
+ "aiohttp>=3.9.0",
81
+ "python-dotenv>=1.0.0",
82
+ ]
73
83
  rl = [
74
84
  "torch>=2.8.0,<2.9.0",
75
85
  "transformers>=4.56.2",
@@ -91,6 +101,7 @@ flash-attn = { FLASH_ATTENTION_SKIP_CUDA_BUILD = "TRUE" }
91
101
 
92
102
  [project.scripts]
93
103
  vf-eval = "verifiers.scripts.eval:main"
104
+ vf-gepa = "verifiers.scripts.gepa:main"
94
105
  vf-init = "verifiers.scripts.init:main"
95
106
  vf-install = "verifiers.scripts.install:main"
96
107
  vf-setup = "verifiers.scripts.setup:main"
@@ -1,9 +1,12 @@
1
1
  """Pytest configuration and fixtures for verifiers tests."""
2
2
 
3
+ from pathlib import Path
4
+ from typing import Callable
3
5
  from unittest.mock import AsyncMock, MagicMock
4
6
 
5
7
  import pytest
6
8
  from datasets import Dataset
9
+ from openai.types.chat import ChatCompletionToolParam
7
10
 
8
11
  from verifiers import (
9
12
  MaybeThinkParser,
@@ -20,6 +23,16 @@ from verifiers import (
20
23
  XMLParser,
21
24
  stop,
22
25
  )
26
+ from verifiers.types import (
27
+ GenerateMetadata,
28
+ Info,
29
+ RolloutInput,
30
+ RolloutOutput,
31
+ RolloutTiming,
32
+ SamplingArgs,
33
+ TrajectoryStep,
34
+ )
35
+ from verifiers.utils.save_utils import state_to_output
23
36
 
24
37
 
25
38
  @pytest.fixture
@@ -408,3 +421,135 @@ def mock_stateful_tool_env(mock_openai_client, sample_chat_dataset):
408
421
  parser=Parser(),
409
422
  rubric=Rubric(),
410
423
  )
424
+
425
+
426
+ DEFAULT_PROMPT: Messages = [{"role": "user", "content": "What is 2+2?"}]
427
+ DEFAULT_COMPLETION: Messages = [{"role": "assistant", "content": "4"}]
428
+
429
+
430
+ @pytest.fixture
431
+ def make_input() -> Callable[..., RolloutInput]:
432
+ """Fixture to make RolloutInput objects for testing."""
433
+
434
+ def _make_input(
435
+ example_id: int = 0,
436
+ task: str = "default",
437
+ prompt: Messages = DEFAULT_PROMPT,
438
+ info: Info = {},
439
+ answer: str = "4",
440
+ ) -> RolloutInput:
441
+ return RolloutInput(
442
+ example_id=example_id, task=task, prompt=prompt, answer=answer, info=info
443
+ )
444
+
445
+ return _make_input
446
+
447
+
448
+ @pytest.fixture
449
+ def make_state() -> Callable[..., State]:
450
+ """Fixture to make State objects for testing."""
451
+
452
+ def _make_state(
453
+ example_id: int = 0,
454
+ task: str = "default",
455
+ prompt: Messages = DEFAULT_PROMPT,
456
+ answer: str = "4",
457
+ info: Info = {},
458
+ completion: Messages = DEFAULT_COMPLETION,
459
+ reward: float = 0.0,
460
+ metrics: dict[str, float] = {"accuracy": 0.0},
461
+ is_completed: bool = True,
462
+ is_truncated: bool = False,
463
+ stop_condition: str | None = "max_turns_reached",
464
+ oai_tools: list[ChatCompletionToolParam] | None = None,
465
+ trajectory: list[TrajectoryStep] = [],
466
+ timing=RolloutTiming(
467
+ generation_ms=0.0,
468
+ scoring_ms=0.0,
469
+ total_ms=0.0,
470
+ ),
471
+ foo: str = "bar", # custom field
472
+ **kwargs,
473
+ ) -> State:
474
+ return State(
475
+ example_id=example_id,
476
+ task=task,
477
+ prompt=prompt,
478
+ answer=answer,
479
+ info=info,
480
+ completion=completion,
481
+ reward=reward,
482
+ metrics=metrics,
483
+ is_completed=is_completed,
484
+ is_truncated=is_truncated,
485
+ stop_condition=stop_condition,
486
+ oai_tools=oai_tools,
487
+ trajectory=trajectory,
488
+ timing=timing,
489
+ error=None,
490
+ foo=foo,
491
+ **kwargs,
492
+ )
493
+
494
+ return _make_state
495
+
496
+
497
+ @pytest.fixture
498
+ def make_output(make_state) -> Callable[..., RolloutOutput]:
499
+ """Fixture to make RolloutOutput objects for testing.
500
+
501
+ This creates a State first, then converts it to a RolloutOutput using
502
+ state_to_output(). This ensures the output matches the serialized format
503
+ used in GenerateOutputs.
504
+ """
505
+
506
+ def _make_output(
507
+ state_columns: list[str] = ["foo"],
508
+ **kwargs,
509
+ ) -> RolloutOutput:
510
+ state = make_state(**kwargs)
511
+ return state_to_output(state, state_columns)
512
+
513
+ return _make_output
514
+
515
+
516
+ @pytest.fixture
517
+ def make_metadata() -> Callable[..., GenerateMetadata]:
518
+ """Fixture to make GenerateMetadata objects for testing."""
519
+
520
+ def _make_metadata(
521
+ env_id: str = "test-env",
522
+ env_args: dict = {},
523
+ model: str = "test-model",
524
+ base_url: str = "http://localhost:8000/v1",
525
+ num_examples: int = 1,
526
+ rollouts_per_example: int = 1,
527
+ sampling_args: SamplingArgs = {},
528
+ date: str = "1970-01-01",
529
+ time_ms: float = 0.0,
530
+ avg_reward: float = 0.0,
531
+ avg_metrics: dict[str, float] = {},
532
+ usage: dict[str, float] | None = None,
533
+ state_columns: list[str] = ["foo"],
534
+ path_to_save: Path = Path("test.jsonl"),
535
+ tools: list[ChatCompletionToolParam] | None = None,
536
+ ) -> GenerateMetadata:
537
+ return GenerateMetadata(
538
+ env_id=env_id,
539
+ env_args=env_args,
540
+ model=model,
541
+ base_url=base_url,
542
+ num_examples=num_examples,
543
+ rollouts_per_example=rollouts_per_example,
544
+ sampling_args=sampling_args,
545
+ date=date,
546
+ time_ms=time_ms,
547
+ avg_reward=avg_reward,
548
+ avg_metrics=avg_metrics,
549
+ usage=usage,
550
+ state_columns=state_columns,
551
+ path_to_save=path_to_save,
552
+ tools=tools,
553
+ )
554
+
555
+ return _make_metadata