PyPI - verifiers - Versions diffs - 0.1.15.dev170__tar.gz → 0.1.15.dev172__tar.gz - Mend

verifiers 0.1.15.dev170__tar.gz → 0.1.15.dev172__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (353) hide show

{verifiers-0.1.15.dev170 → verifiers-0.1.15.dev172}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.15.dev170
+Version: 0.1.15.dev172
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers

{verifiers-0.1.15.dev170 → verifiers-0.1.15.dev172}/tests/test_environment_extra.py RENAMED Viewed

@@ -557,7 +557,12 @@ async def test_generate_resume_closes_local_endpoint_clients(
     results_path = tmp_path / "resume-complete"
     results_path.mkdir()
     (results_path / "results.jsonl").write_text(
-        json.dumps(make_output(example_id=0)) + "\n",
+        (
+            json.dumps(make_output(example_id=0, reward=0.0))
+            + "\n"
+            + json.dumps(make_output(example_id=0, reward=1.0))
+            + "\n"
+        ),
         encoding="utf-8",
     )
     (results_path / "metadata.json").write_text(
@@ -583,9 +588,12 @@ async def test_generate_resume_closes_local_endpoint_clients(
         ),
         model="test-model",
         results_path=results_path,
+        save_results=True,
     )
     assert len(outputs["outputs"]) == 1
+    saved_metadata = json.loads((results_path / "metadata.json").read_text())
+    assert saved_metadata["avg_reward"] == 0.0
     assert len(created_clients) == 2
     assert all(client.closed for client in created_clients)
@@ -669,6 +677,21 @@ async def test_generate_resume_raises_on_metadata_mismatch(
 ):
     env = make_dummy_env(mock_client)
+    invalid_results_path = tmp_path / "missing-metadata"
+    invalid_results_path.mkdir()
+    (invalid_results_path / "results.jsonl").write_text(
+        json.dumps({"example_id": 99, "label": "existing"}) + "\n",
+        encoding="utf-8",
+    )
+    with pytest.raises(ValueError, match="already exists without valid metadata"):
+        await env.generate(
+            inputs=[make_input(example_id=0)],
+            client=mock_client,
+            model="test-model",
+            results_path=invalid_results_path,
+            save_results=True,
+        )
     results_path = tmp_path / "resume"
     results_path.mkdir()
     (results_path / "results.jsonl").write_text("", encoding="utf-8")

{verifiers-0.1.15.dev170 → verifiers-0.1.15.dev172}/tests/test_path_utils.py RENAMED Viewed

@@ -30,7 +30,8 @@ def test_find_latest_incomplete_eval_results_path_picks_newest_matching(
     (old_run / "results.jsonl").write_text('{"example_id":0}\n', encoding="utf-8")
     (new_run / "results.jsonl").write_text(
-        '{"example_id":0}\n{"example_id":1}\n', encoding="utf-8"
+        '{"example_id":0}\n{"example_id":1}\n{"example_id":2',
+        encoding="utf-8",
     )
     (complete_run / "results.jsonl").write_text(
         '{"example_id":0}\n{"example_id":1}\n{"example_id":2}\n{"example_id":3}\n',

{verifiers-0.1.15.dev170 → verifiers-0.1.15.dev172}/tests/test_save_utils.py RENAMED Viewed

@@ -33,7 +33,6 @@ from verifiers.utils.save_utils import (
     save_new_outputs,
     save_metadata,
     states_to_outputs,
-    truncate_malformed_trailing_line,
     validate_resume_metadata,
 )
 from verifiers.utils.usage_utils import StateUsageTracker, response_usage_tokens
@@ -475,7 +474,7 @@ class TestSavingResults:
 class TestLoadOutputs:
-    def test_ignores_malformed_trailing_line(self, tmp_path: Path):
+    def test_ignores_malformed_trailing_line(self, tmp_path: Path, monkeypatch):
         results_path = tmp_path / "results"
         results_path.mkdir()
         outputs_path = results_path / "results.jsonl"
@@ -489,50 +488,68 @@ class TestLoadOutputs:
         outputs_path.write_text(
             "\n".join(lines + [partial_trailing_line]) + "\n", encoding="utf-8"
         )
+        warnings = []
+        monkeypatch.setattr(
+            "verifiers.utils.save_utils.logger.warning",
+            lambda *args, **kwargs: warnings.append(args),
+        )
         outputs = load_outputs(results_path)
-        assert len(outputs) == 2
-        assert outputs[0]["example_id"] == 0
-        assert outputs[1]["example_id"] == 1
+        assert [output["example_id"] for output in outputs] == [0, 1]
+        assert warnings
-    def test_raises_for_malformed_non_trailing_line(self, tmp_path: Path):
+    def test_ignores_malformed_non_trailing_line(self, tmp_path: Path, monkeypatch):
         results_path = tmp_path / "results"
         results_path.mkdir()
         outputs_path = results_path / "results.jsonl"
         malformed_non_trailing_line = '{"example_id": 0, "label": "broken"'
+        missing_example_id_line = json.dumps({"label": "missing"})
         valid_line = json.dumps({"example_id": 1, "label": "row-1"})
         outputs_path.write_text(
-            "\n".join([malformed_non_trailing_line, valid_line]) + "\n",
+            "\n".join(
+                [malformed_non_trailing_line, missing_example_id_line, valid_line]
+            )
+            + "\n",
             encoding="utf-8",
         )
+        warnings = []
+        monkeypatch.setattr(
+            "verifiers.utils.save_utils.logger.warning",
+            lambda *args, **kwargs: warnings.append(args),
+        )
-        with pytest.raises(json.JSONDecodeError):
-            load_outputs(results_path)
+        outputs = load_outputs(results_path)
+        assert [output["example_id"] for output in outputs] == [1]
+        assert warnings
 class TestSaveNewOutputs:
-    def test_truncates_malformed_trailing_line_before_append(self, tmp_path: Path):
+    def test_appends_after_malformed_rows_without_rewriting(self, tmp_path: Path):
         results_path = tmp_path / "results"
         results_path.mkdir()
         outputs_path = results_path / "results.jsonl"
-        existing_outputs = [
-            {"example_id": 0, "label": "row-0"},
-            {"example_id": 1, "label": "row-1"},
-        ]
+        malformed_middle_line = '{"example_id": 99, "label": "broken"'
         malformed_trailing_line = '{"example_id": 2, "label": "row-2"'
-        lines = [json.dumps(output) for output in existing_outputs]
         outputs_path.write_text(
-            "\n".join(lines + [malformed_trailing_line]), encoding="utf-8"
+            "\n".join(
+                [
+                    json.dumps({"example_id": 0, "label": "row-0"}),
+                    malformed_middle_line,
+                    json.dumps({"example_id": 1, "label": "row-1"}),
+                    malformed_trailing_line,
+                ]
+            ),
+            encoding="utf-8",
         )
-        # Caller drops the partial trailing row before appending so the new
-        # row lands on a valid JSONL boundary.
-        truncate_malformed_trailing_line(outputs_path)
+        circular_output = {"example_id": 4}
+        circular_output["self"] = circular_output
         save_new_outputs(
-            [{"example_id": 3, "label": "row-3"}],
+            [circular_output, {"example_id": 3, "label": "row-3"}],
             results_path,
         )
@@ -541,14 +558,12 @@ class TestSaveNewOutputs:
             for line in outputs_path.read_text(encoding="utf-8").splitlines()
             if line
         ]
-        parsed_outputs = [json.loads(line) for line in persisted_lines]
-        assert [output["example_id"] for output in parsed_outputs] == [0, 1, 3]
-        assert [output["example_id"] for output in load_outputs(results_path)] == [
-            0,
-            1,
-            3,
-        ]
+        assert persisted_lines[1] == malformed_middle_line
+        assert persisted_lines[3] == malformed_trailing_line
+        assert [
+            json.loads(persisted_lines[idx])["example_id"] for idx in [0, 2, 4]
+        ] == [0, 1, 3]
 class TestResumeMetadataValidation:

{verifiers-0.1.15.dev170 → verifiers-0.1.15.dev172}/verifiers/envs/environment.py RENAMED Viewed

@@ -82,7 +82,6 @@ from verifiers.utils.save_utils import (
     save_new_outputs,
     save_outputs,
     state_to_output,
-    truncate_malformed_trailing_line,
     validate_resume_metadata,
 )
 from verifiers.utils.usage_utils import StateUsageTracker
@@ -1003,9 +1002,21 @@ class Environment(ABC):
                 )
                 on_log(f"Resuming evaluation from {results_path}")
                 outputs = load_outputs(results_path)
-                # Drop any partial trailing row left by a crashed prior write
-                # so subsequent appends start from a valid JSONL boundary.
-                truncate_malformed_trailing_line(results_path / "results.jsonl")
+                rollout_counts_by_example_id: dict[object, int] = {}
+                capped_outputs: list[RolloutOutput] = []
+                for output in outputs:
+                    example_id = output["example_id"]
+                    rollout_count = rollout_counts_by_example_id.get(example_id, 0)
+                    if rollout_count >= rollouts_per_example:
+                        continue
+                    rollout_counts_by_example_id[example_id] = rollout_count + 1
+                    capped_outputs.append(output)
+                if len(capped_outputs) != len(outputs):
+                    on_log(
+                        f"Ignoring {len(outputs) - len(capped_outputs)} saved duplicate rollout(s) "
+                        "beyond rollouts_per_example"
+                    )
+                outputs = capped_outputs
                 builder.add_outputs(outputs)
                 filtered_inputs = filter_inputs(
                     raw_inputs, outputs, rollouts_per_example
@@ -1014,7 +1025,12 @@ class Environment(ABC):
                     on_log(
                         "No remaining rollouts to evaluate, returning completed outputs"
                     )
-                    return builder.build(sort_by_example_id=True)
+                    results = builder.build(sort_by_example_id=True)
+                    if save_results:
+                        await asyncio.to_thread(
+                            save_metadata, results["metadata"], builder.results_path
+                        )
+                    return results
                 on_log(
                     f"Found {len(outputs)} completed rollout(s), {len(filtered_inputs)} remaining rollout(s)"
                 )
@@ -1023,6 +1039,21 @@ class Environment(ABC):
             if save_results:
                 on_log(f"Saving results to {builder.results_path}")
+                if results_path is None or not is_valid_eval_results_path(results_path):
+                    outputs_path = builder.results_path / "results.jsonl"
+                    if (
+                        results_path is not None
+                        and outputs_path.is_file()
+                        and outputs_path.stat().st_size > 0
+                    ):
+                        raise ValueError(
+                            f"Cannot save to invalid results path {builder.results_path}: "
+                            "results.jsonl already exists without valid metadata"
+                        )
+                    await asyncio.to_thread(save_outputs, [], builder.results_path, "a")
+                    await asyncio.to_thread(
+                        save_metadata, builder.build_metadata(), builder.results_path
+                    )
             tasks: dict[asyncio.Task, int] = {}
             try:
@@ -1104,9 +1135,6 @@ class Environment(ABC):
             # save if requested
             if save_results:
-                await asyncio.to_thread(
-                    save_outputs, results["outputs"], builder.results_path
-                )
                 await asyncio.to_thread(
                     save_metadata, results["metadata"], builder.results_path
                 )

{verifiers-0.1.15.dev170 → verifiers-0.1.15.dev172}/verifiers/envs/experimental/composable/tasksets/search/README.md RENAMED Viewed

@@ -10,6 +10,7 @@ The search family is intentionally backend-oriented, mirroring the SWE taskset p
 |---|---|---|---|
 | `openseeker` | [PolarSeeker/OpenSeeker](https://github.com/PolarSeeker/OpenSeeker) | [`PolarSeeker/OpenSeeker-v1-Data`](https://huggingface.co/datasets/PolarSeeker/OpenSeeker-v1-Data) | Binary semantic answer judge |
 | `quest` | [OSU-NLP-Group/QUEST](https://github.com/OSU-NLP-Group/QUEST) | [`osunlp/QUEST-RL-Data`](https://huggingface.co/datasets/osunlp/QUEST-RL-Data) | Objective tasks supported |
+| `redsearcher` | [RedSearchAgent/REDSearcher](https://github.com/RedSearchAgent/REDSearcher) | [`Zchu/REDSearcher_RL_1K`](https://huggingface.co/datasets/Zchu/REDSearcher_RL_1K) | Text RL query set supported |
 ## Usage
@@ -18,13 +19,14 @@ from verifiers.envs.experimental.composable.tasksets.search import make_search_t
 taskset = make_search_taskset(backend="openseeker")
 taskset = make_search_taskset(backend="quest", category="objective")
+redsearcher = make_search_taskset(backend="redsearcher", difficulty="easy")
 ```
 `make_search_taskset()` dispatches by backend name. Unknown backends raise `ValueError` with the available backend list.
 ## Output Contract
-Search tasksets should define their own output contract. The `quest` and `openseeker` backends expect the agent to write one final researched response to `/task/answer.txt`, including supporting URLs/citations when available. Scratch reasoning, tool traces, and logs should not be written as the final answer.
+Search tasksets should define their own output contract. The `quest`, `openseeker`, and `redsearcher` backends expect the agent to write one final researched response to `/task/answer.txt`, including supporting URLs/citations when available. Scratch reasoning, tool traces, and logs should not be written as the final answer.
 ## Error Handling

verifiers-0.1.15.dev172/verifiers/envs/experimental/composable/tasksets/search/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""Composable search/research tasksets."""
+from .search_tasksets import (
+    make_openseeker_taskset,
+    make_quest_taskset,
+    make_redsearcher_taskset,
+    make_search_taskset,
+)
+__all__ = [
+    "make_openseeker_taskset",
+    "make_quest_taskset",
+    "make_redsearcher_taskset",
+    "make_search_taskset",
+]

verifiers-0.1.15.dev172/verifiers/envs/experimental/composable/tasksets/search/redsearcher/README.md ADDED Viewed

@@ -0,0 +1,38 @@
+# REDSearcher Search Taskset
+Text RL queries from REDSearcher ported into the composable search taskset framework.
+## Source
+- Dataset: [`Zchu/REDSearcher_RL_1K`](https://huggingface.co/datasets/Zchu/REDSearcher_RL_1K)
+- Collection: [`Zchu/redsearcher`](https://huggingface.co/collections/Zchu/redsearcher)
+- Upstream project: [`RedSearchAgent/REDSearcher`](https://github.com/RedSearchAgent/REDSearcher)
+- Paper: [`arXiv:2602.14234`](https://arxiv.org/abs/2602.14234)
+The released text RL dataset contains 1,000 rows with `problem`, `answer`, and `difficulty` columns. The upstream REDSearcher repo describes converting each row into a Slime-style `prompt` plus `label`; this taskset keeps the same problem/answer boundary while adapting it to Verifiers' taskset format.
+## Task Contract
+Each example is a long-horizon web-search question. The agent should research across sources and produce one final answer in `/task/answer.txt`, with supporting URLs/citations when available.
+The paired `rlm_search` environment prompts RLM to write this file and provides web search/open-page skills. The rubric can fall back to the final assistant text if the answer file is empty, but agents should still write the file directly.
+## Scoring
+`RedSearcherRubric` compares the final response against the released `answer` label. It first applies a strict normalized exact-answer shortcut for unambiguous matches. Otherwise it uses an OpenAI-compatible LLM-as-judge prompt that follows the answer-matching convention in REDSearcher's DeepTraceHub evaluation code: judge whether the predicted final answer is equivalent to the ground truth and return binary accuracy.
+A reward of `1.0` means the final response matched the ground-truth answer; `0.0` means it did not, or no final answer was produced. Judge provider failures are preserved as `vf.Error` values on `state["error"]`.
+## Common Arguments
+| Argument | Default | Description |
+|---|---:|---|
+| `dataset_name` | `Zchu/REDSearcher_RL_1K` | Hugging Face dataset name. |
+| `split` | `train` | Dataset split. |
+| `difficulty` | `None` | Optional difficulty filter: `easy`, `medium`, `hard`, or `all`. |
+| `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
+| `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for answer-match judging. |
+| `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |
+| `judge_api_key_var` | `PRIME_API_KEY` | Env var containing the judge API key. |
+| `judge_max_retries` | `5` | Number of parse retries for the A/B judge response. |
+| `use_exact_match_shortcut` | `True` | Return `1.0` without an LLM call when the normalized final response exactly equals the normalized ground-truth answer. |

verifiers-0.1.15.dev172/verifiers/envs/experimental/composable/tasksets/search/redsearcher/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""REDSearcher search taskset."""
+from .taskset import RedSearcherRubric, RedSearcherTaskSet
+__all__ = ["RedSearcherRubric", "RedSearcherTaskSet"]

verifiers 0.1.15.dev170__tar.gz → 0.1.15.dev172__tar.gz

verifiers 0.1.15.dev170__tar.gz → 0.1.15.dev172__tar.gz