PyPI - verifiers - Versions diffs - 0.1.15.dev173__tar.gz → 0.1.15.dev176__tar.gz - Mend

verifiers 0.1.15.dev173__tar.gz → 0.1.15.dev176__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (352) hide show

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: verifiers
-Version: 0.1.15.dev173
+Version: 0.1.15.dev176
 Summary: Verifiers: Environments for LLM Reinforcement Learning
 Project-URL: Homepage, https://github.com/primeintellect-ai/verifiers
 Project-URL: Documentation, https://github.com/primeintellect-ai/verifiers
@@ -40,7 +40,7 @@ Requires-Dist: openai>=1.108.1
 Requires-Dist: pillow
 Requires-Dist: prime-pydantic-config[toml]
 Requires-Dist: prime-sandboxes>=0.2.25
-Requires-Dist: prime-tunnel>=0.1.6
+Requires-Dist: prime-tunnel>=0.1.8
 Requires-Dist: pydantic>=2.11.9
 Requires-Dist: pymupdf
 Requires-Dist: pyzmq>=27.1.0

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/pyproject.toml RENAMED Viewed

@@ -37,7 +37,7 @@ dependencies = [
     "nest-asyncio>=1.6.0", # for jupyter notebooks
     "openai>=1.108.1",
     "openai-agents>=0.0.7",
-    "prime-tunnel>=0.1.6",
+    "prime-tunnel>=0.1.8",
     "prime-sandboxes>=0.2.25",
     "pydantic>=2.11.9",
     "requests",

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/envs/experimental/composable/tasksets/search/README.md RENAMED Viewed

@@ -19,7 +19,10 @@ from verifiers.envs.experimental.composable.tasksets.search import make_search_t
 taskset = make_search_taskset(backend="openseeker")
 taskset = make_search_taskset(backend="quest", category="objective")
-redsearcher = make_search_taskset(backend="redsearcher", difficulty="easy")
+redsearcher = make_search_taskset(
+    backend="redsearcher",
+    filter_fn="lambda x: x['info']['difficulty'] == 'easy'",
+)
 ```
 `make_search_taskset()` dispatches by backend name. Unknown backends raise `ValueError` with the available backend list.

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/envs/experimental/composable/tasksets/search/openseeker/README.md RENAMED Viewed

@@ -4,12 +4,27 @@ Composable search taskset for [`PolarSeeker/OpenSeeker-v1-Data`](https://hugging
 OpenSeeker v1 data contains synthesized deep-search QA pairs plus trajectories generated with `search` and `visit` tools. The public OpenSeeker evaluator scores only the final answer: it sends the question, gold answer, and model response to an LLM judge and expects `A` for correct or `B` for incorrect. This backend preserves that binary semantic answer-judge contract.
+By default, the taskset uses the full dataset. Use the shared `filter_fn`
+argument for row subsets such as source trajectory quality or tool-call count.
+The `trajectory_correctness` metadata describes the stored OpenSeeker source
+trajectory, not the validity of the question or gold answer.
 ## Usage
 ```python
 from verifiers.envs.experimental.composable.tasksets.search import make_search_taskset
 taskset = make_search_taskset(backend="openseeker")
+correct_source_trajectories = make_search_taskset(
+    backend="openseeker",
+    filter_fn="lambda x: x['info']['trajectory_correctness'] == 'Correct'",
+)
+shorter_source_trajectories = make_search_taskset(
+    backend="openseeker",
+    filter_fn="lambda x: (x['info']['number_of_tool_calls'] or 0) <= 20",
+)
 ```
 ## Arguments
@@ -18,9 +33,7 @@ taskset = make_search_taskset(backend="openseeker")
 |---|---:|---|
 | `dataset_name` | `PolarSeeker/OpenSeeker-v1-Data` | Hugging Face dataset name. |
 | `split` | `train` | Dataset split. |
-| `trajectory_correctness` | `Correct` | Keep rows with this trajectory label. Use `None` or `all` for all rows. |
-| `min_tool_calls` | `None` | Optional lower bound for `number of tool calls`. |
-| `max_tool_calls` | `None` | Optional upper bound for `number of tool calls`. |
+| `filter_fn` | `None` | Optional composable taskset filter over normalized rows. |
 | `include_trajectory` | `False` | Include the large source trajectory in task metadata. |
 | `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
 | `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model used for binary answer judging. |

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/envs/experimental/composable/tasksets/search/openseeker/taskset.py RENAMED Viewed

@@ -40,7 +40,6 @@ logger = logging.getLogger(__name__)
 DEFAULT_DATASET_NAME = "PolarSeeker/OpenSeeker-v1-Data"
 DEFAULT_SPLIT = "train"
-DEFAULT_TRAJECTORY_CORRECTNESS = "Correct"
 DEFAULT_ANSWER_FILE = "/task/answer.txt"
 DEFAULT_WORKDIR = "/workspace"
 DEFAULT_JUDGE_BASE_URL = "https://api.pinference.ai/api/v1"
@@ -281,9 +280,6 @@ class OpenSeekerTaskSet(SandboxTaskSet):
         self,
         dataset_name: str = DEFAULT_DATASET_NAME,
         split: str = DEFAULT_SPLIT,
-        trajectory_correctness: str | None = DEFAULT_TRAJECTORY_CORRECTNESS,
-        min_tool_calls: int | None = None,
-        max_tool_calls: int | None = None,
         include_trajectory: bool = False,
         filter_fn: str | None = None,
         ds_keep_in_memory: bool | None = False,
@@ -299,27 +295,8 @@ class OpenSeekerTaskSet(SandboxTaskSet):
         judge_api_key_var: str = DEFAULT_JUDGE_API_KEY_VAR,
         judge_sampling_args: dict[str, Any] | None = None,
     ) -> None:
-        if trajectory_correctness == "all":
-            trajectory_correctness = None
-        if trajectory_correctness not in {"Correct", "Incorrect", None}:
-            raise ValueError(
-                "trajectory_correctness must be 'Correct', 'Incorrect', 'all', or None"
-            )
-        if min_tool_calls is not None and min_tool_calls < 0:
-            raise ValueError("min_tool_calls must be non-negative")
-        if max_tool_calls is not None and max_tool_calls < 0:
-            raise ValueError("max_tool_calls must be non-negative")
-        if (
-            min_tool_calls is not None
-            and max_tool_calls is not None
-            and min_tool_calls > max_tool_calls
-        ):
-            raise ValueError("min_tool_calls cannot exceed max_tool_calls")
         self.dataset_name = dataset_name
         self.split = split
-        self.trajectory_correctness = trajectory_correctness
-        self.min_tool_calls = min_tool_calls
-        self.max_tool_calls = max_tool_calls
         self.include_trajectory = include_trajectory
         self.ds_keep_in_memory = ds_keep_in_memory
         self.ds_num_proc = ds_num_proc
@@ -335,12 +312,9 @@ class OpenSeekerTaskSet(SandboxTaskSet):
         self._judge_base_url = judge_base_url
         self._judge_api_key_var = judge_api_key_var
         self._judge_sampling_args = dict(judge_sampling_args or {})
-        name_parts = ["search", "openseeker"]
-        if trajectory_correctness is not None:
-            name_parts.append(trajectory_correctness.lower())
         super().__init__(
             dataset=self._build_dataset,
-            name="/".join(name_parts),
+            name="search/openseeker",
             filter_fn=filter_fn,
         )
@@ -364,22 +338,9 @@ class OpenSeekerTaskSet(SandboxTaskSet):
         rows: list[dict[str, Any]] = []
         for row_index, row in enumerate(raw):
             correctness = row.get("trajectory correctness")
-            if (
-                self.trajectory_correctness is not None
-                and correctness != self.trajectory_correctness
-            ):
-                continue
             tool_calls = row.get("number of tool calls")
             if not isinstance(tool_calls, int):
                 tool_calls = None
-            if self.min_tool_calls is not None and (
-                tool_calls is None or tool_calls < self.min_tool_calls
-            ):
-                continue
-            if self.max_tool_calls is not None and (
-                tool_calls is None or tool_calls > self.max_tool_calls
-            ):
-                continue
             question = str(row.get("question") or "").strip()
             answer = str(row.get("answer") or "").strip()
             if not question or not answer:

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/README.md RENAMED Viewed

@@ -19,7 +19,7 @@ The paired `rlm_search` environment prompts RLM to write this file and provides
 ## Scoring
-`RedSearcherRubric` compares the final response against the released `answer` label. It first applies a strict normalized exact-answer shortcut for unambiguous matches. Otherwise it uses an OpenAI-compatible LLM-as-judge prompt that follows the answer-matching convention in REDSearcher's DeepTraceHub evaluation code: judge whether the predicted final answer is equivalent to the ground truth and return binary accuracy.
+`RedSearcherRubric` compares the final response against the released `answer` label. It first applies a strict normalized exact-answer shortcut for unambiguous matches. Otherwise it uses an OpenAI-compatible LLM-as-judge prompt that matches REDSearcher's released DeepTraceHub BROWSECOMP evaluator prompt and returns binary accuracy.
 A reward of `1.0` means the final response matched the ground-truth answer; `0.0` means it did not, or no final answer was produced. Judge provider failures are preserved as `vf.Error` values on `state["error"]`.
@@ -29,7 +29,7 @@ A reward of `1.0` means the final response matched the ground-truth answer; `0.0
 |---|---:|---|
 | `dataset_name` | `Zchu/REDSearcher_RL_1K` | Hugging Face dataset name. |
 | `split` | `train` | Dataset split. |
-| `difficulty` | `None` | Optional difficulty filter: `easy`, `medium`, `hard`, or `all`. |
+| `filter_fn` | `None` | Optional composable taskset filter over normalized rows, for example `lambda x: x['info']['difficulty'] == 'easy'`. |
 | `answer_file` | `/task/answer.txt` | Final answer path in the sandbox. |
 | `judge_model` | `openai/gpt-5.4-mini` | OpenAI-compatible model for answer-match judging. |
 | `judge_base_url` | `https://api.pinference.ai/api/v1` | Judge API base URL. |

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/envs/experimental/composable/tasksets/search/redsearcher/taskset.py RENAMED Viewed

@@ -47,28 +47,75 @@ DEFAULT_JUDGE_API_KEY_VAR = "PRIME_API_KEY"
 DEFAULT_JUDGE_MODEL = "openai/gpt-5.4-mini"
 DEFAULT_SANDBOX_IMAGE = "python:3.11-slim"
+# Matches DeepTraceHub's released BROWSECOMP judge prompt, the closest public
+# reference for REDSearcher's RL reward while the RL trainer remains unreleased.
 _JUDGE_PROMPT = """\
-You are grading a deep-search question answering response.
-Decide whether the predicted response gives the same final answer as the
-ground-truth answer. Ignore citations, formatting, capitalization, and extra
-explanation unless they contradict the final answer. For numeric answers,
-allow insignificant formatting differences but not a different value. If the
-response gives multiple incompatible answers, is evasive, or merely repeats
-the question, mark it incorrect.
-Question:
-{question}
-Ground-truth answer:
-{answer}
-Predicted response:
-{response}
-Return only one letter:
-A. CORRECT
-B. INCORRECT
+Based on the given question, standard answer, and model-predicted answer, evaluate whether the model's response is correct. Your task is to classify the result as: [CORRECT] or [INCORRECT].
+First, we'll list examples for each category, then you'll evaluate a new question's predicted answer.
+Here are examples of [CORRECT] responses:
+```
+Question: What are the names of Barack Obama's children?
+Standard Answer: Malia Obama and Sasha Obama
+Model Prediction 1: Malia Obama and Sasha Obama
+Model Prediction 2: Malia and Sasha
+Model Prediction 3: Most would say Malia and Sasha, but I'm not sure, I should verify
+Model Prediction 4: Barack Obama has two daughters, Malia Ann and Natasha Marian, commonly known as Malia Obama and Sasha Obama.
+```
+These responses are all [CORRECT] because they:
+    - Fully include the important information from the standard answer.
+    - Don't contain any information that contradicts the standard answer.
+    - Focus only on semantic content; language, capitalization, punctuation, grammar, and order aren't important.
+    - Vague statements or guesses are acceptable as long as they include the standard answer and don't contain incorrect information or contradictions.
+Here are examples of [INCORRECT] responses:
+```
+Question: What are the names of Barack Obama's children?
+Standard Answer: Malia Obama and Sasha Obama
+Model Prediction 1: Malia
+Model Prediction 2: Malia, Sasha and Susan or Sasha Obama or Malia Obama, or Natasha Marian, or Einstein
+Model Prediction 3: While I don't know their exact names, I can tell you Barack Obama has two children.
+Model Prediction 4: You might be thinking of Betsy and Olivia. But you should verify the details with the latest references. Is that the correct answer?
+Model Prediction 5: Barack Obama's children
+```
+These responses are all [INCORRECT] because they:
+    - Contain factual statements that contradict the standard answer.
+    - Are empty or merely repeat the question.
+    - Enumerate multiple answers or repeat the answer.
+Pay special attention to the following:
+- The standard answer may contain responses to multiple aspects of the question, and within the same aspect, there might be different descriptions, all of which are correct and are given in the same bracket, connected by commas. For example, for the question "What is the name of ByteDance's AI model?", the standard answer is "[[Doubao, Skylark]]":
+    - Predicted answers "Doubao", "Doubao, Skylark", "Skylark", etc. are all [CORRECT].
+- For standard answers containing responses to different aspects, the model needs to provide answers to all aspects to be considered correct; otherwise, it's directly judged as [INCORRECT]. There is no [PARTIALLY CORRECT] output option. These answers will be given in different brackets. For example, for the question "Who are the members of TFBOYS?", the standard answer is "[[Wang Junkai][Wang Yuan][Yi Yangqianxi]]":
+    - Predicted answers like "Wang Junkai, Wang Yuan, Yi Yangqianxi" that include all answers are [CORRECT].
+    - Predicted answers like "Wang Junkai, Yi Yangqianxi" that don't include all answers are [INCORRECT].
+Also note the following points:
+- For questions with numerical standard answers, the predicted answer should match the standard answer. For example, for the question "What is the total length in meters of the Huangpu River Bridge on the Jinshan Railway?", the standard answer is "3518.17":
+    - Predicted answers "3518", "3518.1", "3518.17" are all [CORRECT].
+    - Predicted answers "3520" and "3600" are [INCORRECT].
+- If the model prediction doesn't directly answer the question, attempts to circumvent or fails to directly provide the standard answer, it's considered an [INCORRECT] answer.
+    - For example, for the question "Who is JJ Lin's wife?", with the standard answer "Ding Wenqi", model predictions like "JJ Lin's wife", "JJ Lin's wife should be excellent", "JJ Lin's wife might be a public figure" are all [INCORRECT].
+- If the standard answer contains more information than the question asks for, the predicted answer only needs to include the information mentioned in the question.
+    - For example, for the question "What is the main chemical component of magnesite?", with the standard answer "Magnesium carbonate (MgCO3)", "Magnesium carbonate" or "MgCO3" are both considered [CORRECT] answers.
+- If information omitted in the predicted answer can be clearly inferred from the question, it's considered correct.
+    - For example, for the question "The Nuragic ruins of Barumini were listed as a World Cultural Heritage by UNESCO in 1997, so where is this site located?", with the standard answer "Sardinia, Italy", the predicted answer "Sardinia" is considered [CORRECT].
+- If it's clear that different translations of a name refer to the same person, it's considered correct.
+    - For example, if the standard answer is "Robinson", answers like "Lubinson" or "Lubinsun" are both correct.
+- You should focus more on the match between the standard answer and the model prediction, rather than whether the standard answer itself is correct.
+Below is a new question example. Please reply with only [CORRECT] or [INCORRECT], without apologies or corrections to your own errors, just evaluate the answer.
+```
+Question: {question}
+Standard Answer: {correct_answer}
+Predicted Answer: {response}
+```
+Evaluate this new question's predicted answer as one of the following:
+A. [CORRECT]
+B. [INCORRECT]
+Return only the option representing [CORRECT] or [INCORRECT], i.e. just return A or B, without adding any other text.
 """
 _CONTEXT_LENGTH_ERROR_PHRASES = (
@@ -225,7 +272,6 @@ class RedSearcherTaskSet(SandboxTaskSet):
         self,
         dataset_name: str = DEFAULT_DATASET_NAME,
         split: str = DEFAULT_SPLIT,
-        difficulty: str | None = None,
         filter_fn: str | None = None,
         ds_keep_in_memory: bool | None = True,
         ds_num_proc: int | None = None,
@@ -242,13 +288,8 @@ class RedSearcherTaskSet(SandboxTaskSet):
         judge_max_retries: int = 5,
         use_exact_match_shortcut: bool = True,
     ) -> None:
-        if difficulty not in {None, "all", "easy", "medium", "hard"}:
-            raise ValueError(
-                "difficulty must be one of None, 'all', 'easy', 'medium', or 'hard'"
-            )
         self.dataset_name = dataset_name
         self.split = split
-        self.difficulty = difficulty
         self.ds_keep_in_memory = ds_keep_in_memory
         self.ds_num_proc = ds_num_proc
         self.answer_file = answer_file
@@ -265,10 +306,9 @@ class RedSearcherTaskSet(SandboxTaskSet):
         self._judge_sampling_args = dict(judge_sampling_args or {})
         self._judge_max_retries = judge_max_retries
         self._use_exact_match_shortcut = use_exact_match_shortcut
-        label = difficulty or "all"
         super().__init__(
             dataset=self._build_dataset,
-            name=f"search/redsearcher/{label}",
+            name="search/redsearcher",
             filter_fn=filter_fn,
         )
@@ -282,8 +322,6 @@ class RedSearcherTaskSet(SandboxTaskSet):
         rows: list[dict[str, Any]] = []
         for idx, row in enumerate(raw):
             difficulty = str(row.get("difficulty") or "")
-            if self.difficulty not in {None, "all"} and difficulty != self.difficulty:
-                continue
             question = str(row.get("problem") or "").strip()
             answer = str(row.get("answer") or "").strip()
             if not question or not answer:
@@ -475,7 +513,7 @@ class RedSearcherRubric(vf.Rubric):
         prompt = _JUDGE_PROMPT.format(
             question=question,
             response=response,
-            answer=answer,
+            correct_answer=answer,
         )
         client = self._get_client()
         request_kwargs = dict(self.judge_sampling_args)

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/v1/harness.py RENAMED Viewed

@@ -236,8 +236,10 @@ class Harness(RuntimeOwnerMixin[ConfigT], Generic[ConfigT]):
         return config
     def load_endpoint(self) -> Endpoint:
+        sandbox_config = self.program_sandbox_config(self.program_config)
         return Endpoint(
-            use_tunnel=self.program_sandbox_config(self.program_config) is not None
+            use_tunnel=sandbox_config is not None,
+            tunnel_labels=sandbox_config.labels if sandbox_config is not None else None,
         )
     def rebuild_runtime(self) -> None:

{verifiers-0.1.15.dev173 → verifiers-0.1.15.dev176}/verifiers/v1/utils/endpoint_utils.py RENAMED Viewed

@@ -146,6 +146,7 @@ class Endpoint:
         secret: str | None = None,
         use_tunnel: bool = False,
         logger: logging.Logger | None = None,
+        tunnel_labels: list[str] | None = None,
     ):
         self.use_tunnel = use_tunnel
         self.logger = logger or logging.getLogger(__name__)
@@ -154,6 +155,7 @@ class Endpoint:
             secret=secret or os.environ.get("ENDPOINT_SECRET"),
         )
         self.secret = self.server.secret
+        self.tunnel_labels = list(tunnel_labels) if tunnel_labels else []
         self._tunnel: TunnelHandle | None = None
         self._tunnel_lock = asyncio.Lock()
         self._tunnel_last_checked = 0.0
@@ -295,7 +297,10 @@ class Endpoint:
                         self._tunnel = None
             if self._tunnel is None:
-                tunnel = cast(TunnelHandle, Tunnel(local_port=self.server.port))
+                tunnel = cast(
+                    TunnelHandle,
+                    Tunnel(local_port=self.server.port, labels=self.tunnel_labels),
+                )
                 url = await tunnel.start()
                 self._tunnel = tunnel
                 self._tunnel_last_checked = time.time()