strands-env 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_env/core/models.py +4 -0
- strands_env/core/types.py +2 -0
- strands_env/environments/__init__.py +20 -0
- strands_env/environments/calculator/__init__.py +19 -0
- strands_env/environments/calculator/env.py +30 -0
- strands_env/environments/calculator/system_prompt.md +1 -0
- strands_env/environments/code_sandbox/__init__.py +19 -0
- strands_env/environments/code_sandbox/env.py +114 -0
- strands_env/environments/code_sandbox/system_prompt.md +9 -0
- strands_env/eval/__init__.py +25 -0
- strands_env/eval/aime.py +64 -0
- strands_env/eval/evaluator.py +221 -0
- strands_env/eval/metrics.py +70 -0
- strands_env/rewards/__init__.py +21 -0
- strands_env/rewards/math_reward.py +134 -0
- strands_env/tools/__init__.py +21 -0
- strands_env/tools/code_interpreter.py +192 -0
- strands_env/utils/__init__.py +29 -0
- strands_env/utils/aws.py +98 -0
- strands_env/utils/sglang.py +47 -0
- strands_env-0.1.1.dist-info/METADATA +203 -0
- strands_env-0.1.1.dist-info/RECORD +27 -0
- strands_env-0.1.0.dist-info/METADATA +0 -98
- strands_env-0.1.0.dist-info/RECORD +0 -9
- {strands_env-0.1.0.dist-info → strands_env-0.1.1.dist-info}/WHEEL +0 -0
- {strands_env-0.1.0.dist-info → strands_env-0.1.1.dist-info}/licenses/LICENSE +0 -0
strands_env/core/models.py
CHANGED
@@ -49,6 +49,7 @@ from strands.models import Model
 from strands.models.bedrock import BedrockModel
 from strands.models.openai import OpenAIModel
 from strands_sglang import SGLangClient, SGLangModel
+from strands_sglang.tool_parser import HermesToolCallParser, ToolCallParser
 from transformers import PreTrainedTokenizerBase
 
 #: Factory that produces a fresh `Model` per step (for concurrent step isolation).
@@ -66,6 +67,7 @@ def sglang_model_factory(
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     client: SGLangClient,
+    tool_call_parser: ToolCallParser = HermesToolCallParser(),
     sampling_params: dict[str, Any] = DEFAULT_SAMPLING_PARAMS,
     enable_thinking: bool | None = None,
 ) -> ModelFactory:
@@ -81,6 +83,7 @@ def sglang_model_factory(
     return lambda: SGLangModel(
         tokenizer=tokenizer,
         client=client,
+        tool_call_parser=tool_call_parser,
         params=sampling_params,
         model_id=model_id,
         return_logprobs=True,
@@ -124,6 +127,7 @@ def bedrock_model_factory(
         model_id=model_id,
         boto_session=boto_session,
         boto_client_config=boto_client_config,
+        streaming=False,
         **sampling_params,
     )
 
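In 0.1.1, `sglang_model_factory` gains a `tool_call_parser` argument (defaulting to `HermesToolCallParser()`) that is forwarded to `SGLangModel`, and `bedrock_model_factory` now constructs the Bedrock model with `streaming=False`. A minimal sketch of wiring the new parameter; the `SGLangClient` constructor arguments, tokenizer name, and model id below are illustrative assumptions, not taken from this diff:

    # Hedged sketch: client/tokenizer/model-id values are assumptions for illustration.
    from strands_sglang import SGLangClient
    from strands_sglang.tool_parser import HermesToolCallParser
    from transformers import AutoTokenizer

    from strands_env.core.models import sglang_model_factory

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # assumed tokenizer
    client = SGLangClient("http://localhost:30000")                        # assumed constructor args

    model_factory = sglang_model_factory(
        model_id="qwen2.5-7b-instruct",           # assumed id
        tokenizer=tokenizer,
        client=client,
        tool_call_parser=HermesToolCallParser(),  # new in 0.1.1; matches the default
    )
    model = model_factory()  # a fresh SGLangModel per step, per the factory's own comment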
|
strands_env/core/types.py
CHANGED
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import logging
+import uuid
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Any
@@ -41,6 +42,7 @@ class TaskContext(BaseModel):
 
     model_config = ConfigDict(extra="allow")
 
+    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
     ground_truth: Any = None
     conversation_history: Messages = Field(default_factory=list)
 
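`TaskContext` now auto-generates a unique `id`, which the evaluator added in this release uses to key checkpointed samples. A small sketch of the behavior implied by the new field (values shown are illustrative):

    # Hedged sketch based on the fields visible in this diff.
    from strands_env.core import TaskContext

    ctx = TaskContext(ground_truth="42")
    print(ctx.id)  # auto-generated UUID string

    # Or set explicitly, as AIMEEvaluator does when expanding prompts into samples:
    ctx2 = TaskContext(id="AIME_2024_0", ground_truth="42")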
strands_env/environments/__init__.py
ADDED
@@ -0,0 +1,20 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Environments for strands-env."""
+
+from .calculator import CalculatorEnv
+from .code_sandbox import CodeMode, CodeSandboxEnv
+
+__all__ = ["CalculatorEnv", "CodeMode", "CodeSandboxEnv"]

strands_env/environments/calculator/__init__.py
ADDED
@@ -0,0 +1,19 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Simple math environment with a calculator tool."""
+
+from .env import CalculatorEnv
+
+__all__ = ["CalculatorEnv"]

strands_env/environments/calculator/env.py
ADDED
@@ -0,0 +1,30 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Simple math environment using a calculator tool."""
+
+from pathlib import Path
+
+from strands_tools import calculator
+
+from strands_env.core.environment import Environment
+
+
+class CalculatorEnv(Environment):
+    """Simple math environment using a calculator tool."""
+
+    default_system_prompt_path = Path(__file__).parent / "system_prompt.md"
+
+    def get_tools(self):
+        return [calculator]

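`CalculatorEnv` only overrides `get_tools()` and the default system prompt; everything else comes from the `Environment` base class. A hedged usage sketch, with the base-class constructor arguments inferred from `CodeSandboxEnv.__init__` later in this diff and `MathRewardFunction` assumed to take no constructor arguments:

    # Hedged sketch; constructor parameters are inferred, and model_factory is a placeholder.
    from strands_env.core import Action, TaskContext
    from strands_env.environments import CalculatorEnv
    from strands_env.rewards import MathRewardFunction

    env = CalculatorEnv(
        model_factory=model_factory,     # e.g. from sglang_model_factory or bedrock_model_factory
        reward_fn=MathRewardFunction(),  # assumed no-arg constructor
        max_tool_iterations=10,
    )

    async def solve() -> None:
        await env.reset()
        result = await env.step(
            Action(message="What is 17 * 23?", task_context=TaskContext(ground_truth="391"))
        )
        print(result.reward)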
strands_env/environments/calculator/system_prompt.md
ADDED
@@ -0,0 +1 @@
+You are a math problem solver. Solve the given problem step by step using the calculator tool when needed. Put your final answer in \boxed{}.

strands_env/environments/code_sandbox/__init__.py
ADDED
@@ -0,0 +1,19 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Code sandbox environment using AWS Bedrock AgentCore Code Interpreter."""
+
+from .env import CodeMode, CodeSandboxEnv
+
+__all__ = ["CodeMode", "CodeSandboxEnv"]

strands_env/environments/code_sandbox/env.py
ADDED
@@ -0,0 +1,114 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Code sandbox environment using AWS Bedrock AgentCore Code Interpreter."""
+
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from strands_env.core.environment import Environment
+from strands_env.tools import CodeInterpreterToolkit
+from strands_env.utils.aws import get_boto3_session
+
+if TYPE_CHECKING:
+    import boto3
+
+    from strands_env.core.types import ModelFactory, RewardFunction
+
+
+class CodeMode(str, Enum):
+    """Tool modes for CodeSandboxEnv."""
+
+    CODE = "code"
+    """Only `execute_code` tool (Python execution)."""
+
+    TERMINAL = "terminal"
+    """Only `execute_command` tool (shell commands)."""
+
+    CODE_AND_TERMINAL = "code_and_terminal"
+    """Both `execute_code` and `execute_command` tools."""
+
+
+class CodeSandboxEnv(Environment):
+    """Code sandbox environment using AWS Bedrock AgentCore Code Interpreter.
+
+    Provides `execute_code` (Python) and/or `execute_command` (shell) tools
+    depending on the configured `CodeMode`.
+
+    Example:
+        from strands_env.environments.code_sandbox import CodeSandboxEnv, CodeMode
+        from strands_env.utils import get_boto3_session
+
+        session = get_boto3_session(region="us-east-1")
+        env = CodeSandboxEnv(
+            boto3_session=session,
+            model_factory=model_factory,
+            mode=CodeMode.CODE,  # Only Python execution
+        )
+
+        result = await env.step(action)
+        await env.cleanup()  # Clean up code interpreter session
+    """
+
+    default_system_prompt_path = Path(__file__).parent / "system_prompt.md"
+
+    def __init__(
+        self,
+        *,
+        model_factory: ModelFactory,
+        system_prompt: str | None = None,
+        reward_fn: RewardFunction | None = None,
+        max_tool_iterations: int = 10,
+        verbose: bool = False,
+        boto3_session: boto3.Session | None = None,
+        mode: CodeMode = CodeMode.CODE,
+    ):
+        """Initialize the code sandbox environment.
+
+        Args:
+            boto3_session: boto3 session for AWS credentials.
+            model_factory: Factory function that creates a fresh Model per step.
+            system_prompt: Optional system prompt override.
+            reward_fn: Optional reward function to compute rewards.
+            max_tool_iterations: Maximum tool iterations per step.
+            verbose: Whether to print verbose output.
+            mode: Tool mode - CODE, TERMINAL, or CODE_AND_TERMINAL.
+        """
+        super().__init__(
+            model_factory=model_factory,
+            reward_fn=reward_fn,
+            system_prompt=system_prompt,
+            max_tool_iterations=max_tool_iterations,
+            verbose=verbose,
+        )
+        self.mode = mode
+        self._toolkit = CodeInterpreterToolkit(
+            boto3_session=boto3_session or get_boto3_session(), session_name="strands-env-code-sandbox"
+        )
+
+    def get_tools(self):
+        """Return tools based on configured mode."""
+        tool_map = {
+            CodeMode.CODE: [self._toolkit.execute_code],
+            CodeMode.TERMINAL: [self._toolkit.execute_command],
+            CodeMode.CODE_AND_TERMINAL: [self._toolkit.execute_code, self._toolkit.execute_command],
+        }
+        return tool_map[self.mode]
+
+    async def cleanup(self) -> None:
+        """Clean up code interpreter session."""
+        self._toolkit.cleanup()

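The docstring example above shows the default `CODE` mode; the sketch below pairs the terminal-enabled mode with a Bedrock-backed model factory and a `try/finally` so the interpreter session is always released. The Bedrock model id is illustrative, and calling `bedrock_model_factory` with only `model_id` assumes its remaining parameters have defaults, which this diff does not show:

    # Hedged sketch; bedrock_model_factory's full signature is not visible in this diff.
    from strands_env.core.models import bedrock_model_factory
    from strands_env.environments import CodeMode, CodeSandboxEnv
    from strands_env.utils import get_boto3_session

    model_factory = bedrock_model_factory(model_id="anthropic.claude-3-5-sonnet-20240620-v1:0")  # assumed id

    env = CodeSandboxEnv(
        model_factory=model_factory,
        boto3_session=get_boto3_session(),
        mode=CodeMode.CODE_AND_TERMINAL,  # expose both execute_code and execute_command
    )

    async def run_one(action) -> None:
        try:
            await env.reset()
            result = await env.step(action)
        finally:
            await env.cleanup()  # releases the AgentCore code interpreter session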
strands_env/environments/code_sandbox/system_prompt.md
ADDED
@@ -0,0 +1,9 @@
+You are a helpful coding assistant with access to a sandboxed execution environment.
+
+Use the available tools to write and execute code to solve problems.
+
+When solving problems:
+1. Break down complex tasks into smaller steps
+2. Write and execute code to verify your solutions
+3. Use print statements to show intermediate results
+4. Handle errors gracefully and retry if needed

strands_env/eval/__init__.py
ADDED
@@ -0,0 +1,25 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .aime import AIMEEvaluator
+from .evaluator import EvalSample, Evaluator
+from .metrics import MetricFn, pass_at_k_metric
+
+__all__ = [
+    "AIMEEvaluator",
+    "EvalSample",
+    "Evaluator",
+    "MetricFn",
+    "pass_at_k_metric",
+]

strands_env/eval/aime.py
ADDED
@@ -0,0 +1,64 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""AIME (American Invitational Mathematics Examination) evaluator."""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+from typing import Literal
+
+from datasets import load_dataset
+
+from strands_env.core import Action, TaskContext
+
+from .evaluator import Evaluator
+
+logger = logging.getLogger(__name__)
+
+_AIME_DATASETS = {
+    "2024": "HuggingFaceH4/aime_2024",
+    "2025": "MathArena/aime_2025",
+}
+
+
+class AIMEEvaluator(Evaluator):
+    """Evaluator for AIME math competition problems."""
+
+    benchmark_name = "AIME"
+
+    def load_dataset(self, version: Literal["2024", "2025"] = "2024") -> Iterable[Action]:
+        """Load AIME dataset from HuggingFace."""
+        self.benchmark_name = f"{self.benchmark_name}_{version}"
+        dataset = load_dataset(_AIME_DATASETS[version], split="train")
+
+        actions = []
+        for i, row in enumerate(dataset):
+            problem, answer = row.get("problem"), row.get("answer")
+            if problem is None or answer is None:
+                logger.warning(f"Row {i}: missing problem/answer, skipped")
+                continue
+            actions.append(
+                Action(
+                    message=str(problem),
+                    task_context=TaskContext(
+                        id=f"{self.benchmark_name}_{row.get('id', i)}",
+                        ground_truth=str(answer),
+                    ),
+                )
+            )
+
+        logger.info(f"[{self.benchmark_name}] Loaded {len(actions)}/{len(dataset)} prompts")
+        return actions

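Putting the pieces together, a hedged end-to-end sketch of running the AIME benchmark with the calculator environment; `model_factory` and the reward function are placeholders, not prescriptions:

    # Hedged sketch combining the classes added in this release.
    import asyncio

    from strands_env.environments import CalculatorEnv
    from strands_env.eval import AIMEEvaluator
    from strands_env.rewards import MathRewardFunction

    async def env_factory(action):
        # A fresh environment per sample, as AsyncEnvFactory expects.
        return CalculatorEnv(model_factory=model_factory, reward_fn=MathRewardFunction())

    async def main() -> None:
        evaluator = AIMEEvaluator(
            env_factory,
            n_samples_per_prompt=4,          # enables pass@1 .. pass@4
            output_path="aime_2024.jsonl",   # JSONL checkpoint; enables resume
        )
        actions = evaluator.load_dataset(version="2024")
        results = await evaluator.run(actions)
        evaluator.compute_metrics(results)

    asyncio.run(main())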
strands_env/eval/evaluator.py
ADDED
@@ -0,0 +1,221 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Evaluator for running agentic benchmarks with `strands-env` environments."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from collections import defaultdict
+from collections.abc import Awaitable, Callable, Iterable
+from functools import partial
+from pathlib import Path
+
+from pydantic import BaseModel
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
+
+from strands_env.core import Action, Environment, StepResult
+
+from .metrics import MetricFn, pass_at_k_metric
+
+logger = logging.getLogger(__name__)
+
+#: Type alias for environment factory function (async).
+AsyncEnvFactory = Callable[[Action], Awaitable[Environment]]
+
+
+class EvalSample(BaseModel):
+    """Evaluation sample result."""
+
+    action: Action
+    """The action (task) that was evaluated."""
+
+    step_result: StepResult
+    """The result of the step (observation, reward, termination reason)."""
+
+
+class Evaluator:
+    """Evaluator for running concurrent environment evaluations."""
+
+    benchmark_name: str = ""
+    """Benchmark identifier. Override in subclasses."""
+
+    def __init__(
+        self,
+        env_factory: AsyncEnvFactory,
+        *,
+        max_concurrency: int = 10,
+        n_samples_per_prompt: int = 1,
+        output_path: Path | str = Path.cwd() / "results.jsonl",
+        save_interval: int = 10,
+        keep_tokens: bool = False,
+        metric_fns: list[MetricFn] = [],
+    ):
+        """Initialize the evaluator.
+
+        Args:
+            env_factory: Async factory function that creates a fresh Environment per sample.
+            max_concurrency: Maximum concurrent evaluate_sample() calls.
+            n_samples_per_prompt: Number of samples per prompt (for pass@k, set to max(k_values)).
+            output_path: Path to JSONL file for saving results. Enables resume.
+            save_interval: Flush results to disk every N completed samples.
+            keep_tokens: Keep token-level observation in results (only valid for `SGLangModel` backends).
+            metric_fns: Additional metric functions. `pass@k` is always included.
+        """
+        self.env_factory: AsyncEnvFactory = env_factory
+        self.max_concurrency = max_concurrency
+        self.n_samples_per_prompt = n_samples_per_prompt
+        self.output_path = Path(output_path)
+        self.save_interval = save_interval
+        self.keep_tokens = keep_tokens
+
+        # Always include pass@k, then any additional metrics
+        self.metric_fns: list[MetricFn] = [
+            partial(pass_at_k_metric, k_values=list(range(1, n_samples_per_prompt + 1)), reward_threshold=1.0)
+        ]
+        self.metric_fns += metric_fns
+
+        # Runtime state
+        self.results: dict[str, list[EvalSample]] = defaultdict(list)
+        self.completed_ids: set[str] = set()
+
+    def load_dataset(self) -> Iterable[Action]:
+        """Load dataset. Override in subclasses."""
+        raise NotImplementedError("Subclasses must implement load_dataset()")
+
+    def load_results(self) -> None:
+        """Load completed samples from checkpoint file."""
+        if not self.output_path.exists():
+            return
+
+        self.results = defaultdict(list)
+        self.completed_ids = set()
+
+        with open(self.output_path, encoding="utf-8") as f:
+            for line in f:
+                data = json.loads(line)
+                prompt_id = data.pop("prompt_id")
+                sample = EvalSample.model_validate(data)
+                self.results[prompt_id].append(sample)
+                self.completed_ids.add(sample.action.task_context.id)
+
+        total = sum(len(s) for s in self.results.values())
+        logger.info(f"Resumed {total} samples from {self.output_path}")
+
+    def save_results(self) -> None:
+        """Save all samples to checkpoint file."""
+        self.output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.output_path, "w", encoding="utf-8") as f:
+            for prompt_id, samples in self.results.items():
+                for sample in samples:
+                    data = sample.model_dump()
+                    data["prompt_id"] = prompt_id
+                    f.write(json.dumps(data, ensure_ascii=False) + "\n")
+
+    async def evaluate_sample(self, action: Action) -> EvalSample:
+        """Evaluate a single sample."""
+        env = await self.env_factory(action)
+        await env.reset()
+        step_result = await env.step(action)
+        if not self.keep_tokens:
+            step_result.observation.tokens = None
+        await env.cleanup()
+        # Runtime logging for debugging
+        reward_str = f"{step_result.reward.reward:.2f}" if step_result.reward else "N/A"
+        reward_info = step_result.reward.info if step_result.reward else {}
+        logger.info(
+            f"[{action.task_context.id}]: "
+            f"reward={reward_str} | "
+            f"label={action.task_context.ground_truth} | "
+            f"reward_info={reward_info} | "
+            f"metrics={step_result.observation.metrics}"
+        )
+        return EvalSample(action=action, step_result=step_result)
+
+    async def run(self, actions: Iterable[Action]) -> dict[str, list[EvalSample]]:
+        """Run evaluation on actions with n_samples_per_prompt each.
+
+        Args:
+            actions: Actions to evaluate.
+
+        Returns:
+            Dict mapping prompt_id to list of EvalSample results.
+        """
+        self.load_results()
+
+        # Expand actions to (prompt_id, sample_id, action) tuples
+        to_process: list[tuple[str, str, Action]] = []
+        for action in actions:
+            prompt_id = action.task_context.id
+            for i in range(self.n_samples_per_prompt):
+                sample_id = f"{prompt_id}_{i}"
+                if sample_id not in self.completed_ids:
+                    expanded = action.model_copy(deep=True)
+                    expanded.task_context.id = sample_id
+                    to_process.append((prompt_id, sample_id, expanded))
+
+        semaphore = asyncio.Semaphore(self.max_concurrency)
+        save_counter = 0
+        total = len(to_process)
+
+        async def process(prompt_id: str, sample_id: str, action: Action, pbar: tqdm) -> None:
+            nonlocal save_counter
+            async with semaphore:
+                sample = await self.evaluate_sample(action)
+                self.results[prompt_id].append(sample)
+                self.completed_ids.add(sample_id)
+                pbar.update(1)
+                save_counter += 1
+                if save_counter >= self.save_interval:
+                    self.save_results()
+                    save_counter = 0
+
+        with logging_redirect_tqdm():
+            with tqdm(total=total, desc=f"Evaluating {self.benchmark_name}", unit="sample", dynamic_ncols=True) as pbar:
+                await asyncio.gather(*[process(pid, sid, a, pbar) for pid, sid, a in to_process])
+        self.save_results()
+        return dict(self.results)
+
+    def compute_metrics(self, results: dict[str, list[EvalSample]], log: bool = True) -> dict[str, float]:
+        """Compute all metrics on results.
+
+        Args:
+            results: Dict mapping prompt_id to sample results.
+            log: Whether to log the metrics summary.
+
+        Returns:
+            Dict mapping metric names to values.
+        """
+        metrics = {}
+        for fn in self.metric_fns:
+            metrics.update(fn(results))
+
+        if log and metrics:
+            n_prompts = len(results)
+            n_samples = sum(len(s) for s in results.values())
+            name = self.benchmark_name or "Evaluation"
+
+            # Build formatted output
+            lines = [f"{'─' * 40}", f" {name} Results", f"{'─' * 40}"]
+            lines.append(f" Prompts: {n_prompts} Samples (n={self.n_samples_per_prompt}): {n_samples}")
+            lines.append("")
+            for metric, value in sorted(metrics.items()):
+                lines.append(f" {metric:<12} {value:>6.1%}")
+            lines.append(f"{'─' * 40}")
+            logger.info("\n" + "\n".join(lines))
+
+        return metrics

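Because `MetricFn` is just a callable over the results dict, extra metrics can be dropped in alongside the built-in pass@k. A hedged sketch of a mean-reward metric passed through `metric_fns`; field access follows the `EvalSample`/`StepResult` shapes used elsewhere in this diff:

    # Hedged sketch of a custom MetricFn.
    from strands_env.eval import EvalSample

    def mean_reward_metric(results: dict[str, list[EvalSample]]) -> dict[str, float]:
        rewards = [
            s.step_result.reward.reward
            for samples in results.values()
            for s in samples
            if s.step_result.reward is not None
        ]
        return {"mean_reward": sum(rewards) / len(rewards) if rewards else 0.0}

    # evaluator = AIMEEvaluator(env_factory, metric_fns=[mean_reward_metric])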
strands_env/eval/metrics.py
ADDED
@@ -0,0 +1,70 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Evaluation metrics for benchmark results."""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .evaluator import EvalSample
+
+#: Type alias for metric function: takes results {prompt_id: [EvalSample, ...]}, returns {metric_name: value}.
+MetricFn = Callable[[dict[str, list["EvalSample"]]], dict[str, float]]
+
+
+def pass_at_k_metric(
+    results: dict[str, list["EvalSample"]],
+    k_values: list[int],
+    reward_threshold: float = 1.0,
+) -> dict[str, float]:
+    """Compute pass@k metrics using unbiased estimator.
+
+    Args:
+        results: Dict mapping prompt_id to list of samples.
+        k_values: List of k values for pass@k.
+        reward_threshold: Reward threshold for "pass" (default: 1.0).
+
+    Returns:
+        Dict mapping "pass@k" to average score.
+    """
+    if not results:
+        return {f"pass@{k}": 0.0 for k in k_values}
+
+    def is_correct(s: EvalSample) -> bool:
+        r = s.step_result.reward
+        return r is not None and r.reward >= reward_threshold
+
+    def pass_at_k_single(n: int, c: int, k: int) -> float:
+        """Unbiased estimator: 1 - C(n-c, k) / C(n, k)."""
+        if n - c < k:
+            return 1.0
+        if c == 0:
+            return 0.0
+        log_ratio = sum(math.log(n - c - i) - math.log(n - i) for i in range(k))
+        return 1.0 - math.exp(log_ratio)
+
+    metrics = {}
+    for k in k_values:
+        scores = []
+        for samples in results.values():
+            n, c = len(samples), sum(1 for s in samples if is_correct(s))
+            if k <= n:
+                scores.append(pass_at_k_single(n, c, k))
+        metrics[f"pass@{k}"] = sum(scores) / len(scores) if scores else 0.0
+
+    return metrics

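A quick numerical check of the estimator: with n = 4 samples and c = 1 correct, pass@1 = 1 − C(3,1)/C(4,1) = 0.25 and pass@2 = 1 − C(3,2)/C(4,2) = 0.5, which the log-space implementation reproduces. A standalone replication of the helper from the file above:

    # Worked example of the unbiased pass@k estimator (standalone check).
    import math

    def pass_at_k_single(n: int, c: int, k: int) -> float:
        if n - c < k:
            return 1.0
        if c == 0:
            return 0.0
        log_ratio = sum(math.log(n - c - i) - math.log(n - i) for i in range(k))
        return 1.0 - math.exp(log_ratio)

    print(pass_at_k_single(4, 1, 1))  # 0.25
    print(pass_at_k_single(4, 1, 2))  # 0.5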
strands_env/rewards/__init__.py
ADDED
@@ -0,0 +1,21 @@
+# Copyright 2025 Horizon RL Contributors
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Reward functions for strands-env."""
+
+from .math_reward import MathRewardFunction
+
+__all__ = [
+    "MathRewardFunction",
+]