PyPI - stable-baselines3 - Versions diffs - 2.2.1__tar.gz → 2.3.0__tar.gz - Mend

stable-baselines3 2.2.1tar.gz → 2.3.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: stable_baselines3
-Version: 2.2.1
+Version: 2.3.0
 Summary: Pytorch version of Stable Baselines, implementations of reinforcement learning algorithms.
 Home-page: https://github.com/DLR-RM/stable-baselines3
 Author: Antonin Raffin
@@ -34,8 +34,8 @@ Requires-Dist: pytest-cov; extra == "tests"
 Requires-Dist: pytest-env; extra == "tests"
 Requires-Dist: pytest-xdist; extra == "tests"
 Requires-Dist: mypy; extra == "tests"
-Requires-Dist: ruff>=0.0.288; extra == "tests"
-Requires-Dist: black<24,>=23.9.1; extra == "tests"
+Requires-Dist: ruff>=0.3.1; extra == "tests"
+Requires-Dist: black<25,>=24.2.0; extra == "tests"
 Provides-Extra: docs
 Requires-Dist: sphinx<8,>=5; extra == "docs"
 Requires-Dist: sphinx-autobuild; extra == "docs"
@@ -99,7 +99,7 @@ import gymnasium
 from stable_baselines3 import PPO
-env = gymnasium.make("CartPole-v1")
+env = gymnasium.make("CartPole-v1", render_mode="human")
 model = PPO("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/README.md RENAMED Viewed

@@ -127,7 +127,7 @@ import gymnasium as gym
 from stable_baselines3 import PPO
-env = gym.make("CartPole-v1")
+env = gym.make("CartPole-v1", render_mode="human")
 model = PPO("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/pyproject.toml RENAMED Viewed

@@ -3,13 +3,15 @@
 line-length = 127
 # Assume Python 3.8
 target-version = "py38"
+[tool.ruff.lint]
 # See https://beta.ruff.rs/docs/rules/
 select = ["E", "F", "B", "UP", "C90", "RUF"]
 # B028: Ignore explicit stacklevel
 # RUF013: Too many false positives (implicit optional)
 ignore = ["B028", "RUF013"]
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 # Default implementation in abstract methods
 "./stable_baselines3/common/callbacks.py"= ["B027"]
 "./stable_baselines3/common/noise.py"= ["B027"]
@@ -17,7 +19,7 @@ ignore = ["B028", "RUF013"]
 "./tests/*.py"= ["RUF012", "RUF013"]
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
 max-complexity = 15

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/setup.py RENAMED Viewed

@@ -43,7 +43,7 @@ import gymnasium
 from stable_baselines3 import PPO
-env = gymnasium.make("CartPole-v1")
+env = gymnasium.make("CartPole-v1", render_mode="human")
 model = PPO("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)
@@ -120,9 +120,9 @@ setup(
             # Type check
             "mypy",
             # Lint code and sort imports (flake8 and isort replacement)
-            "ruff>=0.0.288",
+            "ruff>=0.3.1",
             # Reformat
-            "black>=23.9.1,<24",
+            "black>=24.2.0,<25",
         ],
         "docs": [
             "sphinx>=5,<8",

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/base_class.py RENAMED Viewed

@@ -523,7 +523,10 @@ class BaseAlgorithm(ABC):
         :param total_timesteps: The total number of samples (env steps) to train on
         :param callback: callback(s) called at every step with state of the algorithm.
-        :param log_interval: The number of episodes before logging.
+        :param log_interval: for on-policy algos (e.g., PPO, A2C, ...) this is the number of
+            training iterations (i.e., log_interval * n_steps * n_envs timesteps) before logging;
+            for off-policy algos (e.g., TD3, SAC, ...) this is the number of episodes before
+            logging.
         :param tb_log_name: the name of the run for TensorBoard logging
         :param reset_num_timesteps: whether or not to reset the current timestep number (used in logging)
         :param progress_bar: Display a progress bar using tqdm and rich.

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/distributions.py RENAMED Viewed

@@ -113,7 +113,7 @@ def sum_independent_dims(tensor: th.Tensor) -> th.Tensor:
     so we can sum components of the ``log_prob`` or the entropy.
     :param tensor: shape: (n_batch, n_actions) or (n_batch,)
-    :return: shape: (n_batch,)
+    :return: shape: (n_batch,) for (n_batch, n_actions) input, scalar for (n_batch,) input
     """
     if len(tensor.shape) > 1:
         tensor = tensor.sum(dim=1)

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/env_checker.py RENAMED Viewed

@@ -17,13 +17,37 @@ def _is_numpy_array_space(space: spaces.Space) -> bool:
     return not isinstance(space, (spaces.Dict, spaces.Tuple))
+def _starts_at_zero(space: Union[spaces.Discrete, spaces.MultiDiscrete]) -> bool:
+    """
+    Return True if a (Multi)Discrete space starts at zero, False if it has a non-zero start.
+    """
+    return np.allclose(space.start, np.zeros_like(space.start))
+def _check_non_zero_start(space: spaces.Space, space_type: str = "observation", key: str = "") -> None:
+    """
+    :param space: Observation or action space
+    :param space_type: information about whether it is an observation or action space
+        (for the warning message)
+    :param key: When the observation space comes from a Dict space, we pass the
+        corresponding key to have more precise warning messages. Defaults to "".
+    """
+    if isinstance(space, (spaces.Discrete, spaces.MultiDiscrete)) and not _starts_at_zero(space):
+        maybe_key = f"(key='{key}')" if key else ""
+        warnings.warn(
+            f"{type(space).__name__} {space_type} space {maybe_key} with a non-zero start (start={space.start}) "
+            "is not supported by Stable-Baselines3. "
+            f"You can use a wrapper or update your {space_type} space."
+        )
 def _check_image_input(observation_space: spaces.Box, key: str = "") -> None:
     """
     Check that the input will be compatible with Stable-Baselines
     when the observation is apparently an image.
     :param observation_space: Observation space
-    :key: When the observation space comes from a Dict space, we pass the
+    :param key: When the observation space comes from a Dict space, we pass the
         corresponding key to have more precise warning messages. Defaults to "".
     """
     if observation_space.dtype != np.uint8:
@@ -63,11 +87,7 @@ def _check_unsupported_spaces(env: gym.Env, observation_space: spaces.Space, act
         for key, space in observation_space.spaces.items():
             if isinstance(space, spaces.Dict):
                 nested_dict = True
-            if isinstance(space, spaces.Discrete) and space.start != 0:
-                warnings.warn(
-                    f"Discrete observation space (key '{key}') with a non-zero start is not supported by Stable-Baselines3. "
-                    "You can use a wrapper or update your observation space."
-                )
+            _check_non_zero_start(space, "observation", key)
         if nested_dict:
             warnings.warn(
@@ -87,11 +107,7 @@ def _check_unsupported_spaces(env: gym.Env, observation_space: spaces.Space, act
             "which is supported by SB3."
         )
-    if isinstance(observation_space, spaces.Discrete) and observation_space.start != 0:
-        warnings.warn(
-            "Discrete observation space with a non-zero start is not supported by Stable-Baselines3. "
-            "You can use a wrapper or update your observation space."
-        )
+    _check_non_zero_start(observation_space, "observation")
     if isinstance(observation_space, spaces.Sequence):
         warnings.warn(
@@ -100,11 +116,7 @@ def _check_unsupported_spaces(env: gym.Env, observation_space: spaces.Space, act
             "Note: The checks for returned values are skipped."
         )
-    if isinstance(action_space, spaces.Discrete) and action_space.start != 0:
-        warnings.warn(
-            "Discrete action space with a non-zero start is not supported by Stable-Baselines3. "
-            "You can use a wrapper or update your action space."
-        )
+    _check_non_zero_start(action_space, "action")
     if not _is_numpy_array_space(action_space):
         warnings.warn(

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/on_policy_algorithm.py RENAMED Viewed

@@ -92,6 +92,7 @@ class OnPolicyAlgorithm(BaseAlgorithm):
             use_sde=use_sde,
             sde_sample_freq=sde_sample_freq,
             support_multi_env=True,
+            monitor_wrapper=monitor_wrapper,
             seed=seed,
             stats_window_size=stats_window_size,
             tensorboard_log=tensorboard_log,
@@ -200,7 +201,7 @@ class OnPolicyAlgorithm(BaseAlgorithm):
             if not callback.on_step():
                 return False
-            self._update_info_buffer(infos)
+            self._update_info_buffer(infos, dones)
             n_steps += 1
             if isinstance(self.action_space, spaces.Discrete):
@@ -250,6 +251,28 @@ class OnPolicyAlgorithm(BaseAlgorithm):
         """
         raise NotImplementedError
+    def _dump_logs(self, iteration: int) -> None:
+        """
+        Write log.
+        :param iteration: Current logging iteration
+        """
+        assert self.ep_info_buffer is not None
+        assert self.ep_success_buffer is not None
+        time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon)
+        fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed)
+        self.logger.record("time/iterations", iteration, exclude="tensorboard")
+        if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
+            self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
+            self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
+        self.logger.record("time/fps", fps)
+        self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
+        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
+        if len(self.ep_success_buffer) > 0:
+            self.logger.record("rollout/success_rate", safe_mean(self.ep_success_buffer))
+        self.logger.dump(step=self.num_timesteps)
     def learn(
         self: SelfOnPolicyAlgorithm,
         total_timesteps: int,
@@ -285,16 +308,7 @@ class OnPolicyAlgorithm(BaseAlgorithm):
             # Display training infos
             if log_interval is not None and iteration % log_interval == 0:
                 assert self.ep_info_buffer is not None
-                time_elapsed = max((time.time_ns() - self.start_time) / 1e9, sys.float_info.epsilon)
-                fps = int((self.num_timesteps - self._num_timesteps_at_start) / time_elapsed)
-                self.logger.record("time/iterations", iteration, exclude="tensorboard")
-                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
-                    self.logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
-                    self.logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
-                self.logger.record("time/fps", fps)
-                self.logger.record("time/time_elapsed", int(time_elapsed), exclude="tensorboard")
-                self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
-                self.logger.dump(step=self.num_timesteps)
+                self._dump_logs(iteration)
             self.train()

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/policies.py RENAMED Viewed

@@ -173,7 +173,9 @@ class BaseModel(nn.Module):
         :return:
         """
         device = get_device(device)
-        saved_variables = th.load(path, map_location=device)
+        # Note(antonin): we cannot use `weights_only=True` here because we need to allow
+        # gymnasium imports for the policy to be loaded successfully
+        saved_variables = th.load(path, map_location=device, weights_only=False)
         # Create policy object
         model = cls(**saved_variables["data"])

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/save_util.py RENAMED Viewed

@@ -2,6 +2,7 @@
 Save util taken from stable_baselines
 used to serialize data (class parameters) of model classes
 """
 import base64
 import functools
 import io
@@ -446,7 +447,7 @@ def load_from_zip_file(
                     file_content.seek(0)
                     # Load the parameters with the right ``map_location``.
                     # Remove ".pth" ending with splitext
-                    th_object = th.load(file_content, map_location=device)
+                    th_object = th.load(file_content, map_location=device, weights_only=True)
                     # "tensors.pth" was renamed "pytorch_variables.pth" in v0.9.0, see PR #138
                     if file_path == "pytorch_variables.pth" or file_path == "tensors.pth":
                         # PyTorch variables (not state_dicts)

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/type_aliases.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Common aliases for type hints"""
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Protocol, SupportsFloat, Tuple, Union

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/vec_env/util.py RENAMED Viewed

@@ -1,6 +1,7 @@
 """
 Helpers for dealing with vectorized environments.
 """
 from collections import OrderedDict
 from typing import Any, Dict, List, Tuple

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/common/vec_env/vec_frame_stack.py RENAMED Viewed

@@ -29,7 +29,12 @@ class VecFrameStack(VecEnvWrapper):
     def step_wait(
         self,
-    ) -> Tuple[Union[np.ndarray, Dict[str, np.ndarray]], np.ndarray, np.ndarray, List[Dict[str, Any]],]:
+    ) -> Tuple[
+        Union[np.ndarray, Dict[str, np.ndarray]],
+        np.ndarray,
+        np.ndarray,
+        List[Dict[str, Any]],
+    ]:
         observations, rewards, dones, infos = self.venv.step_wait()
         observations, infos = self.stacked_obs.update(observations, dones, infos)  # type: ignore[arg-type]
         return observations, rewards, dones, infos

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/ddpg/ddpg.py RENAMED Viewed

@@ -60,11 +60,11 @@ class DDPG(TD3):
         learning_rate: Union[float, Schedule] = 1e-3,
         buffer_size: int = 1_000_000,  # 1e6
         learning_starts: int = 100,
-        batch_size: int = 100,
+        batch_size: int = 256,
         tau: float = 0.005,
         gamma: float = 0.99,
-        train_freq: Union[int, Tuple[int, str]] = (1, "episode"),
-        gradient_steps: int = -1,
+        train_freq: Union[int, Tuple[int, str]] = 1,
+        gradient_steps: int = 1,
         action_noise: Optional[ActionNoise] = None,
         replay_buffer_class: Optional[Type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[Dict[str, Any]] = None,

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/dqn/dqn.py RENAMED Viewed

@@ -79,7 +79,7 @@ class DQN(OffPolicyAlgorithm):
         env: Union[GymEnv, str],
         learning_rate: Union[float, Schedule] = 1e-4,
         buffer_size: int = 1_000_000,  # 1e6
-        learning_starts: int = 50000,
+        learning_starts: int = 100,
         batch_size: int = 32,
         tau: float = 1.0,
         gamma: float = 0.99,

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3/td3/td3.py RENAMED Viewed

@@ -83,11 +83,11 @@ class TD3(OffPolicyAlgorithm):
         learning_rate: Union[float, Schedule] = 1e-3,
         buffer_size: int = 1_000_000,  # 1e6
         learning_starts: int = 100,
-        batch_size: int = 100,
+        batch_size: int = 256,
         tau: float = 0.005,
         gamma: float = 0.99,
-        train_freq: Union[int, Tuple[int, str]] = (1, "episode"),
-        gradient_steps: int = -1,
+        train_freq: Union[int, Tuple[int, str]] = 1,
+        gradient_steps: int = 1,
         action_noise: Optional[ActionNoise] = None,
         replay_buffer_class: Optional[Type[ReplayBuffer]] = None,
         replay_buffer_kwargs: Optional[Dict[str, Any]] = None,

stable_baselines3-2.3.0/stable_baselines3/version.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ 2.3.0

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
-Name: stable-baselines3
-Version: 2.2.1
+Name: stable_baselines3
+Version: 2.3.0
 Summary: Pytorch version of Stable Baselines, implementations of reinforcement learning algorithms.
 Home-page: https://github.com/DLR-RM/stable-baselines3
 Author: Antonin Raffin
@@ -34,8 +34,8 @@ Requires-Dist: pytest-cov; extra == "tests"
 Requires-Dist: pytest-env; extra == "tests"
 Requires-Dist: pytest-xdist; extra == "tests"
 Requires-Dist: mypy; extra == "tests"
-Requires-Dist: ruff>=0.0.288; extra == "tests"
-Requires-Dist: black<24,>=23.9.1; extra == "tests"
+Requires-Dist: ruff>=0.3.1; extra == "tests"
+Requires-Dist: black<25,>=24.2.0; extra == "tests"
 Provides-Extra: docs
 Requires-Dist: sphinx<8,>=5; extra == "docs"
 Requires-Dist: sphinx-autobuild; extra == "docs"
@@ -99,7 +99,7 @@ import gymnasium
 from stable_baselines3 import PPO
-env = gymnasium.make("CartPole-v1")
+env = gymnasium.make("CartPole-v1", render_mode="human")
 model = PPO("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/stable_baselines3.egg-info/requires.txt RENAMED Viewed

@@ -39,5 +39,5 @@ pytest-cov
 pytest-env
 pytest-xdist
 mypy
-ruff>=0.0.288
-black<24,>=23.9.1
+ruff>=0.3.1
+black<25,>=24.2.0

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/tests/test_envs.py RENAMED Viewed

@@ -123,6 +123,8 @@ def test_high_dimension_action_space():
         spaces.Dict({"img": spaces.Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8)}),
         # Non zero start index
         spaces.Discrete(3, start=-1),
+        # Non zero start index (MultiDiscrete)
+        spaces.MultiDiscrete([4, 4], start=[1, 0]),
         # Non zero start index inside a Dict
         spaces.Dict({"obs": spaces.Discrete(3, start=1)}),
     ],
@@ -164,6 +166,8 @@ def test_non_default_spaces(new_obs_space):
         spaces.Box(low=np.array([-1, -1, -1]), high=np.array([1, 1, 0.99]), dtype=np.float32),
         # Non zero start index
         spaces.Discrete(3, start=-1),
+        # Non zero start index (MultiDiscrete)
+        spaces.MultiDiscrete([4, 4], start=[1, 0]),
     ],
 )
 def test_non_default_action_spaces(new_action_space):
@@ -179,7 +183,7 @@ def test_non_default_action_spaces(new_action_space):
     env.action_space = new_action_space
     # Discrete action space
-    if isinstance(new_action_space, spaces.Discrete):
+    if isinstance(new_action_space, (spaces.Discrete, spaces.MultiDiscrete)):
         with pytest.warns(UserWarning):
             check_env(env)
         return

{stable_baselines3-2.2.1 → stable_baselines3-2.3.0}/tests/test_logger.py RENAMED Viewed

@@ -14,7 +14,7 @@ from gymnasium import spaces
 from matplotlib import pyplot as plt
 from pandas.errors import EmptyDataError
-from stable_baselines3 import A2C, DQN
+from stable_baselines3 import A2C, DQN, PPO
 from stable_baselines3.common.env_checker import check_env
 from stable_baselines3.common.logger import (
     DEBUG,
@@ -33,6 +33,7 @@ from stable_baselines3.common.logger import (
     read_csv,
     read_json,
 )
+from stable_baselines3.common.monitor import Monitor
 KEY_VALUES = {
     "test": 1,
@@ -474,3 +475,92 @@ def test_human_output_format_custom_test_io(base_class):
 """
     assert printed == desired_printed
+class DummySuccessEnv(gym.Env):
+    """
+    Create a dummy success environment that returns either True or False for info['is_success']
+    at the end of an episode according to its dummy successes list
+    """
+    def __init__(self, dummy_successes, ep_steps):
+        """Init the dummy success env
+        :param dummy_successes: list of size (n_logs_iterations, n_episodes_per_log) that specifies
+            the success value of log iteration i at episode j
+        :param ep_steps: number of steps per episode (to activate truncated)
+        """
+        self.n_steps = 0
+        self.log_id = 0
+        self.ep_id = 0
+        self.ep_steps = ep_steps
+        self.dummy_success = dummy_successes
+        self.num_logs = len(dummy_successes)
+        self.ep_per_log = len(dummy_successes[0])
+        self.steps_per_log = self.ep_per_log * self.ep_steps
+        self.action_space = spaces.Discrete(2)
+        self.observation_space = spaces.Discrete(2)
+    def reset(self, seed=None, options=None):
+        """
+        Reset the env and advance to the next episode_id to get the next dummy success
+        """
+        self.n_steps = 0
+        if self.ep_id == self.ep_per_log:
+            self.ep_id = 0
+            self.log_id = (self.log_id + 1) % self.num_logs
+        return self.observation_space.sample(), {}
+    def step(self, action):
+        """
+        Step and return a dummy success when an episode is truncated
+        """
+        self.n_steps += 1
+        truncated = self.n_steps >= self.ep_steps
+        info = {}
+        if truncated:
+            maybe_success = self.dummy_success[self.log_id][self.ep_id]
+            info["is_success"] = maybe_success
+            self.ep_id += 1
+        return self.observation_space.sample(), 0.0, False, truncated, info
+def test_rollout_success_rate_on_policy_algorithm(tmp_path):
+    """
+    Test if the rollout/success_rate information is correctly logged with on policy algorithms
+    To do so, create a dummy environment that takes as argument dummy successes (i.e. whether an episode
+    is going to be successful or not).
+    """
+    STATS_WINDOW_SIZE = 10
+    # Add dummy successes with 0.3, 0.5 and 0.8 success_rate of length STATS_WINDOW_SIZE
+    dummy_successes = [
+        [True] * 3 + [False] * 7,
+        [True] * 5 + [False] * 5,
+        [True] * 8 + [False] * 2,
+    ]
+    ep_steps = 64
+    # Monitor the env to track the success info
+    monitor_file = str(tmp_path / "monitor.csv")
+    env = Monitor(DummySuccessEnv(dummy_successes, ep_steps), filename=monitor_file, info_keywords=("is_success",))
+    # Equip the model with a custom logger to check the success_rate info
+    model = PPO("MlpPolicy", env=env, stats_window_size=STATS_WINDOW_SIZE, n_steps=env.steps_per_log, verbose=1)
+    logger = InMemoryLogger()
+    model.set_logger(logger)
+    # Make the model learn and check that the success rate corresponds to the ratio of dummy successes
+    model.learn(total_timesteps=env.ep_per_log * ep_steps, log_interval=1)
+    assert logger.name_to_value["rollout/success_rate"] == 0.3
+    model.learn(total_timesteps=env.ep_per_log * ep_steps, log_interval=1)
+    assert logger.name_to_value["rollout/success_rate"] == 0.5
+    model.learn(total_timesteps=env.ep_per_log * ep_steps, log_interval=1)
+    assert logger.name_to_value["rollout/success_rate"] == 0.8