mani-skill-nightly 2025.4.5.813__py3-none-any.whl → 2025.4.5.2036__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mani_skill/agents/base_real_agent.py +202 -0
- mani_skill/agents/controllers/base_controller.py +13 -2
- mani_skill/agents/controllers/passive_controller.py +2 -0
- mani_skill/agents/controllers/pd_joint_pos.py +2 -0
- mani_skill/agents/controllers/pd_joint_pos_vel.py +2 -0
- mani_skill/agents/controllers/pd_joint_vel.py +2 -0
- mani_skill/agents/robots/__init__.py +2 -0
- mani_skill/agents/robots/koch/__init__.py +1 -0
- mani_skill/agents/robots/koch/koch.py +168 -0
- mani_skill/agents/robots/koch/koch_real.py +5 -0
- mani_skill/agents/robots/so100/__init__.py +1 -0
- mani_skill/agents/robots/so100/so_100.py +118 -0
- mani_skill/agents/robots/so100/so_100_real.py +5 -0
- mani_skill/assets/robots/koch/LICENSE +507 -0
- mani_skill/assets/robots/koch/README.md +8 -0
- mani_skill/assets/robots/koch/follower_arm_v1.1.srdf +9 -0
- mani_skill/assets/robots/koch/follower_arm_v1.1.urdf +635 -0
- mani_skill/assets/robots/koch/meshes/base_link.glb +0 -0
- mani_skill/assets/robots/koch/meshes/base_link.stl +0 -0
- mani_skill/assets/robots/koch/meshes/centered_base_link.stl +0 -0
- mani_skill/assets/robots/koch/meshes/gripper.glb +0 -0
- mani_skill/assets/robots/koch/meshes/gripper.stl +0 -0
- mani_skill/assets/robots/koch/meshes/gripper_collision_part_1.glb +0 -0
- mani_skill/assets/robots/koch/meshes/gripper_collision_part_2.glb +0 -0
- mani_skill/assets/robots/koch/meshes/gripper_collision_part_3.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_1.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_1_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_2.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_2_collision_chassis.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_2_collision_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_2_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_2_rotation_connector.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_2_rotation_connector.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_3.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_collision_chassis_part_1.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_collision_chassis_part_2.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_collision_chassis_part_3.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_collision_chassis_part_4.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_collision_chassis_part_5.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_collision_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_motor.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_part.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_3_part.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_4.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_4_collision_chassis_part_1.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_4_collision_chassis_part_2.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_4_collision_chassis_part_3.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_4_collision_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_4_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_4_part.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_5.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_5_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_5_part.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_6.stl +0 -0
- mani_skill/assets/robots/koch/meshes/link_6_collision_part_2.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_6_collision_part_3.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_6_collision_part_4.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_6_motor.glb +0 -0
- mani_skill/assets/robots/koch/meshes/link_6_part.glb +0 -0
- mani_skill/assets/robots/so100/LICENSE +201 -0
- mani_skill/assets/robots/so100/README.md +10 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Base.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Fixed_Jaw.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Fixed_Jaw_part1.ply +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Fixed_Jaw_part2.ply +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Lower_Arm.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Moving Jaw.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Moving_Jaw_part1.ply +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Moving_Jaw_part2.ply +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Moving_Jaw_part3.ply +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Rotation_Pitch.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Upper_Arm.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/meshes/Wrist_Pitch_Roll.STL +0 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/original.srdf +8 -0
- mani_skill/assets/robots/so100/SO_5DOF_ARM100_8j/original.urdf +470 -0
- mani_skill/envs/sapien_env.py +70 -9
- mani_skill/envs/sim2real_env.py +381 -0
- mani_skill/envs/tasks/digital_twins/base_env.py +74 -74
- mani_skill/envs/tasks/digital_twins/bridge_dataset_eval/base_env.py +6 -0
- mani_skill/envs/tasks/digital_twins/bridge_dataset_eval/put_on_in_scene.py +14 -1
- mani_skill/envs/utils/randomization/__init__.py +1 -0
- mani_skill/envs/utils/randomization/camera.py +60 -0
- mani_skill/examples/demo_robot.py +1 -0
- mani_skill/utils/sapien_utils.py +7 -6
- mani_skill/utils/structs/articulation.py +44 -18
- {mani_skill_nightly-2025.4.5.813.dist-info → mani_skill_nightly-2025.4.5.2036.dist-info}/METADATA +1 -1
- {mani_skill_nightly-2025.4.5.813.dist-info → mani_skill_nightly-2025.4.5.2036.dist-info}/RECORD +91 -19
- {mani_skill_nightly-2025.4.5.813.dist-info → mani_skill_nightly-2025.4.5.2036.dist-info}/LICENSE +0 -0
- {mani_skill_nightly-2025.4.5.813.dist-info → mani_skill_nightly-2025.4.5.2036.dist-info}/WHEEL +0 -0
- {mani_skill_nightly-2025.4.5.813.dist-info → mani_skill_nightly-2025.4.5.2036.dist-info}/top_level.txt +0 -0
mani_skill/envs/sim2real_env.py

```diff
@@ -0,0 +1,381 @@
+import time
+from typing import Any, Callable, Dict, List, Optional
+
+import gymnasium as gym
+import numpy as np
+import torch
+
+from mani_skill.agents.base_real_agent import BaseRealAgent
+from mani_skill.envs.sapien_env import BaseEnv
+from mani_skill.sensors.camera import Camera, CameraConfig
+from mani_skill.utils import common
+from mani_skill.utils.logging_utils import logger
+
+
+class Sim2RealEnv(gym.Env):
+    """
+    Sim2RealEnv is a class that lets you interface with a real robot and align the real robot and environment with a simulation environment. It tries to ensure the action and observation space
+    are the exact same in the real and simulation environments. Any wrappers you apply to the simulation environment are also used in the Sim2RealEnv automatically.
+
+    There are some caveats in which you may need to override this class / write your own code instead:
+
+    - If you use privileged features in the simulation environment like an object's pose then we cannot retrieve those poses in the real environment. You can for example override the `_get_obs_extra` function to compute those values in the real environment via a perception pipeline.
+
+    - While we align controllers and observation shapes/ordering as much as possible, there can still be distribution shifts between the simulation and real environment. These can include vision gaps (sim images looking not like the real world) and sensor biases and noise.
+
+    Args:
+        sim_env (BaseEnv): The simulation environment that the real environment should be aligned with.
+        agent (BaseRealAgent): The real robot agent to control. This must be an object that inherits from BaseRealAgent.
+        obs_mode (str): The observation mode to use.
+        real_reset_function (Optional[Callable[[Sim2RealEnv, Optional[int], Optional[dict]], None]]): The function to call to reset the real robot. By default this is None and we use a default reset function which
+            calls the simulation reset function and resets the agent/robot qpos to whatever the simulation reset function sampled, then prompts the user to press enter before continuing running.
+            This function is given access to the Sim2RealEnv instance, the given seed and options dictionary similar to a standard gym reset function. The default function and example is shown below:
+
+            .. code-block:: python
+
+                def real_reset_function(self, seed=None, options=None):
+                    self.sim_env.reset(seed=seed, options=options)
+                    self.agent.reset(qpos=self.base_sim_env.agent.robot.qpos.cpu().flatten())
+                    input("Press enter if the environment is reset")
+
+        sensor_data_processing_function (Optional[Callable[[Dict], Dict]]): The function to call to process the sensor data returned by the BaseRealAgent.get_sensor_data function.
+            By default this is None and we use a default processing function which does the following for each sensor type:
+            - Camera: Perform a center crop of the real sensor image (rgb or depth) to have the same aspect ratio as the simulation sensor image. Then resize the image to the simulation sensor image shape using cv2.resize
+    """
+
+    metadata = {"render_modes": ["human", "rgb_array", "sensors", "all"]}
+
+    def __init__(
+        self,
+        sim_env: BaseEnv,
+        agent: BaseRealAgent,
+        obs_mode: str = "rgb",
+        real_reset_function: Optional[
+            Callable[["Sim2RealEnv", Optional[int], Optional[dict]], None]
+        ] = None,
+        sensor_data_processing_function: Optional[Callable[[Dict], Dict]] = None,
+        # obs_mode: Optional[str] = None,
+        reward_mode: Optional[str] = "none",
+        # control_mode: Optional[str] = None,
+        render_mode: Optional[str] = "sensors",
+        # robot_uids: BaseRealAgent = None,
+    ):
+        self.sim_env = sim_env
+        self.num_envs = 1
+        assert (
+            self.sim_env.backend.sim_backend == "physx_cpu"
+        ), "For the Sim2RealEnv we expect the simulation to be using the physx_cpu simulation backend currently in order to correctly align the robot"
+
+        # copy over some sim parameters/settings
+        self.device = self.sim_env.backend.device
+        self.sim_freq = self.sim_env.sim_freq
+        self.control_freq = self.sim_env.control_freq
+
+        # control timing
+        self.control_dt = 1 / self.control_freq
+        self.last_control_time: Optional[float] = None
+
+        self.base_sim_env: BaseEnv = sim_env.unwrapped
+        """the unwrapped simulation environment"""
+
+        self._reward_mode = reward_mode
+        self._obs_mode = obs_mode
+        self.reward_mode = reward_mode
+        self.obs_mode = obs_mode
+        self.obs_mode_struct = self.base_sim_env.obs_mode_struct
+        self.render_mode = render_mode
+
+        self._elapsed_steps = torch.zeros((1,), dtype=torch.int32)
+
+        # setup spaces
+        self._orig_single_action_space = self.base_sim_env._orig_single_action_space
+        self.action_space = self.sim_env.action_space
+        self.observation_space = self.sim_env.observation_space
+
+        # setup step and reset functions and handle wrappers for the user
+
+        def default_real_reset_function(self: Sim2RealEnv, seed=None, options=None):
+            self.sim_env.reset(seed=seed, options=options)
+            self.agent.reset(qpos=self.base_sim_env.agent.robot.qpos.cpu().flatten())
+            input("Press enter if the environment is reset")
+
+        self.real_reset_function = real_reset_function or default_real_reset_function
+
+        class RealEnvStepReset(gym.Env):
+            def step(dummy_self, action):
+                ret = self.base_sim_env.__class__.step(self, action)
+                return ret
+
+            def render(dummy_self):
+                return self.render()
+
+            def reset(dummy_self, seed=None, options=None):
+                # TODO: reset controller/agent
+                return self.get_obs(), {"reconfigure": False}
+
+            @property
+            def unwrapped(dummy_self):
+                # reference the Sim2RealEnv instance
+                return self
+
+        cur_env = self.sim_env
+        wrappers: List[gym.Wrapper] = []
+        while isinstance(cur_env, gym.Wrapper):
+            wrappers.append(cur_env)
+            cur_env = cur_env.env
+
+        self._handle_wrappers = len(wrappers) > 0
+        if self._handle_wrappers:
+            self._first_wrapper = wrappers[0]
+            self._last_wrapper = wrappers[-1]
+
+        self._env_with_real_step_reset = RealEnvStepReset()
+        """a simple object that defines the real step/reset functions for gym wrappers to call and use."""
+
+        self._sensor_names = list(self.base_sim_env.scene.sensors.keys())
+        """list of sensors the simulation environment uses"""
+
+        # setup the real agent based on the simulation agent
+        self.agent = agent
+        self.agent._sim_agent = self.base_sim_env.agent
+        # TODO create real controller class based on sim one?? Or can we just fake the data
+        self.agent._sim_agent.controller.qpos
+
+        self.sensor_data_processing_function = sensor_data_processing_function
+
+        # automatically try and generate a visual observation processing function to align a real camera with the simulated camera
+        if sensor_data_processing_function is None:
+            camera_sensor_names = [
+                name
+                for name in self._sensor_names
+                if isinstance(self.base_sim_env.scene.sensors[name], Camera)
+            ]
+
+            def sensor_data_processing_function(sensor_data: Dict):
+                import cv2
+
+                for sensor_name in camera_sensor_names:
+                    sim_sensor_cfg = self.base_sim_env._sensor_configs[sensor_name]
+                    assert isinstance(sim_sensor_cfg, CameraConfig)
+                    target_h, target_w = sim_sensor_cfg.height, sim_sensor_cfg.width
+                    real_sensor_data = sensor_data[sensor_name]
+
+                    # crop to same aspect ratio
+                    for key in ["rgb", "depth"]:
+                        if key in real_sensor_data:
+                            img = real_sensor_data[key][0].numpy()
+                            xy_res = img.shape[:2]
+                            crop_res = np.min(xy_res)
+                            cutoff = (np.max(xy_res) - crop_res) // 2
+                            if xy_res[0] == xy_res[1]:
+                                pass
+                            elif np.argmax(xy_res) == 0:
+                                img = img[cutoff:-cutoff, :, :]
+                            else:
+                                img = img[:, cutoff:-cutoff, :]
+                            real_sensor_data[key] = common.to_tensor(
+                                cv2.resize(img, (target_w, target_h))
+                            ).unsqueeze(0)
+
+                    sensor_data[sensor_name] = real_sensor_data
+                return sensor_data
+
+            self.sensor_data_processing_function = sensor_data_processing_function
+
+        sample_sim_obs, _ = self.sim_env.reset()
+        sample_real_obs, _ = self.reset()
+
+        # perform checks to avoid errors in alignments
+        self._check_observations(sample_sim_obs, sample_real_obs)
+
+    @property
+    def elapsed_steps(self):
+        return self._elapsed_steps
+
+    def _step_action(self, action):
+        """Re-implementation of the simulated BaseEnv._step_action function for real environments. This uses the simulation agent's
+        controller to compute the joint targets/velocities without stepping the simulator"""
+        action = common.to_tensor(action)
+        if action.shape == self._orig_single_action_space.shape:
+            action = common.batch(action)
+        # NOTE (stao): this won't work for interpolated target joint position control methods at the moment
+        self.base_sim_env.agent.set_action(action)
+
+        # to best ensure whatever signals we send to the simulator robot we also send to the real robot we directly inspect
+        # what drive targets the simulator controller sends and what was set by that controller on the simulated robot
+        sim_articulation = self.agent.controller.articulation
+        if self.last_control_time is None:
+            self.last_control_time = time.perf_counter()
+        else:
+            dt = time.perf_counter() - self.last_control_time
+            if dt < self.control_dt:
+                time.sleep(self.control_dt - dt)
+            else:
+                logger.warning(
+                    f"Control dt {self.control_dt} was not reached, actual dt was {dt}"
+                )
+            self.last_control_time = time.perf_counter()
+        if self.agent.controller.sets_target_qpos:
+            self.agent.set_target_qpos(sim_articulation.drive_targets)
+        if self.agent.controller.sets_target_qvel:
+            self.agent.set_target_qvel(sim_articulation.drive_velocities)
+
+    def step(self, action):
+        """
+        In order to make users able to use most gym environment wrappers without having to write extra code for the real environment
+        we temporarily swap the last wrapper's .env property with the RealEnvStepReset environment that has the real step/reset functions
+        """
+        if self._handle_wrappers:
+            orig_env = self._last_wrapper.env
+            self._last_wrapper.env = self._env_with_real_step_reset
+            ret = self._first_wrapper.step(action)
+            self._last_wrapper.env = orig_env
+        else:
+            ret = self._env_with_real_step_reset.step(action)
+        # ensure sim agent qpos is synced
+        self.base_sim_env.agent.robot.set_qpos(self.agent.robot.qpos)
+        return ret
+
+    def reset(self, seed=None, options=None):
+        self.real_reset_function(self, seed, options)
+        if self._handle_wrappers:
+            orig_env = self._last_wrapper.env
+            self._last_wrapper.env = self._env_with_real_step_reset
+            ret = self._first_wrapper.reset(seed=seed, options=options)
+            self._last_wrapper.env = orig_env
+        else:
+            ret = self._env_with_real_step_reset.reset(seed, options)
+        # sets sim to whatever the real agent reset to in order to sync them. Some controllers use the agent's
+        # current qpos and as this is the sim controller we copy the real world agent qpos so it behaves the same
+        # moreover some properties of the robot like forward kinematic computed poses are done through the simulated robot and so qpos has to be up to date
+        self.base_sim_env.agent.robot.set_qpos(self.agent.robot.qpos)
+        self.agent.controller.reset()
+        return ret
+
+    # -------------------------------------------------------------------------- #
+    # reimplementations of simulation BaseEnv observation related functions
+    # -------------------------------------------------------------------------- #
+    def get_obs(self, info=None, unflattened=False):
+        # uses the original environment's get_obs function. Override this only if you want complete control over the returned observations before any wrappers are applied.
+        return self.base_sim_env.__class__.get_obs(self, info, unflattened)
+
+    def _flatten_raw_obs(self, obs: Any):
+        return self.base_sim_env.__class__._flatten_raw_obs(self, obs)
+
+    def _get_obs_agent(self):
+        # using the original user implemented sim env's _get_obs_agent function in case they modify it e.g. to remove qvel values as they might be too noisy
+        return self.base_sim_env.__class__._get_obs_agent(self)
+
+    def _get_obs_extra(self, info: Dict):
+        # using the original user implemented sim env's _get_obs_extra function in case they modify it e.g. to include engineered features like the tcp_pose of the robot
+        try:
+            return self.base_sim_env.__class__._get_obs_extra(self, info)
+        except:
+            # Print the original error
+            import traceback
+
+            print(f"Error in _get_obs_extra: {traceback.format_exc()}")
+
+            # Print another message
+            print(
+                "If there is an error above a common cause is that the _get_obs_extra function defined in the simulation environment is using information not available in the real environment or real agent."
+                "In this case you can override the _get_obs_extra function in the Sim2RealEnv class to compute the desired information in the real environment via a e.g., perception pipeline."
+            )
+            exit(-1)
+
+    def _get_obs_sensor_data(self, apply_texture_transforms: bool = True):
+        # note apply_texture_transforms is not used for real envs, data is expected to already be transformed to standard texture names, types, and shapes.
+        self.agent.capture_sensor_data(self._sensor_names)
+        data = self.agent.get_sensor_data(self._sensor_names)
+        # observation data needs to be processed to be the same shape in simulation
+        # default strategy is to do a center crop to the same shape as simulation and then resize image to the same shape as simulation
+        data = self.sensor_data_processing_function(data)
+        return data
+
+    def _get_obs_with_sensor_data(
+        self, info: Dict, apply_texture_transforms: bool = True
+    ) -> dict:
+        """Get the observation with sensor data"""
+        return self.base_sim_env.__class__._get_obs_with_sensor_data(
+            self, info, apply_texture_transforms
+        )
+
+    def get_sensor_params(self):
+        return self.agent.get_sensor_params(self._sensor_names)
+
+    def get_info(self):
+        info = dict(elapsed_steps=self._elapsed_steps)
+        return info
+
+    # -------------------------------------------------------------------------- #
+    # reimplementations of simulation BaseEnv render related functions.
+    # -------------------------------------------------------------------------- #
+    def render(self):
+        return self.base_sim_env.__class__.render(self)
+
+    def render_sensors(self):
+        return self.base_sim_env.__class__.render_sensors(self)
+
+    def get_sensor_images(self):
+        # used by render_sensors
+        obs = self._get_obs_sensor_data()
+        sensor_images = dict()
+        for name, sensor in self.base_sim_env.scene.sensors.items():
+            if isinstance(sensor, Camera):
+                sensor_images[name] = sensor.get_images(obs[name])
+        return sensor_images
+
+    # -------------------------------------------------------------------------- #
+    # reimplementations of simulation BaseEnv reward related functions. By default you can leave this alone but if you do want to
+    # support computing rewards in the real world you can override these functions.
+    # -------------------------------------------------------------------------- #
+    def get_reward(self, obs, action, info):
+        return self.base_sim_env.__class__.get_reward(self, obs, action, info)
+
+    def compute_sparse_reward(self, obs: Any, action: torch.Tensor, info: Dict):
+        """
+        Computes the sparse reward. By default this function tries to use the success/fail information in
+        returned by the evaluate function and gives +1 if success, -1 if fail, 0 otherwise"""
+        return self.base_sim_env.__class__.compute_sparse_reward(
+            self, obs, action, info
+        )
+
+    def compute_dense_reward(self, obs: Any, action: torch.Tensor, info: Dict):
+        raise NotImplementedError()
+
+    def compute_normalized_dense_reward(
+        self, obs: Any, action: torch.Tensor, info: Dict
+    ):
+        raise NotImplementedError()
+
+    # -------------------------------------------------------------------------- #
+    # various checks
+    # -------------------------------------------------------------------------- #
+    def _check_observations(self, sample_sim_obs, sample_real_obs):
+        """checks if the visual observations are aligned in terms of shape and resolution and expected data types"""
+
+        # recursive check if the data is all the same shape
+        def check_observation_match(sim_obs, real_obs, path=[]):
+            """Recursively check if observations match in shape and dtype"""
+            if isinstance(sim_obs, dict):
+                for key in sim_obs.keys():
+                    if key not in real_obs:
+                        raise KeyError(
+                            f"Key obs[\"{'.'.join(path + [key])}]\"] found in simulation observation but not in real observation"
+                        )
+                    check_observation_match(
+                        sim_obs[key], real_obs[key], path=path + [key]
+                    )
+            else:
+                assert (
+                    sim_obs.shape == real_obs.shape
+                ), f"Shape mismatch: obs[\"{'.'.join(path)}\"]: {sim_obs.shape} vs {real_obs.shape}"
+                assert (
+                    sim_obs.dtype == real_obs.dtype
+                ), f"Dtype mismatch: obs[\"{'.'.join(path)}\"]: {sim_obs.dtype} vs {real_obs.dtype}"
+
+        # Call the recursive function to check observations
+        check_observation_match(sample_sim_obs, sample_real_obs)
+
+    def close(self):
+        self.agent.stop()
```
mani_skill/envs/tasks/digital_twins/base_env.py

```diff
@@ -1,16 +1,15 @@
 import os
-from typing import Dict, List
+from typing import Dict, List, Union
 
 import cv2
-import gymnasium as gym
-import numpy as np
-import sapien.physx as physx
 import torch
 
-from mani_skill import ASSET_DIR
 from mani_skill.envs.sapien_env import BaseEnv
 from mani_skill.sensors.camera import CameraConfig
 from mani_skill.utils import common, sapien_utils
+from mani_skill.utils.structs.actor import Actor
+from mani_skill.utils.structs.articulation import Articulation
+from mani_skill.utils.structs.link import Link
 from mani_skill.utils.structs.types import SimConfig
 
 
@@ -20,33 +19,45 @@ class BaseDigitalTwinEnv(BaseEnv):
     This is based on the [SIMPLER](https://simpler-env.github.io/) and currently has the following tricks for
     making accurate simulated environments of real world datasets
 
-    Greenscreening: Add a greenscreened real image to the background to make the images more realistic and
-    of real world data.
+    Greenscreening: Add a greenscreened real image to the background to make the images more realistic and closer to the distribution
+    of real world data. To use the functionality in your own custom task you can do the following:
 
-
-
+    .. code-block:: python
+
+        class MyTask(BaseDigitalTwinEnv):
+            def __init__(self, **kwargs):
+                self.rgb_overlay_paths = {"camera_name": "path/to/greenscreen/image.png"}
+                super().__init__(**kwargs)
+            def _load_scene(self, options: dict):
+                # load your objects as usual e.g. a cube at self.cube
+
+                # exclude the robot and cube from the greenscreen process
+                self.remove_object_from_greenscreen(self.robot)
+                self.remove_object_from_greenscreen(self.cube)
+
+
+    Use `self.remove_object_from_greenscreen(object: Actor | Link | Articulation)` to exclude those objects from the greenscreen process.
     """
 
     rgb_overlay_paths: Dict[str, str] = None
     """dict mapping camera name to the file path of the greenscreening image"""
     _rgb_overlay_images: Dict[str, torch.Tensor] = dict()
-
-
-
-
-
-
+    """dict mapping camera name to the image torch tensor"""
+    rgb_overlay_mode: str = "background"
+    """which RGB overlay mode to use during the greenscreen process. The default is 'background' which enables greenscreening like normal. The other option is 'debug' mode which
+    will make the opacity of the original render and greenscreen overlay both 50%. The third option is "none" which will not perform any greenscreening."""
+
+    _objects_to_remove_from_greenscreen: List[Union[Actor, Link]] = []
+    """list of articulations/actors/links that should be removed from the greenscreen process"""
+    _segmentation_ids_to_keep: torch.Tensor = None
+    """torch tensor of segmentation ids that reference the objects that should not be greenscreened"""
 
     def __init__(self, **kwargs):
         # Load the "greenscreen" image, which is used to overlay the background portions of simulation observation
         if self.rgb_overlay_paths is not None:
             for camera_name, path in self.rgb_overlay_paths.items():
                 if not os.path.exists(path):
-                    raise FileNotFoundError(
-                        f"rgb_overlay_path {path} is not found."
-                        "If you installed this repo through 'pip install .' , "
-                        "you can download this directory https://github.com/simpler-env/ManiSkill2_real2sim/tree/main/data to get the real-world image overlay assets. "
-                    )
+                    raise FileNotFoundError(f"rgb_overlay_path {path} is not found.")
                 self._rgb_overlay_images[camera_name] = cv2.cvtColor(
                     cv2.imread(path), cv2.COLOR_BGR2RGB
                 )  # (H, W, 3); float32
@@ -69,28 +80,29 @@ class BaseDigitalTwinEnv(BaseEnv):
     def _load_scene(self, options: dict):
         """
        Load assets for a digital twin scene in
-
        """
 
-    def
-
-
-        for
-
-
-
-
-
-            target_object_actor_ids, dtype=torch.int16, device=self.device
-        )
-        # get the robot link ids
-        robot_links = self.agent.robot.get_links()
-        self.robot_link_ids = torch.tensor(
-            [x._objs[0].entity.per_scene_id for x in robot_links],
-            dtype=torch.int16,
-            device=self.device,
-        )
+    def remove_object_from_greenscreen(self, object: Union[Articulation, Actor, Link]):
+        """remove an actor/articulation/link from the greenscreen process"""
+        if isinstance(object, Articulation):
+            for link in object.get_links():
+                self._objects_to_remove_from_greenscreen.append(link)
+        elif isinstance(object, Actor):
+            self._objects_to_remove_from_greenscreen.append(object)
+        elif isinstance(object, Link):
+            self._objects_to_remove_from_greenscreen.append(object)
 
+    def _after_reconfigure(self, options: dict):
+        super()._after_reconfigure(options)
+        # after reconfiguration in CPU/GPU sim we have initialized all ids of objects in the scene.
+        # and can now get the list of segmentation ids to keep
+        per_scene_ids = []
+        for object in self._objects_to_remove_from_greenscreen:
+            per_scene_ids.append(object.per_scene_id)
+        self._segmentation_ids_to_keep = torch.unique(torch.concatenate(per_scene_ids))
+        self._objects_to_remove_from_greenscreen = []
+
+        # load the overlay images
         for camera_name in self.rgb_overlay_paths.keys():
             sensor = self._sensor_configs[camera_name]
             if isinstance(sensor, CameraConfig):
@@ -106,47 +118,35 @@ class BaseDigitalTwinEnv(BaseEnv):
     def _green_sceen_rgb(self, rgb, segmentation, overlay_img):
         """returns green screened RGB data given a batch of RGB and segmentation images and one overlay image"""
         actor_seg = segmentation[..., 0]
-        mask = torch.ones_like(actor_seg, device=actor_seg.device)
-        if
-
-        # but for most users who use the sapien_cuda render backend image data will be on the GPU.
-            self.robot_link_ids = self.robot_link_ids.to(actor_seg.device)
-            self.target_object_actor_ids = self.target_object_actor_ids.to(
+        mask = torch.ones_like(actor_seg, device=actor_seg.device, dtype=torch.bool)
+        if self._segmentation_ids_to_keep.device != actor_seg.device:
+            self._segmentation_ids_to_keep = self._segmentation_ids_to_keep.to(
                 actor_seg.device
             )
-        if
-
-
-
-
-
-
-
-                torch.isin(
-                    actor_seg,
-                    torch.concatenate(
-                        [self.robot_link_ids, self.target_object_actor_ids]
-                    ),
-                )
-            ] = 0
-        else:
-            # overlay everything except the robot links
-            mask[np.isin(actor_seg, self.robot_link_ids)] = 0.0
-        else:
-            raise NotImplementedError(self.rgb_overlay_mode)
+        if self.rgb_overlay_mode == "background":
+            # only overlay the background and keep the foregrounds (robot and target objects) rendered in simulation
+            mask[
+                torch.isin(
+                    actor_seg,
+                    self._segmentation_ids_to_keep,
+                )
+            ] = 0
         mask = mask[..., None]
 
         # perform overlay on the RGB observation image
         if "debug" not in self.rgb_overlay_mode:
-            rgb = rgb * (
+            rgb = rgb * (~mask) + overlay_img * mask
         else:
            rgb = rgb * 0.5 + overlay_img * 0.5
+        rgb = rgb.to(torch.uint8)
         return rgb
 
-    def
-        obs = super().
+    def _get_obs_sensor_data(self, apply_texture_transforms: bool = True):
+        obs = super()._get_obs_sensor_data(apply_texture_transforms)
 
         # "greenscreen" process
+        if self.rgb_overlay_mode == "none":
+            return obs
        if (
            self.obs_mode_struct.visual.rgb
            and self.obs_mode_struct.visual.segmentation
@@ -156,20 +156,20 @@ class BaseDigitalTwinEnv(BaseEnv):
             for camera_name in self._rgb_overlay_images.keys():
                 # obtain overlay mask based on segmentation info
                 assert (
-                    "segmentation" in obs[
+                    "segmentation" in obs[camera_name].keys()
                 ), "Image overlay requires segment info in the observation!"
                 if (
                     self._rgb_overlay_images[camera_name].device
-                    != obs[
+                    != obs[camera_name]["rgb"].device
                 ):
                     self._rgb_overlay_images[camera_name] = self._rgb_overlay_images[
                         camera_name
-                    ].to(obs[
+                    ].to(obs[camera_name]["rgb"].device)
                 overlay_img = self._rgb_overlay_images[camera_name]
                 green_screened_rgb = self._green_sceen_rgb(
-                    obs[
-                    obs[
+                    obs[camera_name]["rgb"],
+                    obs[camera_name]["segmentation"],
                     overlay_img,
                 )
-                obs[
+                obs[camera_name]["rgb"] = green_screened_rgb
         return obs
```
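
The refactored `_green_sceen_rgb` above now keys the overlay purely off `_segmentation_ids_to_keep` and a boolean mask. A self-contained sketch of that compositing step follows, with made-up shapes and segmentation ids purely for illustration:

```python
import torch

rgb = torch.randint(0, 255, (1, 64, 64, 3), dtype=torch.uint8)       # simulated render (batch of 1)
overlay_img = torch.randint(0, 255, (64, 64, 3), dtype=torch.uint8)  # real "greenscreen" photo
actor_seg = torch.zeros((1, 64, 64), dtype=torch.int16)              # per-pixel per_scene_id
actor_seg[:, 20:40, 20:40] = 5                                       # pretend id 5 is the robot

segmentation_ids_to_keep = torch.tensor([5], dtype=torch.int16)      # objects to keep rendered

# mask is True where the real overlay should replace the simulated pixel
mask = torch.ones_like(actor_seg, dtype=torch.bool)
mask[torch.isin(actor_seg, segmentation_ids_to_keep)] = 0
mask = mask[..., None]                                               # broadcast over RGB channels

composited = (rgb * (~mask) + overlay_img * mask).to(torch.uint8)
```

Pixels whose segmentation id is in the keep-list retain the simulated render; everything else is replaced by the real overlay image (or blended 50/50 when `rgb_overlay_mode` contains "debug", or left untouched when it is "none").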
mani_skill/envs/tasks/digital_twins/bridge_dataset_eval/base_env.py

```diff
@@ -158,6 +158,8 @@ class BaseBridgeEnv(BaseDigitalTwinEnv):
     SUPPORTED_OBS_MODES = ["rgb+segmentation"]
     SUPPORTED_REWARD_MODES = ["none"]
     scene_setting: Literal["flat_table", "sink"] = "flat_table"
+    objects_excluded_from_greenscreening: List[str] = []
+    """object ids that should not be greenscreened"""
 
     obj_static_friction = 0.5
     obj_dynamic_friction = 0.5
@@ -344,6 +346,10 @@ class BaseBridgeEnv(BaseDigitalTwinEnv):
                 raise ValueError(f"Model {model_id} does not have bbox info.")
         self.episode_model_bbox_sizes = model_bbox_sizes
 
+        for obj_name in self.objects_excluded_from_greenscreening:
+            self.remove_object_from_greenscreen(self.objs[obj_name])
+        self.remove_object_from_greenscreen(self.agent.robot)
+
     def _initialize_episode(self, env_idx: torch.Tensor, options: dict):
         # NOTE: this part of code is not GPU parallelized
         with torch.device(self.device):
```
mani_skill/envs/tasks/digital_twins/bridge_dataset_eval/put_on_in_scene.py

```diff
@@ -16,6 +16,10 @@ from mani_skill.utils.registration import register_env
 )
 class PutCarrotOnPlateInScene(BaseBridgeEnv):
     scene_setting = "flat_table"
+    objects_excluded_from_greenscreening = [
+        "bridge_carrot_generated_modified",
+        "bridge_plate_objaverse_larger",
+    ]
 
     def __init__(self, **kwargs):
         xy_center = np.array([-0.16, 0.00])
@@ -74,7 +78,7 @@ class PutCarrotOnPlateInScene(BaseBridgeEnv):
 )
 class PutEggplantInBasketScene(BaseBridgeEnv):
     scene_setting = "sink"
-
+    objects_excluded_from_greenscreening = ["eggplant"]
 
     def __init__(self, **kwargs):
         source_obj_name = "eggplant"
@@ -154,6 +158,10 @@ class PutEggplantInBasketScene(BaseBridgeEnv):
 )
 class StackGreenCubeOnYellowCubeBakedTexInScene(BaseBridgeEnv):
     MODEL_JSON = "info_bridge_custom_baked_tex_v0.json"
+    objects_excluded_from_greenscreening = [
+        "baked_green_cube_3cm",
+        "baked_yellow_cube_3cm",
+    ]
 
     def __init__(
         self,
@@ -213,6 +221,11 @@ class StackGreenCubeOnYellowCubeBakedTexInScene(BaseBridgeEnv):
     asset_download_ids=["bridge_v2_real2sim"],
 )
 class PutSpoonOnTableClothInScene(BaseBridgeEnv):
+    objects_excluded_from_greenscreening = [
+        "table_cloth_generated_shorter",
+        "bridge_spoon_generated_modified",
+    ]
+
     def __init__(
         self,
         **kwargs,
```
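
Following the same pattern as the tasks above, a custom bridge-style task would declare which of its loaded objects should stay rendered during greenscreening. The class and object names below are illustrative only and not part of this diff:

```python
from typing import List

from mani_skill.envs.tasks.digital_twins.bridge_dataset_eval.base_env import BaseBridgeEnv


class MyPutObjectOnTargetInScene(BaseBridgeEnv):  # illustrative subclass
    scene_setting = "flat_table"
    # names must match keys of self.objs loaded by BaseBridgeEnv;
    # the robot itself is excluded from greenscreening automatically by the base class
    objects_excluded_from_greenscreening: List[str] = [
        "my_source_object",
        "my_target_object",
    ]
```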