egogym 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- baselines/pi_policy.py +110 -0
- baselines/rum/__init__.py +1 -0
- baselines/rum/loss_fns/__init__.py +37 -0
- baselines/rum/loss_fns/abstract_loss_fn.py +13 -0
- baselines/rum/loss_fns/diffusion_policy_loss_fn.py +114 -0
- baselines/rum/loss_fns/rvq_loss_fn.py +104 -0
- baselines/rum/loss_fns/vqbet_loss_fn.py +202 -0
- baselines/rum/models/__init__.py +1 -0
- baselines/rum/models/bet/__init__.py +3 -0
- baselines/rum/models/bet/bet.py +347 -0
- baselines/rum/models/bet/gpt.py +277 -0
- baselines/rum/models/bet/tokenized_bet.py +454 -0
- baselines/rum/models/bet/utils.py +124 -0
- baselines/rum/models/bet/vqbet.py +410 -0
- baselines/rum/models/bet/vqvae/__init__.py +3 -0
- baselines/rum/models/bet/vqvae/residual_vq.py +346 -0
- baselines/rum/models/bet/vqvae/vector_quantize_pytorch.py +1194 -0
- baselines/rum/models/bet/vqvae/vqvae.py +313 -0
- baselines/rum/models/bet/vqvae/vqvae_utils.py +30 -0
- baselines/rum/models/custom.py +33 -0
- baselines/rum/models/encoders/__init__.py +0 -0
- baselines/rum/models/encoders/abstract_base_encoder.py +70 -0
- baselines/rum/models/encoders/identity.py +45 -0
- baselines/rum/models/encoders/timm_encoders.py +82 -0
- baselines/rum/models/policies/diffusion_policy.py +881 -0
- baselines/rum/models/policies/open_loop.py +122 -0
- baselines/rum/models/policies/simple_open_loop.py +108 -0
- baselines/rum/molmo/server.py +144 -0
- baselines/rum/policy.py +293 -0
- baselines/rum/utils/__init__.py +212 -0
- baselines/rum/utils/action_transforms.py +22 -0
- baselines/rum/utils/decord_transforms.py +135 -0
- baselines/rum/utils/rpc.py +249 -0
- baselines/rum/utils/schedulers.py +71 -0
- baselines/rum/utils/trajectory_vis.py +128 -0
- baselines/rum/utils/zmq_utils.py +281 -0
- baselines/rum_policy.py +108 -0
- egogym/__init__.py +8 -0
- egogym/assets/constants.py +1804 -0
- egogym/components/__init__.py +1 -0
- egogym/components/object.py +94 -0
- egogym/egogym.py +106 -0
- egogym/embodiments/__init__.py +10 -0
- egogym/embodiments/arms/__init__.py +4 -0
- egogym/embodiments/arms/arm.py +65 -0
- egogym/embodiments/arms/droid.py +49 -0
- egogym/embodiments/grippers/__init__.py +4 -0
- egogym/embodiments/grippers/floating_gripper.py +58 -0
- egogym/embodiments/grippers/rum.py +6 -0
- egogym/embodiments/robot.py +95 -0
- egogym/evaluate.py +216 -0
- egogym/managers/__init__.py +2 -0
- egogym/managers/objects_managers.py +30 -0
- egogym/managers/textures_manager.py +21 -0
- egogym/misc/molmo_client.py +49 -0
- egogym/misc/molmo_server.py +197 -0
- egogym/policies/__init__.py +1 -0
- egogym/policies/base_policy.py +13 -0
- egogym/scripts/analayze.py +834 -0
- egogym/scripts/plot.py +87 -0
- egogym/scripts/plot_correlation.py +392 -0
- egogym/scripts/plot_correlation_hardcoded.py +338 -0
- egogym/scripts/plot_failure.py +248 -0
- egogym/scripts/plot_failure_hardcoded.py +195 -0
- egogym/scripts/plot_failure_vlm.py +257 -0
- egogym/scripts/plot_failure_vlm_hardcoded.py +177 -0
- egogym/scripts/plot_line.py +303 -0
- egogym/scripts/plot_line_hardcoded.py +285 -0
- egogym/scripts/plot_pi0_bars.py +169 -0
- egogym/tasks/close.py +84 -0
- egogym/tasks/open.py +85 -0
- egogym/tasks/pick.py +121 -0
- egogym/utils.py +969 -0
- egogym/wrappers/__init__.py +20 -0
- egogym/wrappers/episode_monitor.py +282 -0
- egogym/wrappers/unprivileged_chatgpt.py +163 -0
- egogym/wrappers/unprivileged_gemini.py +157 -0
- egogym/wrappers/unprivileged_molmo.py +88 -0
- egogym/wrappers/unprivileged_moondream.py +121 -0
- egogym-0.1.0.dist-info/METADATA +52 -0
- egogym-0.1.0.dist-info/RECORD +83 -0
- egogym-0.1.0.dist-info/WHEEL +5 -0
- egogym-0.1.0.dist-info/top_level.txt +2 -0

baselines/rum/models/policies/open_loop.py
ADDED

@@ -0,0 +1,122 @@
import torch
import torch.nn as nn
import torchvision.transforms as T
from tqdm import tqdm


class OpenLoopReplay(nn.Module):
    def __init__(self, encoder, k=5, enc_weight_pth=None, use_vinn=False, cfg=None):
        super().__init__()
        self.encoder = encoder
        self.cfg = cfg
        self.use_vinn = use_vinn

        self.k = k

        if enc_weight_pth is not None:
            self.encoder.load_state_dict(
                torch.load(enc_weight_pth, map_location="cpu")["model"]
            )

        self.representations = None
        self.actions = None
        self.imgs = None
        softmax = nn.Softmax(dim=1)
        self.dist_scale_func = lambda x: (softmax(-x))
        self.encoder.eval()
        self.device = "cpu"
        self.encoder.to(self.device)
        self.img_transform = T.Resize((256, 256), antialias=True)

        self.open_loop = False
        self.idx = 0

    def to(self, device):
        self.device = device
        self.encoder.to(device)

        return super().to(device)

    def set_dataset(self, dataloader):
        self.train_dataset = dataloader.dataset
        if self.use_vinn:
            for i, (image, label) in tqdm(enumerate(dataloader)):
                image = image.float() / 255.0
                image = image.to(self.device)
                label = torch.Tensor(label).to("cpu").detach().squeeze()

                x = (image, label)
                representation = self.encoder(x).to("cpu").detach().squeeze(dim=1)
                if self.representations is None:
                    self.representations = representation
                    self.actions = label
                    image = image.to("cpu").detach().numpy()
                    self.imgs = list(image)
                else:
                    self.representations = torch.cat(
                        (self.representations, representation), 0
                    )
                    self.actions = torch.cat((self.actions, label), 0)
                    image = image.to("cpu").detach().numpy()
                    self.imgs.extend(list(image))

    def step(self, img, **kwargs):
        logs = {}
        print(self.idx)
        print(len(self.train_dataset))
        if self.use_vinn:
            normalized_image = self.img_transform(img[0].squeeze(0))
            if not self.open_loop:
                self.encoder.eval()
                with torch.no_grad():
                    act, indices = self(img, return_indices=True)
                    act = act.squeeze().detach()
                    act[:-1] = 0
                    act[-1] = 1

                    self.neighbor_1_idx = indices[0][0]

                action_tensor = torch.zeros(7)
                action_tensor[-1] = 1
                self.open_loop = True
                return action_tensor, logs
            else:
                _, action = self.train_dataset[self.neighbor_1_idx + self.idx]
                action_tensor = torch.tensor(action).squeeze()
                self.idx += 1
                return action_tensor, logs
        else:
            _, action = self.train_dataset[self.idx]
            action_tensor = torch.tensor(action).squeeze()
            self.idx += 1
            return action_tensor, logs

    def __call__(self, batch_images, k=None, return_indices=False):
        if k is None:
            k = self.k

        all_distances = torch.zeros(
            (batch_images[0].shape[0], self.representations.shape[0])
        )

        batch_rep = self.encoder(batch_images).squeeze(dim=1).detach().to(self.device)
        dat_rep = self.representations.to(self.device)
        all_distances = torch.cdist(batch_rep, dat_rep).to("cpu")

        top_k_distances, indices = torch.topk(all_distances, k, dim=1, largest=False)
        top_k_actions = self.actions[indices].to(self.device)

        weights = self.dist_scale_func(top_k_distances).to(self.device)

        pred = torch.sum(
            top_k_actions * weights.unsqueeze(-1), dim=1
        )  # weighted average

        if return_indices:
            return pred, indices

        return pred

    def reset(self):
        self.open_loop = False
        self.idx = 0
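For readers skimming the diff: `OpenLoopReplay.__call__` is a softmax-weighted k-nearest-neighbour lookup over precomputed encoder representations (a VINN-style retrieval of actions). The standalone sketch below reproduces just that computation; the helper name `knn_weighted_action` and the toy shapes are illustrative and not part of the package.

import torch

def knn_weighted_action(query_rep, dataset_reps, dataset_actions, k=5):
    # query_rep: (B, D), dataset_reps: (N, D), dataset_actions: (N, A)
    distances = torch.cdist(query_rep, dataset_reps)                 # (B, N)
    top_k_dist, indices = torch.topk(distances, k, dim=1, largest=False)
    weights = torch.softmax(-top_k_dist, dim=1)                      # closer neighbours get larger weight
    top_k_actions = dataset_actions[indices]                         # (B, k, A)
    return torch.sum(top_k_actions * weights.unsqueeze(-1), dim=1)   # weighted average, (B, A)

# Toy usage: 100 stored embeddings with 7-D actions, one query embedding.
reps, acts = torch.randn(100, 512), torch.randn(100, 7)
query = torch.randn(1, 512)
print(knn_weighted_action(query, reps, acts).shape)  # torch.Size([1, 7])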

baselines/rum/models/policies/simple_open_loop.py
ADDED

@@ -0,0 +1,108 @@
import os
import numpy as np
from scipy.spatial.transform import Rotation as R
from quaternion import (
    as_rotation_matrix,
    quaternion,
)
import torch
P = np.array([[-1, 0, 0, 0], [0, 0, -1, 0], [0, -1, 0, 0], [0, 0, 0, 1]])

def apply_permutation_transform(matrix):
    return P @ matrix @ P.T

class SimpleReplay:
    def __init__(self, poses_file_path, timeskip=8):
        self.poses_file_path = poses_file_path
        self.timeskip = timeskip
        self.idx = 0
        self.transforms = None

        self.process_poses()

    def to(self, device):
        pass

    def eval(self):
        pass

    def get_poses(self):
        with open(self.poses_file_path, "r") as f:
            lines = f.readlines()
        poses, timestamps = [], []
        for line in lines:
            line_list = eval(line)
            ts = int(line_list[0].split("<")[1].split(">")[0])
            pose = np.array(line_list[1:])
            timestamps.append(ts)
            poses.append(pose)
        timestamps = np.array(timestamps)
        poses = np.array(poses)

        return poses, timestamps

    def process_poses(self):
        quaternions = []
        translations = []
        init_pose = None

        poses, timestamps = self.get_poses()
        for pose in poses:
            qx, qy, qz, qw, tx, ty, tz = pose
            ext_matrix = np.eye(4)
            ext_matrix[:3, :3] = as_rotation_matrix(quaternion(qw, qx, qy, qz))
            ext_matrix[:3, 3] = tx, ty, tz

            if init_pose is None:
                init_pose = np.copy(ext_matrix)
            relative_pose = np.linalg.inv(init_pose) @ ext_matrix

            relative_pose = apply_permutation_transform(relative_pose)
            translations.append(relative_pose[:3, -1])
            quaternions.append(
                R.from_matrix(relative_pose[:3, :3]).as_quat()
            )
        quats = np.array(quaternions)
        translations = np.array(translations)
        transforms = np.concatenate([translations, quats], axis=1)

        self.transforms = transforms

    def get_action(self, idx):
        prior_translations, prior_rotations = self.transforms[idx, :3], self.transforms[idx, 3:]
        next_translations, next_rotations = self.transforms[idx + self.timeskip, :3], self.transforms[idx + self.timeskip, 3:]
        # Now, create the matrices.
        prior_rot_matrices, next_rot_matrices = (
            R.from_quat(prior_rotations).as_matrix(),
            R.from_quat(next_rotations).as_matrix(),
        )
        # Now, compute the relative matrices.
        prior_matrices = np.eye(4)
        prior_matrices[:3, :3] = prior_rot_matrices
        prior_matrices[:3, 3] = prior_translations

        next_matrices = np.eye(4)
        next_matrices[:3, :3] = next_rot_matrices
        next_matrices[:3, 3] = next_translations

        relative_transforms = np.matmul(np.linalg.inv(prior_matrices), next_matrices)
        relative_translations = relative_transforms[:3, 3]
        relative_rotations = R.from_matrix(relative_transforms[:3, :3]).as_rotvec()

        gripper = 1.0

        return np.concatenate([relative_translations, relative_rotations, [gripper]], dtype=np.float32)

    def step(self, img, step_no):
        start_idx = self.timeskip * step_no
        if start_idx + self.timeskip >= len(self.transforms):
            print("INDEX OUT OF BOUNDS")
            action_tensor = torch.zeros(7)
            action_tensor[-1] = 1
            return action_tensor, {}

        action_tensor = self.get_action(start_idx)
        return torch.tensor(action_tensor), {}

    def reset(self):
        pass
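`SimpleReplay.get_action` turns two absolute poses (`timeskip` frames apart) into a 7-D action: the later pose expressed in the frame of the earlier one, flattened to translation, axis-angle rotation, and a constant gripper value. A minimal standalone sketch of that relative-pose computation follows; the helper and variable names are hypothetical, not the package's API.

import numpy as np
from scipy.spatial.transform import Rotation as R

def relative_action(pose_a, pose_b, gripper=1.0):
    # pose_a, pose_b: 4x4 homogeneous transforms expressed in a common frame.
    rel = np.linalg.inv(pose_a) @ pose_b             # pose_b in the frame of pose_a
    translation = rel[:3, 3]
    rotvec = R.from_matrix(rel[:3, :3]).as_rotvec()  # axis-angle rotation
    return np.concatenate([translation, rotvec, [gripper]], dtype=np.float32)

# Toy usage: pose_b is pose_a shifted 10 cm along x and rotated 30 deg about z.
pose_a = np.eye(4)
pose_b = np.eye(4)
pose_b[:3, :3] = R.from_euler("z", 30, degrees=True).as_matrix()
pose_b[:3, 3] = [0.1, 0.0, 0.0]
print(relative_action(pose_a, pose_b))  # approx [0.1, 0, 0, 0, 0, 0.524, 1]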

baselines/rum/molmo/server.py
ADDED

@@ -0,0 +1,144 @@
import argparse
import asyncio
import json
import logging
import re
import numpy as np
import torch
import websockets
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def extract_molmo_points(molmo_output: str):
    points = []
    for match in re.finditer(
        r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"',
        molmo_output,
    ):
        try:
            p = np.array([float(match.group(1)), float(match.group(2))], dtype=np.float32)
        except ValueError:
            continue

        if np.max(p) > 100:
            continue

        points.append(p / 100.0)

    return points


class Molmo:
    def __init__(self, model_name="allenai/Molmo-7B-D-0924"):
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype="auto",
            device_map="auto",
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype="auto",
            device_map="auto",
        )

    def infer(self, rgb: np.ndarray, prompt: str) -> str:
        image = Image.fromarray(rgb)
        inputs = self.processor.process(images=[image], text=prompt)
        inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}

        with torch.inference_mode():
            output = self.model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
                tokenizer=self.processor.tokenizer,
            )

        gen_tokens = output[0, inputs["input_ids"].size(1):]
        return self.processor.tokenizer.decode(gen_tokens, skip_special_tokens=True)

    def infer_point(self, rgb: np.ndarray, prompt: str) -> np.ndarray:
        text = self.infer(rgb, prompt)
        log.info(f"Molmo output: {text}")
        points = extract_molmo_points(text)
        return points[0] if points else np.array([0.5, 0.5], dtype=np.float32)


class MolmoWebSocketServer:
    def __init__(self, host="0.0.0.0", port=8765):
        self.host = host
        self.port = port
        self.molmo = Molmo()

    async def handle_client(self, websocket):
        client_id = id(websocket)
        log.info(f"Client connected: {client_id}")

        try:
            async for msg in websocket:
                try:
                    req = json.loads(msg)
                    action = req.get("action")

                    if action != "infer_point":
                        raise ValueError("Only 'infer_point' action is supported")

                    rgb = np.array(req["rgb"], dtype=np.uint8)

                    if "object_name" in req:
                        prompt = f"Point to the center of the {req['object_name']}."
                        label = req["object_name"].replace(" ", "_")
                    elif "prompt" in req:
                        prompt = req["prompt"]
                        label = "custom_prompt"
                    else:
                        raise ValueError("Provide either 'object_name' or 'prompt'")

                    point = self.molmo.infer_point(rgb, prompt)

                    resp = {
                        "status": "ok",
                        "action": "infer_point",
                        "point": point.tolist(),
                    }

                except Exception as e:
                    log.exception("Request error")
                    resp = {"status": "error", "message": str(e)}

                await websocket.send(json.dumps(resp))

        except websockets.exceptions.ConnectionClosed:
            log.info(f"Client disconnected: {client_id}")

    async def start(self):
        log.info(f"Starting Molmo server on ws://{self.host}:{self.port}")
        async with websockets.serve(
            self.handle_client,
            self.host,
            self.port,
            max_size=50 * 1024 * 1024,
        ):
            await asyncio.Future()


def main():
    parser = argparse.ArgumentParser("Molmo Pointing Server")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8765)
    args = parser.parse_args()

    server = MolmoWebSocketServer(
        host=args.host,
        port=args.port,
    )
    asyncio.run(server.start())


if __name__ == "__main__":
    main()
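The server above accepts one JSON message per request with `action`, `rgb` (the image serialised as nested lists), and either `object_name` or `prompt`, and replies with a point normalised to [0, 1]. The package ships its own client (`egogym/misc/molmo_client.py`, not shown in this excerpt); the sketch below is only an illustrative minimal client written against the schema visible in `handle_client`.

import asyncio
import json

import numpy as np
import websockets

async def query_point(uri="ws://localhost:8765", object_name="mug"):
    rgb = np.zeros((224, 224, 3), dtype=np.uint8)  # placeholder image
    request = {
        "action": "infer_point",
        "rgb": rgb.tolist(),          # nested lists, matching np.array(req["rgb"]) on the server
        "object_name": object_name,
    }
    async with websockets.connect(uri, max_size=50 * 1024 * 1024) as ws:
        await ws.send(json.dumps(request))
        return json.loads(await ws.recv())  # e.g. {"status": "ok", "action": "infer_point", "point": [x, y]}

if __name__ == "__main__":
    print(asyncio.run(query_point()))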
baselines/rum/policy.py
ADDED

@@ -0,0 +1,293 @@
import torch
import numpy as np
import cv2
from PIL import Image
from scipy.spatial.transform import Rotation as R
import torchvision.transforms as T
from hydra import initialize, compose
import hydra

P = np.array(
    [[-1, 0, 0, 0], [0, 0, -1, 0], [0, -1, 0, 0], [0, 0, 0, 1]], dtype=np.float32
)

def init_model_loss_fn(cfg):
    device = cfg.device if torch.cuda.is_available() else "cpu"
    model = hydra.utils.instantiate(cfg.model).to(device)
    model_weight_pth = cfg.get("model_weight_pth")

    if model_weight_pth is None:
        raise ValueError("Model weight path is not specified in the config.")

    checkpoint = torch.load(
        model_weight_pth, map_location=device, weights_only=False
    )

    try:
        model.load_state_dict(checkpoint["model"])
    except RuntimeError:
        checkpoint["model"] = {
            k.replace("_orig_mod.", ""): v for k, v in checkpoint["model"].items()
        }
        model.load_state_dict(checkpoint["model"])
    loss_fn = hydra.utils.instantiate(cfg.loss_fn, model=model)
    loss_fn.load_state_dict(checkpoint["loss_fn"])
    loss_fn = loss_fn.to(device)

    return model, loss_fn

class VectorizedBuffer:
    def __init__(self, batch_size, buffer_size, act_dim, device):
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.act_dim = act_dim
        self.device = device
        self.image_buffer = None
        self.goal_buffer = None
        self.action_buffer = None
        self.image_buffers_sizes = torch.zeros(batch_size, device=device)
        self.goal_buffer_size = torch.zeros(batch_size, device=device)
        self.action_buffers_sizes = torch.zeros(batch_size, device=device)

    def add_image(self, new_images):
        if self.image_buffer is None:
            self.image_buffer = (
                new_images.unsqueeze(1)
                .repeat(1, self.buffer_size, 1, 1, 1)
                .to(self.device)
            )
        else:
            for b in range(self.batch_size):
                if self.image_buffers_sizes[b] == 0:
                    self.image_buffer[b] = (
                        new_images[b].unsqueeze(0).repeat(self.buffer_size, 1, 1, 1)
                    )
                else:
                    self.image_buffer[b] = torch.roll(
                        self.image_buffer[b], shifts=-1, dims=0
                    )
                    self.image_buffer[b, -1] = new_images[b]
        self.image_buffers_sizes += 1
        self.image_buffers_sizes = torch.clamp(
            self.image_buffers_sizes, max=self.buffer_size
        )

    def add_goal(self, new_goals):
        if self.goal_buffer is None:
            self.goal_buffer = (
                new_goals.unsqueeze(1).repeat(1, self.buffer_size, 1).to(self.device)
            )
        else:
            for b in range(self.batch_size):
                if self.goal_buffer_size[b] == 0:
                    self.goal_buffer[b] = (
                        new_goals[b].unsqueeze(0).repeat(self.buffer_size, 1)
                    )
                else:
                    self.goal_buffer[b] = torch.roll(
                        self.goal_buffer[b], shifts=-1, dims=0
                    )
                    self.goal_buffer[b, -1] = new_goals[b]
        self.goal_buffer_size += 1
        self.goal_buffer_size = torch.clamp(self.goal_buffer_size, max=self.buffer_size)

    def reset(self, batch_indices):
        self.image_buffers_sizes[batch_indices] = 0
        self.goal_buffer_size[batch_indices] = 0
        self.action_buffers_sizes[batch_indices] = 0
        if self.image_buffer is not None:
            self.image_buffer[batch_indices] = torch.zeros_like(
                self.image_buffer[batch_indices]
            )
        if self.goal_buffer is not None:
            self.goal_buffer[batch_indices] = torch.zeros_like(
                self.goal_buffer[batch_indices]
            )
        if self.action_buffer is not None:
            self.action_buffer[batch_indices] = torch.zeros_like(
                self.action_buffer[batch_indices]
            )

    def add_action(self, new_actions):
        B = new_actions.shape[0]
        if self.action_buffer is None:
            self.action_buffer = torch.zeros(
                B, self.buffer_size - 1, self.act_dim, device=self.device
            )
            self.action_buffers_sizes = torch.zeros(B, device=self.device)
        for b in range(B):
            if self.action_buffers_sizes[b] != 0:
                self.action_buffer[b] = torch.roll(
                    self.action_buffer[b], shifts=-1, dims=0
                )
            self.action_buffer[b, -1] = new_actions[b]
        self.action_buffers_sizes += 1

    def get_input_sequence(self):
        B = self.image_buffer.shape[0]
        if self.action_buffer is None:
            action_buffer = torch.zeros(
                B, self.buffer_size - 1, self.act_dim, device=self.device
            )
        else:
            action_buffer = self.action_buffer
        base_act = torch.zeros(B, 1, self.act_dim, device=self.device)
        act_seq = torch.cat([action_buffer, base_act], dim=1)
        if self.goal_buffer is None:
            goal_seq = None
        else:
            goal_seq = torch.stack([goal for goal in self.goal_buffer]).to(
                dtype=torch.float32
            )
        return self.image_buffer, goal_seq, act_seq


def unwrap_model(model):
    if isinstance(
        model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)
    ):
        return model.module
    return model


class Policy:
    def __init__(self, model_path=None, device="cpu", model=None, loss_fn=None):
        if model_path is None and (model is None or loss_fn is None):
            raise ValueError(
                "Either model_path or both model and loss_fn must be provided."
            )

        if model is None or loss_fn is None:
            with initialize(config_path="configs", version_base=None):
                cfg = compose(config_name="run_vqbet")
            checkpoint = torch.load(model_path, map_location="cpu", weights_only=False)
            ckpt_cfg = checkpoint["cfg"]
            del checkpoint

            cfg.goal_dim = ckpt_cfg["loss_fn"]["goal_dim"]

            cfg.gpt_input_dim = ckpt_cfg["loss_fn"]["gpt_model"]["config"]["input_dim"]
            cfg.loss_fn.gpt_model.config.n_layer = ckpt_cfg["loss_fn"]["gpt_model"][
                "config"
            ]["n_layer"]
            cfg.loss_fn.gpt_model.config.n_head = ckpt_cfg["loss_fn"]["gpt_model"][
                "config"
            ]["n_head"]
            cfg.loss_fn.gpt_model.config.n_embd = ckpt_cfg["loss_fn"]["gpt_model"][
                "config"
            ]["n_embd"]

            cfg.vqvae_n_embed = ckpt_cfg["vqvae_n_embed"]
            cfg.model_weight_pth = model_path

            cfg.device = device
            model, loss_fn = init_model_loss_fn(cfg)

        self.to_tensor = T.ToTensor()
        self.model = unwrap_model(model)
        self.loss_fn = unwrap_model(loss_fn)
        self.buffer_size = self.loss_fn._vqbet.obs_window_size
        self.device = device

        goal_dim = self.loss_fn.goal_dim
        self.condition = f"{goal_dim}d"
        if goal_dim == 0:
            self.condition = None

        valid_conditions = ("4d", "3d", "2d")
        if self.condition is not None and self.condition not in valid_conditions:
            raise ValueError(
                f"'condition' must be one of {valid_conditions}, got '{self.condition}'"
            )

        self.model.eval()
        self.loss_fn.eval()

        self.vectorized_buffer = None
        self.act_dim = 7

        self.rot_yx_90 = (
            R.from_euler("y", 90, degrees=True).as_matrix()
            @ R.from_euler("x", 90, degrees=True).as_matrix()
        )
        self.Tyx = np.eye(4, dtype=np.float32)
        self.Tyx[:3, :3] = self.rot_yx_90

        rot_z_90 = R.from_euler("z", 90, degrees=True).as_matrix()
        Tz = np.eye(4, dtype=np.float32)
        Tz[:3, :3] = rot_z_90

        M = self.Tyx @ Tz @ P.T
        M_inv = M.T

        self.M_t = torch.from_numpy(M).to(device)
        self.M_inv_t = torch.from_numpy(M_inv).to(device)

    def reset(self, indicies=None):
        if indicies is None:
            self.vectorized_buffer = None
        else:
            self.vectorized_buffer.reset(indicies)

    def process_image(self, img):
        if isinstance(img, np.ndarray):
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(img)
        return self.to_tensor(img)

    def process_image_batch(self, imgs):
        if isinstance(imgs, np.ndarray):
            if imgs.dtype == np.uint8:
                imgs_tensor = torch.from_numpy(imgs).permute(0, 3, 1, 2).float() / 255.0
            else:
                imgs_tensor = torch.from_numpy(imgs).permute(0, 3, 1, 2).float()
            return imgs_tensor.to(self.device)
        else:
            return torch.stack([self.process_image(imgs[i]) for i in range(len(imgs))]).to(self.device)

    def infer(self, observations):
        obs = observations["rgb_ego"]
        if self.condition == "2d":
            goal = observations["object_2d_position"]
        elif self.condition == "3d":
            goal = observations["object_3d_position"]
        else:
            goal = None

        if len(obs.shape) == 3:
            obs = np.expand_dims(obs, axis=0)
            if goal is not None:
                goal = np.expand_dims(goal, axis=0)

        B = obs.shape[0]
        processed_images = self.process_image_batch(obs)

        if self.vectorized_buffer is None:
            image_shape = processed_images.shape[1:]
            self.vectorized_buffer = VectorizedBuffer(
                batch_size=B,
                buffer_size=self.buffer_size,
                act_dim=self.act_dim,
                device=self.device,
            )

        self.vectorized_buffer.add_image(processed_images)

        if self.condition is not None:
            processed_goals = torch.from_numpy(goal).to(self.device, dtype=torch.float32).view(B, -1)
            self.vectorized_buffer.add_goal(processed_goals)

        img_seq, goal_seq, act_seq = self.vectorized_buffer.get_input_sequence()

        with torch.no_grad():
            model_input = (img_seq, goal_seq, act_seq)
            model_output = self.model(model_input)
            action_tensors, logs = self.loss_fn.step(
                model_input, model_output, return_all=True
            )

        action_tensors = action_tensors.squeeze(1).to(self.device)
        self.vectorized_buffer.add_action(action_tensors)
        action_tensors = action_tensors.cpu().numpy()

        return action_tensors
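`VectorizedBuffer` keeps a fixed-length observation window per environment: the first observation fills every slot, and each later observation shifts the window left with `torch.roll` so that index `-1` always holds the newest frame. A small self-contained sketch of that rolling-window behaviour (illustrative names and toy shapes, not the package API):

import torch

def roll_window(window, new_frame):
    # window: (T, ...) history; new_frame: (...) newest observation.
    window = torch.roll(window, shifts=-1, dims=0)
    window[-1] = new_frame
    return window

window = torch.full((5,), 1.0)        # pre-filled with the first observation (1.0)
for step in (2.0, 3.0, 4.0):
    window = roll_window(window, step)
print(window)  # tensor([1., 1., 2., 3., 4.])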