PyPI - plancraft - Versions diffs - 0.1.1__tar.gz → 0.1.3__tar.gz - Mend

plancraft 0.1.1.tar.gz → 0.1.3.tar.gz

Files changed (41) hide show

{plancraft-0.1.1/plancraft/plancraft.egg-info → plancraft-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: plancraft
-Version: 0.1.1
+Version: 0.1.3
 Summary: Plancraft: an evaluation dataset for planning with LLM agents
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

plancraft-0.1.3/plancraft/config.py ADDED Viewed

@@ -0,0 +1,155 @@
+from typing import Literal, Optional, Union
+from pydantic import BaseModel, model_validator
+try:
+    from plancraft.environments.recipes import RECIPES
+except ImportError:
+    RECIPES = {}
+# Allowed dataset split names; Evaluator.load_dataset reads "data/<split>.json".
+DatasetSplit = Literal[
+    "train", "val", "val.small", "val.small.easy", "test", "test.small"
+]
+class EnvironmentConfig(BaseModel):
+    """Settings for the environment that Evaluator.create_env builds."""
+    # True selects SymbolicPlancraft in Evaluator.create_env, otherwise RealPlancraft
+    symbolic: bool
+    # the remaining fields are forwarded unchanged to RealPlancraft
+    symbolic_observation_space: bool
+    symbolic_action_space: bool
+    preferred_spawn_biome: str = "plains"
+    # presumably [width, height] in pixels — TODO confirm the axis order
+    resolution: list[int] = [512, 512]
+class PlancraftConfig(BaseModel):
+    """Evaluation settings: which model to run, in which mode, on which split."""
+    # the last "/"-separated segment is used in the evaluator name
+    model: str
+    # when non-empty, its last "/"-separated segment replaces the model name
+    adapter: str = ""
+    tokenizer: str
+    # number of times the evaluator runs over the whole split
+    num_generations: int
+    mode: Literal["react", "act", "oracle", "dummy"] = "react"
+    # root directory under which results are written
+    output_dir: str
+    max_steps: int = 30  # max number of steps (smelt/move) to take in the environment before stopping
+    quantize: Literal[False, "int4", "int8"]
+    environment: EnvironmentConfig
+    split: DatasetSplit = "val.small"
+    max_message_window: int = 30  # max number of messages to keep in dialogue history (30 is around 8k llama3 tokens)
+    hot_cache: bool = True  # whether to cache the dialogue history between steps
+    resume: bool = True  # resume inference
+    few_shot: bool = True  # whether to use few-shot prompt
+    system_prompt: bool = True  # whether to use system prompt
+    valid_actions: list[str] = ["move", "smelt", "think", "search", "impossible"]
+    use_maskrcnn: bool = False  # whether to use maskrcnn for multimodal parsing
+    # observations
+    use_text_inventory: bool = True  # whether to include inventory in text
+    use_images: bool = False  # whether to include images in multimodal content
+    use_multimodal_content_format: bool = (
+        False  # whether to use multimodal content format
+    )
+    # NOTE(review): the name ``validate`` also exists on pydantic's BaseModel
+    # (deprecated classmethod); kept here so the interface does not change.
+    @model_validator(mode="after")
+    def validate(self):
+        """Reject unknown actions and image use with a symbolic environment.
+        Raises ValueError (surfaced by pydantic as a ValidationError) instead
+        of using ``assert``, which is skipped when Python runs with ``-O``.
+        """
+        allowed_actions = {"move", "smelt", "think", "search", "impossible"}
+        if not set(self.valid_actions).issubset(allowed_actions):
+            raise ValueError(
+                "valid_actions should be subset of {'move', 'smelt', 'think', 'search', 'impossible'}"
+            )
+        if self.use_images and self.environment.symbolic:
+            raise ValueError("Set environment.symbolic to False when using images")
+        return self
+class WandbConfig(BaseModel):
+    """Weights & Biases settings; Evaluator.eval_all passes each field to wandb.init."""
+    project: str
+    entity: str
+    mode: str
+class LaunchConfig(BaseModel):
+    """Parameters for launching a job: the command plus GPU/CPU/RAM requests.
+    NOTE(review): the code that consumes this config is not visible here; the
+    fields look like cluster job settings — confirm against the launcher.
+    """
+    command: str
+    job_name: str
+    gpu_limit: int
+    gpu_product: str
+    cpu_request: int
+    ram_request: str
+    interactive: bool = False
+    namespace: str = "informatics"
+    # two-level mapping of strings; the meaning of each level is not shown here
+    env_vars: dict[str, dict[str, str]]
+class EvalConfig(BaseModel):
+    """Top-level evaluation config, as consumed by Evaluator."""
+    plancraft: PlancraftConfig
+    wandb: WandbConfig
+    launch: LaunchConfig
+class TrainingArgs(BaseModel):
+    """Training hyperparameters with their default values.
+    NOTE(review): the training code that reads these is not visible here, so
+    the meaning of individual fields is inferred from their names only.
+    """
+    base_model: str = "llama3"
+    trace_mode: str = "oa"
+    push_to_hub: bool = False
+    # uses less space, but does not work with multi-GPU training
+    qlora: bool = False
+    lora_alpha: int = 16
+    lora_dropout: float = 0.1
+    lora_r: int = 64
+    # training data args
+    seed: int = 42
+    # model args
+    batch_size: int = 1
+    max_seq_length: int = 8142
+    max_message_window: int = 100
+    only_assistant: bool = True
+    # training args
+    gradient_accumulation_steps: int = 4
+    learning_rate: float = 2e-4
+    max_grad_norm: float = 0.3
+    warmup_ratio: float = 0.03
+    num_train_epochs: int = 3
+    num_workers: int = 1
+class TrainConfig(BaseModel):
+    """Top-level training config: hyperparameters plus wandb and launch settings."""
+    training: TrainingArgs
+    wandb: WandbConfig
+    launch: LaunchConfig
+class PlancraftExample(BaseModel):
+    """One dataset example: a crafting target, its starting inventory and metadata."""
+    target: str
+    inventory: dict[str, int]
+    slotted_inventory: list[dict[str, Union[str, int]]]
+    num_distractors: int
+    impossible: bool
+    optimal_path_length: Optional[int]
+    optimal_path: Optional[list[str]]
+    inventory_trace: Optional[list[dict[str, int]]]
+    items_used: Optional[int]
+    unique_items_used: Optional[int]
+    complexity: Optional[int]
+    complexity_bin: int
+    unseen_in_train: bool
+    unseen_in_val: bool
+    split: DatasetSplit
+    id: str
+    recipe_type: Optional[str] = ""
+    def model_post_init(self, __context):
+        """Derive recipe_type from optimal_path once the model is built.
+        No optimal path gives "impossible"; a path whose steps all share one
+        recipe type gives that type; anything else gives "mixed".
+        """
+        if self.optimal_path is None:
+            self.recipe_type = "impossible"
+            return
+        # every recipe registered for every step on the path contributes its type
+        kinds = {
+            recipe.recipe_type
+            for step in self.optimal_path
+            for recipe in RECIPES[step]
+        }
+        self.recipe_type = kinds.pop() if len(kinds) == 1 else "mixed"

plancraft-0.1.3/plancraft/environments/__init__.py ADDED Viewed

File without changes

plancraft-0.1.3/plancraft/evaluator.py ADDED Viewed

@@ -0,0 +1,273 @@
+import json
+import os
+import random
+import string
+import time
+import imageio
+import pandas as pd
+import torch
+import wandb
+from loguru import logger
+from tqdm import tqdm
+from plancraft.config import EvalConfig, PlancraftExample
+from plancraft.environments.actions import StopAction
+from plancraft.environments.env_real import RealPlancraft
+from plancraft.environments.env_symbolic import SymbolicPlancraft
+from plancraft.models import get_model
+wandb.require("core")
+class Evaluator:
+    """
+    The evaluator class handles the environment loop and model interaction
+    The environment is created based on the configuration and the examples are loaded from the dataset.
+    """
+    def __init__(self, cfg: EvalConfig):
+        """Store the config, load the split's examples, then build environment and model."""
+        plancraft_cfg = cfg.plancraft
+        self.cfg = cfg
+        self.generation_number = 0
+        # evaluator_name() reads self.cfg, so it can only be called once cfg is stored
+        results_root = plancraft_cfg.output_dir
+        self.output_dir = f"{results_root}/{self.evaluator_name()}/{plancraft_cfg.split}"
+        self.examples: list[PlancraftExample] = self.load_dataset(plancraft_cfg.split)
+        self.environment = self.create_env(cfg)
+        self.model = get_model(cfg)
+        # frames are flagged for recording only when the environment is not symbolic
+        self.record_frames = not plancraft_cfg.environment.symbolic
+        # cached no-op action, used to obtain observations without acting
+        self.no_op = self.environment.action_space.no_op()
+    def evaluator_name(self) -> str:
+        """Build the run name from mode, environment kind, model name and valid actions.
+        For the "dummy" and "oracle" modes only the mode and environment kind are used.
+        """
+        plancraft_cfg = self.cfg.plancraft
+        env_tag = "symb" if plancraft_cfg.environment.symbolic else "real"
+        if plancraft_cfg.use_maskrcnn:
+            env_tag = f"{env_tag}_mrcnn"
+        model_name = plancraft_cfg.model.split("/")[-1]
+        # a non-empty adapter takes precedence over the base model for naming
+        if plancraft_cfg.adapter != "":
+            model_name = plancraft_cfg.adapter.split("/")[-1]
+        mode = plancraft_cfg.mode
+        if mode in ("dummy", "oracle"):
+            return f"{mode}_{env_tag}"
+        joined_actions = "|".join(plancraft_cfg.valid_actions)
+        return f"{mode}_{env_tag}_{model_name}_{joined_actions}"
+    def save_results_dict(self, example: PlancraftExample, results_dict: dict):
+        """Write one example's results as JSON in the current generation directory and upload it to wandb."""
+        generation_dir = f"{self.output_dir}/{self.generation_number}"
+        os.makedirs(generation_dir, exist_ok=True)
+        json_path = f"{generation_dir}/{example.id}.json"
+        with open(json_path, "w") as handle:
+            json.dump(results_dict, handle, indent=4)
+        wandb.save(json_path, policy="now")
+    def save_images(self, example: PlancraftExample, frames: list):
+        """Save the frames as a GIF in the current generation directory and upload it to wandb.
+        Does nothing when there are no frames.
+        """
+        if len(frames) == 0:
+            return
+        generation_dir = f"{self.output_dir}/{self.generation_number}"
+        os.makedirs(generation_dir, exist_ok=True)
+        gif_path = f"{generation_dir}/{example.id}.gif"
+        imageio.mimsave(gif_path, frames)
+        # upload to wandb
+        wandb.save(gif_path, policy="now")
+    def load_results_dict(self, example: PlancraftExample) -> "dict | None":
+        """Return the saved result for this example in the current generation.
+        Returns None when no result file exists or when cfg.plancraft.resume is off.
+        """
+        path = f"{self.output_dir}/{self.generation_number}/{example.id}.json"
+        if not os.path.exists(path) or not self.cfg.plancraft.resume:
+            return None
+        with open(path, "r") as f:
+            return json.load(f)
+    # The return annotation is a string so it is not evaluated at definition
+    # time: `A | B` between plain classes raises TypeError before Python 3.10,
+    # and the package declares Requires-Python >=3.9.
+    def create_env(self, cfg: EvalConfig) -> "RealPlancraft | SymbolicPlancraft":
+        """Create the environment selected by cfg.plancraft.environment.
+        Returns a SymbolicPlancraft when ``symbolic`` is set, otherwise a
+        RealPlancraft configured from the remaining environment settings.
+        Both start with an empty inventory.
+        """
+        if cfg.plancraft.environment.symbolic:
+            return SymbolicPlancraft(inventory=[])
+        return RealPlancraft(
+            inventory=[],
+            symbolic_action_space=cfg.plancraft.environment.symbolic_action_space,
+            symbolic_observation_space=cfg.plancraft.environment.symbolic_observation_space,
+            preferred_spawn_biome=cfg.plancraft.environment.preferred_spawn_biome,
+            resolution=cfg.plancraft.environment.resolution,
+        )
+    def close(self):
+        """Close the underlying environment."""
+        self.environment.close()
+    def load_dataset(self, dataset_split: str) -> list[PlancraftExample]:
+        """Parse data/<dataset_split>.json (relative to the working directory) into examples."""
+        dataset_path = f"data/{dataset_split}.json"
+        with open(dataset_path, "r") as f:
+            raw_examples = json.load(f)
+            examples = []
+            for raw_example in raw_examples:
+                examples.append(PlancraftExample(**raw_example))
+            return examples
+    def reset(
+        self,
+        example: PlancraftExample,
+    ):
+        """Reset the environment to the example's inventory and restart the model history.
+        When the observation exposes an inventory that does not match the
+        requested one (slots whose expected type is "air" are ignored), the
+        whole reset is retried.
+        """
+        current_inventory = example.slotted_inventory
+        self.environment.fast_reset(new_inventory=current_inventory)
+        # do a no op to get an initial observation
+        obs, _, _, _ = self.environment.step(self.no_op)
+        # check that the inventory is correct
+        if "inventory" in obs:
+            for item in current_inventory:
+                slot = item["slot"]
+                if (
+                    obs["inventory"][slot]["type"] != item["type"]
+                    or obs["inventory"][slot]["quantity"] != item["quantity"]
+                ) and item["type"] != "air":
+                    logger.warning(f"Inventory does not match expected for slot {slot}")
+                    logger.warning(f"Expected {item}")
+                    logger.warning(f"Got {obs['inventory'][slot]}")
+                    # try again; the retry is a complete reset (environment and
+                    # model history), so stop here rather than keep comparing
+                    # the remaining slots against this now-stale observation
+                    self.reset(example)
+                    return
+        objective = f"Craft an item of type: {example.target}"
+        self.model.reset_history(objective=objective)
+    def check_done(self, inventory: list[dict[str, int]], target: str):
+        """
+        Return True when the target item is held outside position 0.
+        An entry counts only if its "slot" or "index" key is present and
+        non-zero, i.e. the item has been taken out of the crafting slot.
+        """
+        for entry in inventory:
+            if entry["type"] != target:
+                continue
+            # ensure item is taken out of crafting slot
+            if any(key in entry and entry[key] != 0 for key in ("slot", "index")):
+                return True
+        return False
+    @torch.no_grad()
+    def eval_all_examples(self, progress_bar=False) -> list:
+        """Run every example once and return one result dict per example.
+        Examples that already have a saved result (see load_results_dict) are
+        not re-run; the stored result is reused and is not counted in the
+        running accuracy shown on the progress bar. An episode ends on
+        success, on a StopAction, when the model history reports being stuck,
+        or after cfg.plancraft.max_steps steps.
+        """
+        results = []
+        pbar = tqdm(
+            total=len(self.examples),
+            disable=not progress_bar,
+        )
+        correct = 0
+        count = 0
+        for example in self.examples:
+            if resume_result := self.load_results_dict(example):
+                # the bar counts examples (total=len(self.examples)), so a
+                # resumed example advances it by one, not by max_steps
+                pbar.update(1)
+                results.append(resume_result)
+                continue
+            success = False
+            self.reset(example)
+            # every episode starts from a no-op so the first step only observes
+            action = self.no_op.copy()
+            while (
+                not self.model.history.check_stuck()
+                and self.model.history.num_steps < self.cfg.plancraft.max_steps
+            ):
+                # if the action is stop then we end the episode
+                if isinstance(action, StopAction):
+                    # if the action is stop and task is impossible then success
+                    # otherwise we should not have stopped
+                    success = example.impossible
+                    break
+                # step action
+                observation, _, _, _ = self.environment.step(action)
+                # check if the episode is done
+                success = self.check_done(observation["inventory"], example.target)
+                # exit if success
+                if success:
+                    break
+                # predict next action
+                action = self.model.step(observation)
+            # save results and reset
+            result = {
+                "success": success,
+                "recipe_type": example.recipe_type,
+                "number_of_steps": self.model.history.num_steps,
+                "model_trace": self.model.history.trace(),
+                "example_id": example.id,
+                "impossible": example.impossible,
+            }
+            results.append(result)
+            self.save_results_dict(example, result)
+            self.save_images(example, self.model.history.images)
+            correct += int(result["success"])
+            count += 1
+            acc = correct / count
+            pbar.set_postfix(correct=correct, count=count, acc=acc)
+            pbar.update(1)
+        return results
+    def eval_all(self):
+        """Run the evaluation num_generations times, one wandb run per generation.
+        Each generation logs the average success rate, steps and tokens used,
+        a success rate per recipe type, and a table of per-example results.
+        """
+        plancraft_cfg = self.cfg.plancraft
+        wandb_cfg = self.cfg.wandb
+        logger.info(
+            f"Running evaluation over {len(self.examples)} examples {plancraft_cfg.num_generations} times."
+        )
+        base_name = f"{self.evaluator_name()} {plancraft_cfg.split}"
+        run_name = base_name.replace(" ", "_").replace(".", "_").strip()
+        for n in range(plancraft_cfg.num_generations):
+            logger.info(f"Generation {n+1}/{plancraft_cfg.num_generations}")
+            # random suffix keeps run names distinct across generations
+            run_id = "".join(random.choices(string.ascii_lowercase, k=5))
+            wandb.init(
+                name=f"{run_name}_{run_id}",
+                project=wandb_cfg.project,
+                entity=wandb_cfg.entity,
+                mode=wandb_cfg.mode,
+                group=plancraft_cfg.model,
+                job_type=plancraft_cfg.mode,
+                config=self.cfg.model_dump(),
+            )
+            started_at = time.time()
+            results_df = pd.DataFrame(self.eval_all_examples(progress_bar=True))
+            output = {
+                "avg_success_rate": results_df["success"].mean(),
+                "avg_number_of_steps": results_df["number_of_steps"].mean(),
+                "avg_num_tokens_used": results_df["model_trace"]
+                .apply(pd.Series)["tokens_used"]
+                .mean(),
+            }
+            # calculate success rate for each recipe type
+            for recipe_type in results_df["recipe_type"].unique():
+                of_this_type = results_df["recipe_type"] == recipe_type
+                output[f"{recipe_type}_success_rate"] = results_df[of_this_type][
+                    "success"
+                ].mean()
+            time_elapsed = time.time() - started_at
+            logger.info(f"Time elapsed: {time_elapsed:.2f}s")
+            logger.info(output)
+            wandb.log(output)
+            summary_columns = ["success", "number_of_steps", "example_id"]
+            table = wandb.Table(dataframe=results_df[summary_columns])
+            wandb.log({"results": table})
+            wandb.finish()
+            self.generation_number += 1
+        logger.info("Done")

plancraft-0.1.3/plancraft/utils.py ADDED Viewed

@@ -0,0 +1,84 @@
+import glob
+import pathlib
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+def get_downloaded_models() -> dict:
+    """
+    Map "<org>/<model>"-style relative names to local paths for the models
+    found on the NFS partition (EIDF). Empty when /nfs does not exist.
+    """
+    if not pathlib.Path("/nfs").exists():
+        return {}
+    # known models on NFS partition
+    models_root = "/nfs/public/hf/models/"
+    return {
+        model_path.replace(models_root, ""): model_path
+        for model_path in glob.glob("/nfs/public/hf/models/*/*")
+    }
+def get_torch_device() -> torch.device:
+    """Pick cuda:0 when CUDA is available, else MPS when available and built, else CPU."""
+    if torch.cuda.is_available():
+        return torch.device("cuda", 0)
+    if not torch.backends.mps.is_available():
+        return torch.device("cpu")
+    if torch.backends.mps.is_built():
+        return torch.device("mps")
+    print(
+        "MPS not available because the current PyTorch install was not built with MPS enabled."
+    )
+    return torch.device("cpu")
+def resize_image(img, target_resolution=(128, 128)):
+    """Resize a NumPy image with cv2 or a torch tensor with bilinear interpolation.
+    Uses isinstance so that subclasses of np.ndarray / torch.Tensor are
+    accepted as well. Raises ValueError for any other input type.
+    NOTE(review): cv2.resize and F.interpolate may interpret the order of
+    target_resolution differently (width/height vs height/width) — confirm
+    for non-square targets.
+    """
+    if isinstance(img, np.ndarray):
+        return cv2.resize(img, target_resolution, interpolation=cv2.INTER_LINEAR)
+    if isinstance(img, torch.Tensor):
+        return F.interpolate(img, size=target_resolution, mode="bilinear")
+    raise ValueError("Unsupported type for img")
+def save_frames_to_video(frames: list, out_path: str):
+    """Annotate (frame, goal) pairs and save every third one as an animation.
+    Each frame is resized to (320, 240), cast to uint8 and labelled with its
+    index and goal. Does nothing when ``frames`` is empty, since there is no
+    first image to save.
+    NOTE(review): frames are presumably NumPy arrays — ``.astype`` is called
+    on the resized frame; confirm against callers.
+    """
+    if len(frames) == 0:
+        return
+    imgs = []
+    for frame_index, (frame, goal) in enumerate(frames):
+        frame = resize_image(frame, (320, 240)).astype("uint8")
+        cv2.putText(
+            frame,
+            f"FID: {frame_index}",
+            (10, 25),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.8,
+            (255, 255, 255),
+            2,
+        )
+        cv2.putText(
+            frame,
+            f"Goal: {goal}",
+            (10, 55),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.8,
+            (255, 0, 0),
+            2,
+        )
+        imgs.append(Image.fromarray(frame))
+    # keep every third frame to shorten the animation
+    imgs = imgs[::3]
+    imgs[0].save(
+        out_path,
+        save_all=True,
+        append_images=imgs[1:],
+        optimize=False,
+        quality=0,
+        duration=150,
+        loop=0,
+    )

{plancraft-0.1.1 → plancraft-0.1.3/plancraft.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: plancraft
-Version: 0.1.1
+Version: 0.1.3
 Summary: Plancraft: an evaluation dataset for planning with LLM agents
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{plancraft-0.1.1/plancraft → plancraft-0.1.3}/plancraft.egg-info/SOURCES.txt RENAMED Viewed

@@ -1,6 +1,15 @@
 LICENSE
 README.md
 pyproject.toml
+plancraft/__init__.py
+plancraft/config.py
+plancraft/evaluator.py
+plancraft/utils.py
+plancraft.egg-info/PKG-INFO
+plancraft.egg-info/SOURCES.txt
+plancraft.egg-info/dependency_links.txt
+plancraft.egg-info/requires.txt
+plancraft.egg-info/top_level.txt
 plancraft/environments/__init__.py
 plancraft/environments/actions.py
 plancraft/environments/env_real.py
@@ -21,11 +30,6 @@ plancraft/models/prompts.py
 plancraft/models/react.py
 plancraft/models/utils.py
 plancraft/models/few_shot_images/__init__.py
-plancraft/plancraft.egg-info/PKG-INFO
-plancraft/plancraft.egg-info/SOURCES.txt
-plancraft/plancraft.egg-info/dependency_links.txt
-plancraft/plancraft.egg-info/requires.txt
-plancraft/plancraft.egg-info/top_level.txt
 plancraft/train/dataset.py
 tests/test_planner.py
 tests/test_real_env.py

plancraft-0.1.3/plancraft.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+plancraft

{plancraft-0.1.1 → plancraft-0.1.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "plancraft"
-version = "0.1.1"
+version = "0.1.3"
 description = "Plancraft: an evaluation dataset for planning with LLM agents"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -58,14 +58,14 @@ dev-dependencies = [
     "uv>=0.4.25",
 ]
-[tool.setuptools.packages]
-find = { where = ["plancraft"]}
 [project.optional-dependencies]
 full = [
     "gym>=0.19.0,<=0.23.1",
 ]
+[tool.setuptools.package-dir]
+plancraft = "plancraft"
 [build-system]
 requires = ["setuptools"]
 build-backend = "setuptools.build_meta"