PyPI - plancraft - Versions diffs - 0.3.17__py3-none-any.whl → 0.3.18__py3-none-any.whl - Mend

plancraft-0.3.17-py3-none-any.whl → plancraft-0.3.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

plancraft/evaluator.py CHANGED Viewed

@@ -176,12 +176,14 @@ class Evaluator:
         example: PlancraftExample,
         model: PlancraftBaseModel,
     ) -> dict:
-        """Given the loaded model and an example from Plancraft
-        run the episode until success or termination."""
+        """
+        Given the loaded model and an example from Plancraft
+        run the episode until success or termination.
+        """
         # start environment
         environment = PlancraftEnvironment(
-            inventory=example.slotted_inventory,
+            inventory=deepcopy(example.slotted_inventory),
             resolution=self.resolution,
         )
@@ -252,6 +254,135 @@ class Evaluator:
             "images": history.images,
         }
+    def batch_eval_examples(
+        self,
+        examples: list[PlancraftExample],
+        model,
+    ) -> list:
+        """
+        Run several examples side by side, querying the model once per round
+        through ``model.batch_step``.
+
+        Every example gets its own ``PlancraftEnvironment`` (built from a deep
+        copy of its slotted inventory) and its own ``History``.
+
+        Args:
+            examples: the examples to evaluate.
+            model: object exposing ``batch_step(observations, dialogue_histories=...)``
+                and returning one raw action per observation, in order.
+
+        Returns:
+            One result dict per example, in the same order as ``examples``, with
+            the keys ``success``, ``recipe_type``, ``complexity``,
+            ``number_of_steps``, ``model_trace``, ``example_id`` and ``images``.
+            An example that ends on a ``StopAction`` reports
+            ``success == example.impossible``; an example that never finishes
+            reports ``success == False``.
+        """
+        # Initialize one environment and one history per example
+        environments = [
+            PlancraftEnvironment(
+                inventory=deepcopy(example.slotted_inventory),
+                resolution=self.resolution,
+            )
+            for example in examples
+        ]
+        histories = [
+            History(
+                actions=self.actions,
+                use_multimodal_content_format=self.use_multimodal_content_format,
+                use_images=self.use_images,
+                use_text_inventory=self.use_text_inventory,
+                resolution=self.resolution,
+                few_shot=self.few_shot,
+                system_prompt=deepcopy(self.system_prompt),
+                prompt_examples=deepcopy(self.prompt_examples),
+                prompt_images=deepcopy(self.prompt_images),
+            )
+            for _ in examples
+        ]
+        # Per-example bookkeeping, all indexed like ``examples``
+        num_examples = len(examples)
+        active_mask = [True] * num_examples
+        results = [None] * num_examples
+        steps_taken = [0] * num_examples
+        actions = [None] * num_examples
+        while any(active_mask) and all(steps < self.max_steps for steps in steps_taken):
+            # Invariant: active_indices[k] is the example that produced observations[k]
+            observations = []
+            active_indices = []
+            for i, (env, action, active) in enumerate(
+                zip(environments, actions, active_mask)
+            ):
+                if not active:
+                    continue
+                example, history = examples[i], histories[i]
+                if isinstance(action, StopAction):
+                    # Stopping counts as a success only for impossible examples
+                    active_mask[i] = False
+                    results[i] = self._batch_result(
+                        example, history, steps_taken[i], success=example.impossible
+                    )
+                    logger.info("STOP")
+                    continue
+                if isinstance(action, str):
+                    # Message action: step without an action and echo the message
+                    obs = env.step()
+                    obs["target"] = example.target
+                    obs["message"] = action
+                else:
+                    # Environment action (``None`` on the very first round)
+                    obs = env.step(action)
+                    obs["target"] = example.target
+                    obs["message"] = self.convert_observation_to_message(
+                        obs, model=model
+                    )
+                    if self.check_done(obs["inventory"], example.target):
+                        active_mask[i] = False
+                        results[i] = self._batch_result(
+                            example, history, steps_taken[i], success=True
+                        )
+                        continue
+                # Register the index only together with its observation: doing it
+                # before the done check left a finished example in active_indices
+                # without an observation, shifting every later action onto the
+                # wrong example.
+                active_indices.append(i)
+                observations.append(obs)
+                history.add_observation_to_history(obs)
+                history.add_message_to_history(content=obs["message"], role="user")
+                steps_taken[i] += 1
+            if not observations:
+                break
+            # Batch predict actions for the examples that are still running
+            active_histories = [histories[i] for i in active_indices]
+            raw_actions = model.batch_step(
+                observations, dialogue_histories=active_histories
+            )
+            # The three lists are aligned, so no index lookup is needed
+            for idx, observation, raw_action in zip(
+                active_indices, observations, raw_actions
+            ):
+                logger.info(f"{histories[idx].num_steps}, {raw_action}")
+                histories[idx].add_message_to_history(
+                    content=raw_action, role="assistant"
+                )
+                actions[idx] = self.parse_raw_model_response(
+                    raw_action,
+                    observation=observation,
+                    history=histories[idx],
+                )
+        # Examples that neither stopped nor reached their target count as failures
+        for i, result in enumerate(results):
+            if result is None:
+                results[i] = self._batch_result(
+                    examples[i], histories[i], steps_taken[i], success=False
+                )
+        return results
+    @staticmethod
+    def _batch_result(example, history, number_of_steps: int, success: bool) -> dict:
+        """Build the result record reported for one example of a batch."""
+        return {
+            "success": success,
+            "recipe_type": example.recipe_type,
+            "complexity": example.complexity_split,
+            "number_of_steps": number_of_steps,
+            "model_trace": history.trace(),
+            "example_id": example.id,
+            "images": history.images,
+        }
     def eval_all_examples(self, model, progress_bar=False) -> list:
         results = []
         pbar = tqdm(

plancraft/models/dummy.py CHANGED Viewed

@@ -40,3 +40,6 @@ class DummyModel(PlancraftBaseModel):
     def step(self, observation: dict, **kwargs) -> str:
         return str(self.random_select(observation))
+    def batch_step(self, observations: list[dict], **kwargs) -> list:
+        """Return one ``self.step`` result per observation, in input order.
+
+        Extra keyword arguments are accepted but not forwarded to ``step``.
+        """
+        batch_actions = []
+        for single_observation in observations:
+            batch_actions.append(self.step(single_observation))
+        return batch_actions

plancraft/models/oracle.py CHANGED Viewed

@@ -38,3 +38,12 @@ class OracleModel(PlancraftBaseModel):
         action = self.subplans.pop(0)
         return action
+    def batch_step(self, observations: list[dict], **kwargs) -> list:
+        """Return one action per observation, calling ``self.reset()`` before each ``self.step``.
+
+        Extra keyword arguments are accepted but not used.
+        """
+        def step_in_isolation(single_observation: dict):
+            # Reset first so no state carries over from the previous example
+            self.reset()
+            return self.step(single_observation)
+        return [step_in_isolation(current) for current in observations]

{plancraft-0.3.17.dist-info → plancraft-0.3.18.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: plancraft
-Version: 0.3.17
+Version: 0.3.18
 Summary: Plancraft: an evaluation dataset for planning with LLM agents
 License: MIT License

{plancraft-0.3.17.dist-info → plancraft-0.3.18.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
-plancraft/evaluator.py,sha256=q7khX8FrMeb5QOgYZba-24jC7ZXp83VU7sa1H1kKS08,11061
+plancraft/evaluator.py,sha256=v8itX8buduqTZdR39gtLwdhKGEnSX3rJv9Yd13EzNgQ,16395
 plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
 plancraft/utils.py,sha256=67UUDMSv8TqX_I0fL5-yG_vkHvTZlnhSLkktWAg5p34,5712
 plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
@@ -1915,12 +1915,12 @@ plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,
 plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
 plancraft/models/base.py,sha256=uhG1tRmsBerJzW8qHoLyLEYpveDv0co7AAhi4mSfyO4,661
 plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
-plancraft/models/dummy.py,sha256=856oEX6NquXSIIfQLTEFFeB8ib7VUUs5cB0TVHAiFvI,1248
+plancraft/models/dummy.py,sha256=3Nsnw12s_n5mWMuxUTaPCuJIzPp0vLHWKE827iKY5o0,1391
 plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
-plancraft/models/oracle.py,sha256=tMp9mTwD70T3qohj-LZhJFjHYWyiVHDh8gu27asVimI,1342
+plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
 plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
 plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
-plancraft-0.3.17.dist-info/METADATA,sha256=DhUcHfnj_fMJTnHQVQVl50RpM3mwPCdFHOFNQtCo39c,11148
-plancraft-0.3.17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-plancraft-0.3.17.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
-plancraft-0.3.17.dist-info/RECORD,,
+plancraft-0.3.18.dist-info/METADATA,sha256=p_Ln_3jx77ygBZG6yjuLhVs883PysUXUCi1sK67QvJs,11148
+plancraft-0.3.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+plancraft-0.3.18.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
+plancraft-0.3.18.dist-info/RECORD,,

{plancraft-0.3.17.dist-info → plancraft-0.3.18.dist-info}/WHEEL RENAMED Viewed

File without changes

{plancraft-0.3.17.dist-info → plancraft-0.3.18.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

plancraft 0.3.17__py3-none-any.whl → 0.3.18__py3-none-any.whl

plancraft-0.3.17-py3-none-any.whl → plancraft-0.3.18-py3-none-any.whl