PyPI - plancraft - Versions diffs - 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl - Mend

plancraft 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

plancraft/evaluator.py +89 -162
plancraft/models/act.py +0 -3
plancraft/models/base.py +0 -7
plancraft/models/dummy.py +0 -6
plancraft/models/oracle.py +0 -6
{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/METADATA +1 -1
{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/RECORD +9 -9
{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/WHEEL +0 -0
{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/licenses/LICENSE +0 -0

plancraft/evaluator.py CHANGED Viewed

@@ -170,96 +170,70 @@ class Evaluator:
             content_list.append({"type": "image"})
         return {"content": content_list}
-    def eval_example(
-        self,
-        example: PlancraftExample,
-        model: PlancraftBaseModel,
-    ) -> dict:
-        """
-        Given the loaded model and an example from Plancraft
-        run the episode until success or termination.
-        """
-        # start environment
+    def _init_environment(self, example: PlancraftExample) -> tuple:
+        """Initialize environment and history for an example"""
         environment = PlancraftEnvironment(
             inventory=deepcopy(example.slotted_inventory),
             resolution=self.resolution,
         )
-        # initialise history/dialogue tracking
         history = self.create_history()
-        observation = environment.step()
-        # add target and first message to history
-        observation["target"] = example.target
-        observation["message"] = self.convert_observation_to_message(
-            observation, model=model
+        obs = environment.step()
+        obs["target"] = example.target
+        obs["message"] = self.convert_observation_to_message(obs)
+        return environment, history, obs
+    def _process_model_output(
+        self, raw_action, observation: dict, history: HistoryBase
+    ) -> tuple:
+        """Process model output and update history"""
+        if isinstance(raw_action, PlancraftModelOutput):
+            history.add_message_to_history(
+                content=raw_action.action,
+                role="assistant",
+                **(raw_action.kwargs or {}),
+            )
+            raw_action = raw_action.action
+        else:
+            history.add_message_to_history(content=raw_action, role="assistant")
+        action = self.parse_raw_model_response(
+            raw_action,
+            observation=observation,
+            history=history,
         )
+        return action
+    def _execute_action(
+        self, action, example: PlancraftExample, environment, model=None
+    ) -> tuple[dict, bool]:
+        """Execute action and return next observation and success status"""
         success = False
-        # run episode until stuck or until max steps is reached
-        while history.num_steps < self.max_steps:
-            # add observation to history
-            history.add_observation_to_history(observation)
-            history.add_message_to_history(content=observation["message"], role="user")
-            # predict next action
-            raw_action = model.step(observation, dialogue_history=history)
-            # if the model returns a PlancraftModelOutput, extract the action
-            if isinstance(raw_action, PlancraftModelOutput):
-                history.add_message_to_history(
-                    content=raw_action.action,
-                    role="assistant",
-                    **(raw_action.kwargs or {}),
-                )
-                raw_action = raw_action.action
-            elif isinstance(raw_action, str):
-                history.add_message_to_history(content=raw_action, role="assistant")
-            else:
-                raise ValueError(
-                    f"model.step() output must be a string or PlancraftModelOutput, got {type(raw_action)}"
-                )
-            # parse the raw action
-            action = self.parse_raw_model_response(
-                raw_action, observation=observation, history=history
-            )
-            # if the action is stop then we end the episode
-            if isinstance(action, StopAction):
-                # if the action is stop and task is impossible then success
-                # otherwise we should not have stopped
-                observation = None
-                success = example.impossible
-            # action is external tool then it is str
-            elif isinstance(action, str):
-                observation = environment.step()
-                observation["target"] = example.target
-                observation["message"] = action
-            # action is environment action
-            else:
-                observation = environment.step(action)
-                observation["target"] = example.target
-                observation["message"] = self.convert_observation_to_message(
-                    observation, model=model
-                )
-                # check if the episode is done
-                success = self.check_done(observation["inventory"], example.target)
-            # update model with success or failure
-            # observation is the next state after the action (s1)
-            # history is the dialogue history
-            # -- the last message contains the action taken (a0)
-            # -- the second to last message is the observation (s0)
-            # success is whether the episode is sucessful (r)
-            model.update(
-                observation=observation, history=history, success=success, action=action
+        # stop action
+        if isinstance(action, StopAction):
+            observation = None
+            #  success is True if example was truly impossible
+            success = example.impossible
+        #  if action is a string, it is a message response
+        elif isinstance(action, str):
+            observation = environment.step()
+            observation["target"] = example.target
+            observation["message"] = action
+        #  execute action and check if target is obtained
+        else:
+            observation = environment.step(action)
+            observation["target"] = example.target
+            observation["message"] = self.convert_observation_to_message(
+                observation, model=model
             )
+            success = self.check_done(observation["inventory"], example.target)
-            # exit if success
-            if success or isinstance(action, StopAction):
-                break
+        return observation, success
-        # save results and reset
+    def _create_result(
+        self, example: PlancraftExample, success: bool, history: HistoryBase
+    ) -> dict:
+        """Create result dictionary for an example"""
         return {
             "success": success,
             "recipe_type": example.recipe_type,
@@ -270,20 +244,36 @@ class Evaluator:
             "images": history.images,
         }
+    def eval_example(
+        self,
+        example: PlancraftExample,
+        model: PlancraftBaseModel,
+    ) -> dict:
+        environment, history, observation = self._init_environment(example)
+        success = False
+        while history.num_steps < self.max_steps:
+            history.add_observation_to_history(observation)
+            history.add_message_to_history(content=observation["message"], role="user")
+            raw_action = model.step(observation, dialogue_history=history)
+            action = self._process_model_output(raw_action, observation, history)
+            observation, success = self._execute_action(
+                action, example, environment, model
+            )
+            if success or isinstance(action, StopAction):
+                break
+        return self._create_result(example, success, history)
     def batch_eval_examples(
         self,
         examples: list[PlancraftExample],
         model,
         batch_size: int = 4,
+        callback_fn: Optional[callable] = None,
     ) -> list:
-        """
-        Processes examples in batches with dynamic replacement from a queue.
-        Args:
-            examples: List of examples to process
-            model: Model to use for evaluation
-            batch_size: Maximum number of concurrent environments
-        """
         pending_examples = deque(examples)
         active_examples = []
         active_environments = []
@@ -294,21 +284,13 @@ class Evaluator:
         # Initialize first batch
         while len(active_examples) < batch_size and pending_examples:
             example = pending_examples.popleft()
-            env = PlancraftEnvironment(
-                inventory=deepcopy(example.slotted_inventory),
-                resolution=self.resolution,
-            )
-            history = self.create_history()
-            obs = env.step()
-            obs["target"] = example.target
-            obs["message"] = self.convert_observation_to_message(obs, model=model)
+            env, history, obs = self._init_environment(example)
             active_examples.append(example)
             active_environments.append(env)
             active_histories.append(history)
             active_observations.append(obs)
-        # Process until all examples are done
         while active_examples:
             # Add observations to histories
             for i in range(len(active_examples)):
@@ -317,12 +299,10 @@ class Evaluator:
                     content=active_observations[i]["message"], role="user"
                 )
-            # Get model predictions for current batch
             raw_actions = model.batch_step(
                 active_observations, dialogue_histories=active_histories
             )
-            # Process each active environment
             completed_indices = []
             successes = []
             actions = []
@@ -330,71 +310,28 @@ class Evaluator:
             for i, (example, raw_action) in enumerate(
                 zip(active_examples, raw_actions)
             ):
-                # Handle model output
-                if isinstance(raw_action, PlancraftModelOutput):
-                    active_histories[i].add_message_to_history(
-                        content=raw_action.action,
-                        role="assistant",
-                        **(raw_action.kwargs or {}),
-                    )
-                    raw_action = raw_action.action
-                else:
-                    active_histories[i].add_message_to_history(
-                        content=raw_action, role="assistant"
-                    )
-                # Parse and execute action
-                action = self.parse_raw_model_response(
-                    raw_action,
-                    observation=active_observations[i],
-                    history=active_histories[i],
+                action = self._process_model_output(
+                    raw_action, active_observations[i], active_histories[i]
                 )
                 actions.append(action)
-                success = False
-                if isinstance(action, StopAction):
-                    success = example.impossible
-                    active_observations[i] = None
-                elif isinstance(action, str):
-                    obs = active_environments[i].step()
-                    obs["target"] = example.target
-                    obs["message"] = action
-                    active_observations[i] = obs
-                else:
-                    obs = active_environments[i].step(action)
-                    obs["target"] = example.target
-                    obs["message"] = self.convert_observation_to_message(
-                        obs, model=model
-                    )
-                    active_observations[i] = obs
-                    success = self.check_done(obs["inventory"], example.target)
+                obs, success = self._execute_action(
+                    action, example, active_environments[i], model
+                )
+                active_observations[i] = obs
                 successes.append(success)
-                # Check if environment is done
                 if (
                     success
                     or isinstance(action, StopAction)
                     or active_histories[i].num_steps >= self.max_steps
                 ):
-                    results[example.id] = {
-                        "success": success,
-                        "recipe_type": example.recipe_type,
-                        "complexity": example.complexity_split,
-                        "number_of_steps": active_histories[i].num_steps,
-                        "model_trace": active_histories[i].trace(),
-                        "example_id": example.id,
-                        "images": active_histories[i].images,
-                    }
+                    results[example.id] = self._create_result(
+                        example, success, active_histories[i]
+                    )
                     completed_indices.append(i)
-            # Update model
-            model.batch_update(
-                observations=active_observations,
-                histories=active_histories,
-                successes=successes,
-                actions=actions,
-            )
+                    if callback_fn:
+                        callback_fn(example=example, results=results[example.id])
             # Remove completed environments and replace with new ones
             for i in reversed(completed_indices):
@@ -403,19 +340,9 @@ class Evaluator:
                 active_histories.pop(i)
                 active_observations.pop(i)
-                # Add new environment if there are pending examples
                 if pending_examples:
                     example = pending_examples.popleft()
-                    env = PlancraftEnvironment(
-                        inventory=deepcopy(example.slotted_inventory),
-                        resolution=self.resolution,
-                    )
-                    history = self.create_history()
-                    obs = env.step()
-                    obs["target"] = example.target
-                    obs["message"] = self.convert_observation_to_message(
-                        obs, model=model
-                    )
+                    env, history, obs = self._init_environment(example)
                     active_examples.append(example)
                     active_environments.append(env)

plancraft/models/act.py CHANGED Viewed

@@ -72,6 +72,3 @@ class ActModel(PlancraftBaseModel):
         dialogue_history.tokens_used += action_token_used
         # return raw action message
         return action_messages[0].split("\n")[0].strip()
-    def update(self, **kwargs):
-        pass

plancraft/models/base.py CHANGED Viewed

@@ -33,10 +33,3 @@ class PlancraftBaseModel(abc.ABC):
         Reset the model state - ready for a new episode
         """
         raise NotImplementedError()
-    @abc.abstractmethod
-    def update(self, **kwargs) -> None:
-        """
-        Update the model state based on the dialogue history
-        """
-        raise NotImplementedError()

plancraft/models/dummy.py CHANGED Viewed

@@ -45,9 +45,3 @@ class DummyModel(PlancraftBaseModel):
         self, observations: list[dict], **kwargs
     ) -> list[PlancraftModelOutput]:
         return [self.step(observation) for observation in observations]
-    def update(self, **kwargs):
-        pass
-    def batch_update(self, **kwargs):
-        pass

plancraft/models/oracle.py CHANGED Viewed

@@ -47,9 +47,3 @@ class OracleModel(PlancraftBaseModel):
             action = self.step(observation)
             actions.append(action)
         return actions
-    def update(self, **kwargs):
-        pass
-    def batch_update(self, **kwargs):
-        pass

{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: plancraft
-Version: 0.3.30
+Version: 0.3.32
 Summary: Plancraft: an evaluation dataset for planning with LLM agents
 License: MIT License

{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
-plancraft/evaluator.py,sha256=VFJnfitixU2Y4RTxp0lDALoCSFMMwMJPgSQC0Y0tmH8,18121
+plancraft/evaluator.py,sha256=mxzvbGpEDkiKW8u79QgYz5Q4wnZvkQSXiAvi0OVu4Qs,14754
 plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
 plancraft/utils.py,sha256=VhnxMihh6pRhNjQTK5HDc0FYWmF9_EcQyRP_a7fbIZA,7156
 plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
@@ -1912,15 +1912,15 @@ plancraft/environment/tags/wooden_stairs.json,sha256=GCr2_5UGPMYZECqQ_5NYSvbwuwt
 plancraft/environment/tags/wooden_trapdoors.json,sha256=DbjfwoHJL8VuYWV61A1uDqW7LJsGlOP4eoxcGIQVYr4,303
 plancraft/environment/tags/wool.json,sha256=Z59l4mdPztVZBFaglJ4mV9H2OnyCVzhqQRi2dduak78,496
 plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,510
-plancraft/models/act.py,sha256=_OZo9a_6R0wajdR7axZarjI3IJP7glFrWeDIrbcHDmw,2737
-plancraft/models/base.py,sha256=Krm6MdOjU-qlps1WSX7pxdnqXLiyI3qsI9Na7Xk8r1c,1038
+plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
+plancraft/models/base.py,sha256=S8EdkqWpn8nE1WcrqDoA4Hx4p52qEttGxnqjIPWvl3Q,852
 plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
-plancraft/models/dummy.py,sha256=UWbW3bjrQr_0UYYrNf_D0jWpUq6e50vAp21F0zi8iFM,1593
+plancraft/models/dummy.py,sha256=_NUTviv5ye6KGzODRt0Zykk8shsek0QBqWCeZW3ldSQ,1495
 plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
-plancraft/models/oracle.py,sha256=jmt_kBBNXt0VWUX7q6OHkJoRZWItCMy4qGH5qbLSc1c,1755
+plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
 plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
 plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
-plancraft-0.3.30.dist-info/METADATA,sha256=tltUHYqXhfDXfsQGU5NLhEp6TjR41g6X0OWFn5dpttg,11148
-plancraft-0.3.30.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-plancraft-0.3.30.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
-plancraft-0.3.30.dist-info/RECORD,,
+plancraft-0.3.32.dist-info/METADATA,sha256=vRc_HMJhCvX4LnEPLHIbgKaJCbQP4Gq0qb4xITGFkYQ,11148
+plancraft-0.3.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+plancraft-0.3.32.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
+plancraft-0.3.32.dist-info/RECORD,,

{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/WHEEL RENAMED Viewed

File without changes

{plancraft-0.3.30.dist-info → plancraft-0.3.32.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

plancraft 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl

plancraft 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl