PyPI - plancraft - Versions diffs - 0.3.27__py3-none-any.whl → 0.3.29__py3-none-any.whl - Mend

plancraft 0.3.27__py3-none-any.whl → 0.3.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

plancraft/evaluator.py +151 -120
plancraft/models/act.py +3 -0
plancraft/models/base.py +7 -0
plancraft/models/dummy.py +6 -0
plancraft/models/oracle.py +6 -0
{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/METADATA +1 -1
{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/RECORD +9 -9
{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/WHEEL +0 -0
{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/licenses/LICENSE +0 -0

plancraft/evaluator.py CHANGED Viewed

@@ -187,47 +187,24 @@ class Evaluator:
         # initialise history/dialogue tracking
         history = self.create_history()
+        observation = environment.step()
+        # add target and first message to history
+        observation["target"] = example.target
+        observation["message"] = self.convert_observation_to_message(
+            observation, model=model
+        )
         success = False
-        action = None
         # run episode until stuck or until max steps is reached
         while history.num_steps < self.max_steps:
-            # if the action is stop then we end the episode
-            if isinstance(action, StopAction):
-                # if the action is stop and task is impossible then success
-                # otherwise we should not have stopped
-                success = example.impossible
-                break
-            # action is external tool then it is str
-            if isinstance(action, str):
-                observation = environment.step()
-                observation["target"] = example.target
-                observation["message"] = action
-            # action is environment action
-            else:
-                observation = environment.step(action)
-                # convert inventory observation to text message
-                observation["target"] = example.target
-                observation["message"] = self.convert_observation_to_message(
-                    observation, model=model
-                )
-                # check if the episode is done
-                success = self.check_done(observation["inventory"], example.target)
-            # exit if success
-            if success:
-                break
             # add observation to history
             history.add_observation_to_history(observation)
-            # add observation message to history
             history.add_message_to_history(content=observation["message"], role="user")
             # predict next action
             raw_action = model.step(observation, dialogue_history=history)
             # if the model returns a PlancraftModelOutput, extract the action
             if isinstance(raw_action, PlancraftModelOutput):
-                # add message to history
                 history.add_message_to_history(
                     content=raw_action.action,
                     role="assistant",
@@ -235,7 +212,6 @@ class Evaluator:
                 )
                 raw_action = raw_action.action
             elif isinstance(raw_action, str):
-                # add message to history
                 history.add_message_to_history(content=raw_action, role="assistant")
             else:
                 raise ValueError(
@@ -247,6 +223,41 @@ class Evaluator:
                 raw_action, observation=observation, history=history
             )
+            # if the action is stop then we end the episode
+            if isinstance(action, StopAction):
+                # if the action is stop and task is impossible then success
+                # otherwise we should not have stopped
+                observation = None
+                success = example.impossible
+            # action is external tool then it is str
+            elif isinstance(action, str):
+                observation = environment.step()
+                observation["target"] = example.target
+                observation["message"] = action
+            # action is environment action
+            else:
+                observation = environment.step(action)
+                observation["target"] = example.target
+                observation["message"] = self.convert_observation_to_message(
+                    observation, model=model
+                )
+                # check if the episode is done
+                success = self.check_done(observation["inventory"], example.target)
+            # update model with success or failure
+            # observation is the next state after the action (s1)
+            # history is the dialogue history
+            # -- the last message contains the action taken (a0)
+            # -- the second to last message is the observation (s0)
+            # success is whether the episode is sucessful (r)
+            model.update(
+                observation=observation, history=history, success=success, action=action
+            )
+            # exit if success
+            if success or isinstance(action, StopAction):
+                break
         # save results and reset
         return {
             "success": success,
@@ -263,6 +274,13 @@ class Evaluator:
         examples: list[PlancraftExample],
         model,
     ) -> list:
+        """
+        Similar to eval_example, but processes multiple examples at once.
+        Tracks which environments are still active until they've either succeeded,
+        reached max steps, or invoked StopAction.
+        """
         # Initialize environments and histories
         environments = [
             PlancraftEnvironment(
@@ -271,126 +289,139 @@ class Evaluator:
             )
             for i in range(len(examples))
         ]
         histories = [self.create_history() for _ in range(len(examples))]
         # Track which environments are still active
         active_mask = [True for _ in range(len(examples))]
         results = [None for _ in range(len(examples))]
-        steps_taken = [0 for _ in range(len(examples))]
-        actions = [None for _ in range(len(examples))]
-        while any(active_mask) and all(steps < self.max_steps for steps in steps_taken):
-            # Get observations for all active environments
-            observations = []
-            active_indices = []
-            active_histories = []
-            for i, (env, action, active) in enumerate(
-                zip(environments, actions, active_mask)
-            ):
-                if not active:
-                    continue
-                if isinstance(action, StopAction):
-                    # Handle stop action
-                    active_mask[i] = False
-                    results[i] = {
-                        "success": examples[i].impossible,
-                        "recipe_type": examples[i].recipe_type,
-                        "complexity": examples[i].complexity_split,
-                        "number_of_steps": steps_taken[i],
-                        "model_trace": histories[i].trace(),
-                        "example_id": examples[i].id,
-                        "images": histories[i].images,
-                    }
-                    continue
+        observations = []
+        # Initialize observations (s0) and user messages from environment
+        for i in range(len(examples)):
+            obs = environments[i].step()
+            obs["target"] = examples[i].target
+            obs["message"] = self.convert_observation_to_message(obs, model=model)
+            observations.append(obs)
+        # Process until all done or max steps reached
+        while any(active_mask) and all(
+            history.num_steps < self.max_steps for history in histories
+        ):
+            # Gather active environments
+            active_indices = [
+                i
+                for i, active in enumerate(active_mask)
+                if active and histories[i].num_steps < self.max_steps
+            ]
+            if not active_indices:
+                break
-                # Get observation
-                if isinstance(action, str):
-                    obs = env.step()
-                    obs["target"] = examples[i].target
-                    obs["message"] = action
-                else:
-                    obs = env.step(action)
-                    obs["target"] = examples[i].target
-                    obs["message"] = self.convert_observation_to_message(
-                        obs, model=model
+            # For each active environment, add new obs to history for next iteration
+            for env_idx in active_indices:
+                if active_mask[env_idx]:
+                    histories[env_idx].add_observation_to_history(observations[env_idx])
+                    histories[env_idx].add_message_to_history(
+                        content=observations[env_idx]["message"], role="user"
                     )
-                    # Check if done
-                    if self.check_done(obs["inventory"], examples[i].target):
-                        active_mask[i] = False
-                        results[i] = {
-                            "success": True,
-                            "recipe_type": examples[i].recipe_type,
-                            "complexity": examples[i].complexity_split,
-                            "number_of_steps": steps_taken[i],
-                            "model_trace": histories[i].trace(),
-                            "example_id": examples[i].id,
-                            "images": histories[i].images,
-                        }
-                        continue
-                # Add to batch lists
-                active_indices.append(i)
-                observations.append(obs)
-                active_histories.append(histories[i])
-                # Update history
-                histories[i].add_observation_to_history(obs)
-                histories[i].add_message_to_history(content=obs["message"], role="user")
-                steps_taken[i] += 1
-            if not observations:
-                break
+            batch_observations = [observations[i] for i in active_indices]
+            batch_histories = [histories[i] for i in active_indices]
-            # Batch predict actions for active environments
+            # Predict next actions in batch
             raw_actions = model.batch_step(
-                observations, dialogue_histories=active_histories
+                batch_observations, dialogue_histories=batch_histories
             )
-            # Process actions for each active environment
-            for batch_idx, (idx, raw_action) in enumerate(
-                zip(active_indices, raw_actions)
-            ):
-                # if the model returns a PlancraftModelOutput, extract the action
+            # Process each raw action and update environment/history
+            successes = []
+            actions = []
+            for env_idx, raw_action in zip(active_indices, raw_actions):
+                # Add model's message to history
                 if isinstance(raw_action, PlancraftModelOutput):
-                    # add message to history
-                    histories[idx].add_message_to_history(
+                    histories[env_idx].add_message_to_history(
                         content=raw_action.action,
                         role="assistant",
                         **(raw_action.kwargs or {}),
                     )
-                    actions[idx] = self.parse_raw_model_response(
-                        raw_action.action,
-                        observation=observations[batch_idx],
-                        history=histories[idx],
-                    )
-                # if the model returns a string, parse the raw action
+                    raw_action = raw_action.action
                 elif isinstance(raw_action, str):
-                    # add message to history
-                    histories[idx].add_message_to_history(
+                    histories[env_idx].add_message_to_history(
                         content=raw_action, role="assistant"
                     )
-                    actions[idx] = self.parse_raw_model_response(
-                        raw_action,
-                        observation=observations[batch_idx],
-                        history=histories[idx],
-                    )
                 else:
                     raise ValueError(
-                        f"model.step() output must be a string or PlancraftModelOutput, got {type(raw_action)}"
+                        f"model.batch_step() must return list[str] or list[PlancraftModelOutput], got {type(raw_action)}"
                     )
-        # Fill in results for environments that didn't finish
+                # Parse action
+                action = self.parse_raw_model_response(
+                    raw_action,
+                    observation=observations[env_idx],
+                    history=histories[env_idx],
+                )
+                actions.append(action)
+                success = False
+                # If action is StopAction
+                if isinstance(action, StopAction):
+                    # if the action is StopAction and the example is impossible,
+                    # we consider that a 'success' in the sense that the model recognized it can't be done
+                    success = examples[env_idx].impossible
+                    observations[env_idx] = None
+                # If parsed action is a string, it's a message
+                elif isinstance(action, str):
+                    obs = environments[env_idx].step()
+                    obs["target"] = examples[env_idx].target
+                    obs["message"] = action
+                    observations[env_idx] = obs
+                # Otherwise it's an actual environment action
+                else:
+                    obs = environments[env_idx].step(action)
+                    obs["target"] = examples[env_idx].target
+                    obs["message"] = self.convert_observation_to_message(
+                        obs, model=model
+                    )
+                    observations[env_idx] = obs
+                    success = self.check_done(
+                        obs["inventory"], examples[env_idx].target
+                    )
+                successes.append(success)
+                # If done, or action was stop, mark inactive and store result
+                if (
+                    success
+                    or isinstance(action, StopAction)
+                    or histories[env_idx].num_steps >= self.max_steps
+                ):
+                    active_mask[env_idx] = False
+                    results[env_idx] = {
+                        "success": success,
+                        "recipe_type": examples[env_idx].recipe_type,
+                        "complexity": examples[env_idx].complexity_split,
+                        "number_of_steps": histories[env_idx].num_steps,
+                        "model_trace": histories[env_idx].trace(),
+                        "example_id": examples[env_idx].id,
+                        "images": histories[env_idx].images,
+                    }
+            # Update the model for this single environment
+            batch_observations = [observations[i] for i in active_indices]
+            batch_histories = [histories[i] for i in active_indices]
+            model.batch_update(
+                observations=batch_observations,
+                histories=batch_histories,
+                successes=successes,
+                actions=actions,
+            )
+        # Fill in results for any environment that never completed
         for i, result in enumerate(results):
             if result is None:
                 results[i] = {
                     "success": False,
                     "recipe_type": examples[i].recipe_type,
                     "complexity": examples[i].complexity_split,
-                    "number_of_steps": steps_taken[i],
+                    "number_of_steps": histories[i].num_steps,
                     "model_trace": histories[i].trace(),
                     "example_id": examples[i].id,
                     "images": histories[i].images,

plancraft/models/act.py CHANGED Viewed

@@ -72,3 +72,6 @@ class ActModel(PlancraftBaseModel):
         dialogue_history.tokens_used += action_token_used
         # return raw action message
         return action_messages[0].split("\n")[0].strip()
+    def update(self, **kwargs):
+        pass

plancraft/models/base.py CHANGED Viewed

@@ -33,3 +33,10 @@ class PlancraftBaseModel(abc.ABC):
         Reset the model state - ready for a new episode
         """
         raise NotImplementedError()
+    @abc.abstractmethod
+    def update(self, **kwargs) -> None:
+        """
+        Update the model state based on the dialogue history
+        """
+        raise NotImplementedError()

plancraft/models/dummy.py CHANGED Viewed

@@ -45,3 +45,9 @@ class DummyModel(PlancraftBaseModel):
         self, observations: list[dict], **kwargs
     ) -> list[PlancraftModelOutput]:
         return [self.step(observation) for observation in observations]
+    def update(self, **kwargs):
+        pass
+    def batch_update(self, **kwargs):
+        pass

plancraft/models/oracle.py CHANGED Viewed

@@ -47,3 +47,9 @@ class OracleModel(PlancraftBaseModel):
             action = self.step(observation)
             actions.append(action)
         return actions
+    def update(self, **kwargs):
+        pass
+    def batch_update(self, **kwargs):
+        pass

{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: plancraft
-Version: 0.3.27
+Version: 0.3.29
 Summary: Plancraft: an evaluation dataset for planning with LLM agents
 License: MIT License

{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
-plancraft/evaluator.py,sha256=dyszVJtTc_PThVEeGmp6YMkmEn4gaXQW52eWaKO2FQ8,17210
+plancraft/evaluator.py,sha256=v73k0O8mTUj87jC1ODL9w59IzBOoOJUfmYaB2x1s73U,18850
 plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
 plancraft/utils.py,sha256=VhnxMihh6pRhNjQTK5HDc0FYWmF9_EcQyRP_a7fbIZA,7156
 plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
@@ -1912,15 +1912,15 @@ plancraft/environment/tags/wooden_stairs.json,sha256=GCr2_5UGPMYZECqQ_5NYSvbwuwt
 plancraft/environment/tags/wooden_trapdoors.json,sha256=DbjfwoHJL8VuYWV61A1uDqW7LJsGlOP4eoxcGIQVYr4,303
 plancraft/environment/tags/wool.json,sha256=Z59l4mdPztVZBFaglJ4mV9H2OnyCVzhqQRi2dduak78,496
 plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,510
-plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
-plancraft/models/base.py,sha256=S8EdkqWpn8nE1WcrqDoA4Hx4p52qEttGxnqjIPWvl3Q,852
+plancraft/models/act.py,sha256=_OZo9a_6R0wajdR7axZarjI3IJP7glFrWeDIrbcHDmw,2737
+plancraft/models/base.py,sha256=Krm6MdOjU-qlps1WSX7pxdnqXLiyI3qsI9Na7Xk8r1c,1038
 plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
-plancraft/models/dummy.py,sha256=_NUTviv5ye6KGzODRt0Zykk8shsek0QBqWCeZW3ldSQ,1495
+plancraft/models/dummy.py,sha256=UWbW3bjrQr_0UYYrNf_D0jWpUq6e50vAp21F0zi8iFM,1593
 plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
-plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
+plancraft/models/oracle.py,sha256=jmt_kBBNXt0VWUX7q6OHkJoRZWItCMy4qGH5qbLSc1c,1755
 plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
 plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
-plancraft-0.3.27.dist-info/METADATA,sha256=fii20vfc62_UjIquU8OXEWtxRXfaaaA2lRo7EbFaQok,11148
-plancraft-0.3.27.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-plancraft-0.3.27.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
-plancraft-0.3.27.dist-info/RECORD,,
+plancraft-0.3.29.dist-info/METADATA,sha256=qLWNDUZpsYGVEGNvHwRwxf912NjcVihvN_5oTvyMG5c,11148
+plancraft-0.3.29.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+plancraft-0.3.29.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
+plancraft-0.3.29.dist-info/RECORD,,

{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/WHEEL RENAMED Viewed

File without changes

{plancraft-0.3.27.dist-info → plancraft-0.3.29.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

plancraft 0.3.27__py3-none-any.whl → 0.3.29__py3-none-any.whl

plancraft 0.3.27__py3-none-any.whl → 0.3.29__py3-none-any.whl