PyPI - plancraft - Versions diffs - 0.3.29__py3-none-any.whl → 0.3.31__py3-none-any.whl - Mend

plancraft 0.3.29__py3-none-any.whl → 0.3.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

plancraft/evaluator.py +100 -119
plancraft/models/act.py +0 -3
plancraft/models/base.py +0 -7
plancraft/models/dummy.py +0 -6
plancraft/models/oracle.py +0 -6
{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/METADATA +1 -1
{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/RECORD +9 -9
{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/WHEEL +0 -0
{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/licenses/LICENSE +0 -0

plancraft/evaluator.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import os
 from typing import Optional
 from copy import deepcopy
+from collections import deque
 import imageio
 from tqdm import tqdm
@@ -244,16 +245,6 @@ class Evaluator:
                 # check if the episode is done
                 success = self.check_done(observation["inventory"], example.target)
-            # update model with success or failure
-            # observation is the next state after the action (s1)
-            # history is the dialogue history
-            # -- the last message contains the action taken (a0)
-            # -- the second to last message is the observation (s0)
-            # success is whether the episode is sucessful (r)
-            model.update(
-                observation=observation, history=history, success=success, action=action
-            )
             # exit if success
             if success or isinstance(action, StopAction):
                 break
@@ -273,161 +264,151 @@ class Evaluator:
         self,
         examples: list[PlancraftExample],
         model,
+        batch_size: int = 4,
+        callback_fn: Optional[callable] = None,
     ) -> list:
         """
-        Similar to eval_example, but processes multiple examples at once.
+        Processes examples in batches with dynamic replacement from a queue.
-        Tracks which environments are still active until they've either succeeded,
-        reached max steps, or invoked StopAction.
+        Args:
+            examples: List of examples to process
+            model: Model to use for evaluation
+            batch_size: Maximum number of concurrent environments
+            callback_fn: Optional callback function to call after each result
         """
-        # Initialize environments and histories
-        environments = [
-            PlancraftEnvironment(
-                inventory=deepcopy(examples[i].slotted_inventory),
+        pending_examples = deque(examples)
+        active_examples = []
+        active_environments = []
+        active_histories = []
+        active_observations = []
+        results = {ex.id: None for ex in examples}
+        # Initialize first batch
+        while len(active_examples) < batch_size and pending_examples:
+            example = pending_examples.popleft()
+            env = PlancraftEnvironment(
+                inventory=deepcopy(example.slotted_inventory),
                 resolution=self.resolution,
             )
-            for i in range(len(examples))
-        ]
-        histories = [self.create_history() for _ in range(len(examples))]
-        # Track which environments are still active
-        active_mask = [True for _ in range(len(examples))]
-        results = [None for _ in range(len(examples))]
-        observations = []
-        # Initialize observations (s0) and user messages from environment
-        for i in range(len(examples)):
-            obs = environments[i].step()
-            obs["target"] = examples[i].target
+            history = self.create_history()
+            obs = env.step()
+            obs["target"] = example.target
             obs["message"] = self.convert_observation_to_message(obs, model=model)
-            observations.append(obs)
-        # Process until all done or max steps reached
-        while any(active_mask) and all(
-            history.num_steps < self.max_steps for history in histories
-        ):
-            # Gather active environments
-            active_indices = [
-                i
-                for i, active in enumerate(active_mask)
-                if active and histories[i].num_steps < self.max_steps
-            ]
-            if not active_indices:
-                break
-            # For each active environment, add new obs to history for next iteration
-            for env_idx in active_indices:
-                if active_mask[env_idx]:
-                    histories[env_idx].add_observation_to_history(observations[env_idx])
-                    histories[env_idx].add_message_to_history(
-                        content=observations[env_idx]["message"], role="user"
-                    )
-            batch_observations = [observations[i] for i in active_indices]
-            batch_histories = [histories[i] for i in active_indices]
+            active_examples.append(example)
+            active_environments.append(env)
+            active_histories.append(history)
+            active_observations.append(obs)
+        # Process until all examples are done
+        while active_examples:
+            # Add observations to histories
+            for i in range(len(active_examples)):
+                active_histories[i].add_observation_to_history(active_observations[i])
+                active_histories[i].add_message_to_history(
+                    content=active_observations[i]["message"], role="user"
+                )
-            # Predict next actions in batch
+            # Get model predictions for current batch
             raw_actions = model.batch_step(
-                batch_observations, dialogue_histories=batch_histories
+                active_observations, dialogue_histories=active_histories
             )
-            # Process each raw action and update environment/history
+            # Process each active environment
+            completed_indices = []
             successes = []
             actions = []
-            for env_idx, raw_action in zip(active_indices, raw_actions):
-                # Add model's message to history
+            for i, (example, raw_action) in enumerate(
+                zip(active_examples, raw_actions)
+            ):
+                # Handle model output
                 if isinstance(raw_action, PlancraftModelOutput):
-                    histories[env_idx].add_message_to_history(
+                    active_histories[i].add_message_to_history(
                         content=raw_action.action,
                         role="assistant",
                         **(raw_action.kwargs or {}),
                     )
                     raw_action = raw_action.action
-                elif isinstance(raw_action, str):
-                    histories[env_idx].add_message_to_history(
-                        content=raw_action, role="assistant"
-                    )
                 else:
-                    raise ValueError(
-                        f"model.batch_step() must return list[str] or list[PlancraftModelOutput], got {type(raw_action)}"
+                    active_histories[i].add_message_to_history(
+                        content=raw_action, role="assistant"
                     )
-                # Parse action
+                # Parse and execute action
                 action = self.parse_raw_model_response(
                     raw_action,
-                    observation=observations[env_idx],
-                    history=histories[env_idx],
+                    observation=active_observations[i],
+                    history=active_histories[i],
                 )
                 actions.append(action)
                 success = False
-                # If action is StopAction
                 if isinstance(action, StopAction):
-                    # if the action is StopAction and the example is impossible,
-                    # we consider that a 'success' in the sense that the model recognized it can't be done
-                    success = examples[env_idx].impossible
-                    observations[env_idx] = None
-                # If parsed action is a string, it's a message
+                    success = example.impossible
+                    active_observations[i] = None
                 elif isinstance(action, str):
-                    obs = environments[env_idx].step()
-                    obs["target"] = examples[env_idx].target
+                    obs = active_environments[i].step()
+                    obs["target"] = example.target
                     obs["message"] = action
-                    observations[env_idx] = obs
-                # Otherwise it's an actual environment action
+                    active_observations[i] = obs
                 else:
-                    obs = environments[env_idx].step(action)
-                    obs["target"] = examples[env_idx].target
+                    obs = active_environments[i].step(action)
+                    obs["target"] = example.target
                     obs["message"] = self.convert_observation_to_message(
                         obs, model=model
                     )
-                    observations[env_idx] = obs
-                    success = self.check_done(
-                        obs["inventory"], examples[env_idx].target
-                    )
+                    active_observations[i] = obs
+                    success = self.check_done(obs["inventory"], example.target)
                 successes.append(success)
-                # If done, or action was stop, mark inactive and store result
+                # Check if environment is done
                 if (
                     success
                     or isinstance(action, StopAction)
-                    or histories[env_idx].num_steps >= self.max_steps
+                    or active_histories[i].num_steps >= self.max_steps
                 ):
-                    active_mask[env_idx] = False
-                    results[env_idx] = {
+                    results[example.id] = {
                         "success": success,
-                        "recipe_type": examples[env_idx].recipe_type,
-                        "complexity": examples[env_idx].complexity_split,
-                        "number_of_steps": histories[env_idx].num_steps,
-                        "model_trace": histories[env_idx].trace(),
-                        "example_id": examples[env_idx].id,
-                        "images": histories[env_idx].images,
+                        "recipe_type": example.recipe_type,
+                        "complexity": example.complexity_split,
+                        "number_of_steps": active_histories[i].num_steps,
+                        "model_trace": active_histories[i].trace(),
+                        "example_id": example.id,
+                        "images": active_histories[i].images,
                     }
+                    completed_indices.append(i)
+                    if callback_fn:
+                        callback_fn(results[example.id])
+            # Remove completed environments and replace with new ones
+            for i in reversed(completed_indices):
+                active_examples.pop(i)
+                active_environments.pop(i)
+                active_histories.pop(i)
+                active_observations.pop(i)
+                # Add new environment if there are pending examples
+                if pending_examples:
+                    example = pending_examples.popleft()
+                    env = PlancraftEnvironment(
+                        inventory=deepcopy(example.slotted_inventory),
+                        resolution=self.resolution,
+                    )
+                    history = self.create_history()
+                    obs = env.step()
+                    obs["target"] = example.target
+                    obs["message"] = self.convert_observation_to_message(
+                        obs, model=model
+                    )
-            # Update the model for this single environment
-            batch_observations = [observations[i] for i in active_indices]
-            batch_histories = [histories[i] for i in active_indices]
-            model.batch_update(
-                observations=batch_observations,
-                histories=batch_histories,
-                successes=successes,
-                actions=actions,
-            )
-        # Fill in results for any environment that never completed
-        for i, result in enumerate(results):
-            if result is None:
-                results[i] = {
-                    "success": False,
-                    "recipe_type": examples[i].recipe_type,
-                    "complexity": examples[i].complexity_split,
-                    "number_of_steps": histories[i].num_steps,
-                    "model_trace": histories[i].trace(),
-                    "example_id": examples[i].id,
-                    "images": histories[i].images,
-                }
+                    active_examples.append(example)
+                    active_environments.append(env)
+                    active_histories.append(history)
+                    active_observations.append(obs)
-        return results
+        return list(results.values())
     def eval_all_examples(self, model, progress_bar=False) -> list:
         results = []

plancraft/models/act.py CHANGED Viewed

@@ -72,6 +72,3 @@ class ActModel(PlancraftBaseModel):
         dialogue_history.tokens_used += action_token_used
         # return raw action message
         return action_messages[0].split("\n")[0].strip()
-    def update(self, **kwargs):
-        pass

plancraft/models/base.py CHANGED Viewed

@@ -33,10 +33,3 @@ class PlancraftBaseModel(abc.ABC):
         Reset the model state - ready for a new episode
         """
         raise NotImplementedError()
-    @abc.abstractmethod
-    def update(self, **kwargs) -> None:
-        """
-        Update the model state based on the dialogue history
-        """
-        raise NotImplementedError()

plancraft/models/dummy.py CHANGED Viewed

@@ -45,9 +45,3 @@ class DummyModel(PlancraftBaseModel):
         self, observations: list[dict], **kwargs
     ) -> list[PlancraftModelOutput]:
         return [self.step(observation) for observation in observations]
-    def update(self, **kwargs):
-        pass
-    def batch_update(self, **kwargs):
-        pass

plancraft/models/oracle.py CHANGED Viewed

@@ -47,9 +47,3 @@ class OracleModel(PlancraftBaseModel):
             action = self.step(observation)
             actions.append(action)
         return actions
-    def update(self, **kwargs):
-        pass
-    def batch_update(self, **kwargs):
-        pass

{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: plancraft
-Version: 0.3.29
+Version: 0.3.31
 Summary: Plancraft: an evaluation dataset for planning with LLM agents
 License: MIT License

{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
-plancraft/evaluator.py,sha256=v73k0O8mTUj87jC1ODL9w59IzBOoOJUfmYaB2x1s73U,18850
+plancraft/evaluator.py,sha256=VteLAT_rPogw8NYZos7jEuuakyfE_3CsFuv6A39Geyw,17614
 plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
 plancraft/utils.py,sha256=VhnxMihh6pRhNjQTK5HDc0FYWmF9_EcQyRP_a7fbIZA,7156
 plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
@@ -1912,15 +1912,15 @@ plancraft/environment/tags/wooden_stairs.json,sha256=GCr2_5UGPMYZECqQ_5NYSvbwuwt
 plancraft/environment/tags/wooden_trapdoors.json,sha256=DbjfwoHJL8VuYWV61A1uDqW7LJsGlOP4eoxcGIQVYr4,303
 plancraft/environment/tags/wool.json,sha256=Z59l4mdPztVZBFaglJ4mV9H2OnyCVzhqQRi2dduak78,496
 plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,510
-plancraft/models/act.py,sha256=_OZo9a_6R0wajdR7axZarjI3IJP7glFrWeDIrbcHDmw,2737
-plancraft/models/base.py,sha256=Krm6MdOjU-qlps1WSX7pxdnqXLiyI3qsI9Na7Xk8r1c,1038
+plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
+plancraft/models/base.py,sha256=S8EdkqWpn8nE1WcrqDoA4Hx4p52qEttGxnqjIPWvl3Q,852
 plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
-plancraft/models/dummy.py,sha256=UWbW3bjrQr_0UYYrNf_D0jWpUq6e50vAp21F0zi8iFM,1593
+plancraft/models/dummy.py,sha256=_NUTviv5ye6KGzODRt0Zykk8shsek0QBqWCeZW3ldSQ,1495
 plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
-plancraft/models/oracle.py,sha256=jmt_kBBNXt0VWUX7q6OHkJoRZWItCMy4qGH5qbLSc1c,1755
+plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
 plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
 plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
-plancraft-0.3.29.dist-info/METADATA,sha256=qLWNDUZpsYGVEGNvHwRwxf912NjcVihvN_5oTvyMG5c,11148
-plancraft-0.3.29.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-plancraft-0.3.29.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
-plancraft-0.3.29.dist-info/RECORD,,
+plancraft-0.3.31.dist-info/METADATA,sha256=gU6j3SQEGdXIeW1pab_Pz6hspDhl_g0vaPIkIXRScYo,11148
+plancraft-0.3.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+plancraft-0.3.31.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
+plancraft-0.3.31.dist-info/RECORD,,

{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/WHEEL RENAMED Viewed

File without changes

{plancraft-0.3.29.dist-info → plancraft-0.3.31.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

plancraft 0.3.29__py3-none-any.whl → 0.3.31__py3-none-any.whl

plancraft 0.3.29__py3-none-any.whl → 0.3.31__py3-none-any.whl