plancraft 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- plancraft/evaluator.py +3 -8
- plancraft/models/dummy.py +1 -2
- plancraft/utils.py +1 -11
- {plancraft-0.3.4.dist-info → plancraft-0.3.5.dist-info}/METADATA +14 -15
- {plancraft-0.3.4.dist-info → plancraft-0.3.5.dist-info}/RECORD +7 -7
- {plancraft-0.3.4.dist-info → plancraft-0.3.5.dist-info}/WHEEL +0 -0
- {plancraft-0.3.4.dist-info → plancraft-0.3.5.dist-info}/licenses/LICENSE +0 -0
plancraft/evaluator.py
CHANGED
@@ -213,11 +213,9 @@ class Evaluator:
|
|
213
213
|
num_non_env_actions += 1
|
214
214
|
# action is environment action
|
215
215
|
else:
|
216
|
-
# add action to history
|
217
216
|
if isinstance(action, str):
|
218
217
|
observation = self.environment.step()
|
219
218
|
else:
|
220
|
-
self.history.add_action_to_history(action)
|
221
219
|
observation = self.environment.step(action)
|
222
220
|
|
223
221
|
# convert inventory observation to text message
|
@@ -229,6 +227,9 @@ class Evaluator:
|
|
229
227
|
|
230
228
|
# check if the episode is done
|
231
229
|
success = self.check_done(observation["inventory"], example.target)
|
230
|
+
# exit if success
|
231
|
+
if success:
|
232
|
+
break
|
232
233
|
|
233
234
|
# add observation to history
|
234
235
|
self.history.add_observation_to_history(observation)
|
@@ -236,11 +237,6 @@ class Evaluator:
|
|
236
237
|
self.history.add_message_to_history(
|
237
238
|
content=observation["message"], role="user"
|
238
239
|
)
|
239
|
-
|
240
|
-
# exit if success
|
241
|
-
if success:
|
242
|
-
break
|
243
|
-
|
244
240
|
# predict next action
|
245
241
|
raw_action = self.model.step(observation, dialogue_history=self.history)
|
246
242
|
# add message to history
|
@@ -256,7 +252,6 @@ class Evaluator:
|
|
256
252
|
"number_of_steps": self.history.num_steps,
|
257
253
|
"model_trace": self.history.trace(),
|
258
254
|
"example_id": example.id,
|
259
|
-
"impossible": example.impossible,
|
260
255
|
}
|
261
256
|
|
262
257
|
def eval_all_examples(self, progress_bar=False) -> list:
|
plancraft/models/dummy.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
import random
|
2
2
|
|
3
|
-
from plancraft.config import EvalConfig
|
4
3
|
from plancraft.environment.actions import (
|
5
4
|
MoveAction,
|
6
5
|
)
|
@@ -12,7 +11,7 @@ class DummyModel(PlancraftBaseModel):
|
|
12
11
|
Dummy model returns actions that do random action
|
13
12
|
"""
|
14
13
|
|
15
|
-
def __init__(self, cfg
|
14
|
+
def __init__(self, cfg=None):
|
16
15
|
pass
|
17
16
|
|
18
17
|
def reset(self):
|
plancraft/utils.py
CHANGED
@@ -7,11 +7,7 @@ from typing import Optional
|
|
7
7
|
import torch
|
8
8
|
from loguru import logger
|
9
9
|
|
10
|
-
from plancraft.environment.actions import
|
11
|
-
ActionHandlerBase,
|
12
|
-
MoveAction,
|
13
|
-
SmeltAction,
|
14
|
-
)
|
10
|
+
from plancraft.environment.actions import ActionHandlerBase
|
15
11
|
from plancraft.environment.prompts import (
|
16
12
|
get_prompt_example,
|
17
13
|
get_system_prompt,
|
@@ -108,10 +104,6 @@ class History:
|
|
108
104
|
else:
|
109
105
|
self.dialogue_history.append({"role": role, "content": content})
|
110
106
|
|
111
|
-
def add_action_to_history(self, action: SmeltAction | MoveAction):
|
112
|
-
if isinstance(action, SmeltAction) or isinstance(action, MoveAction):
|
113
|
-
self.action_history.append(action.model_dump())
|
114
|
-
|
115
107
|
def add_inventory_to_history(self, inventory: dict):
|
116
108
|
self.inventory_history.append(inventory)
|
117
109
|
# count inventory
|
@@ -148,7 +140,6 @@ class History:
|
|
148
140
|
self.images = copy(self.prompt_images)
|
149
141
|
self.initial_dialogue_length = len(self.dialogue_history)
|
150
142
|
|
151
|
-
self.action_history = []
|
152
143
|
self.inventory_history = []
|
153
144
|
self.inventory_counters = []
|
154
145
|
|
@@ -159,7 +150,6 @@ class History:
|
|
159
150
|
"dialogue_history": copy(
|
160
151
|
self.dialogue_history[self.initial_dialogue_length :]
|
161
152
|
),
|
162
|
-
"action_history": copy(self.action_history),
|
163
153
|
"inventory_history": copy(self.inventory_history),
|
164
154
|
"tokens_used": copy(self.tokens_used),
|
165
155
|
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: plancraft
|
3
|
-
Version: 0.3.4
|
3
|
+
Version: 0.3.5
|
4
4
|
Summary: Plancraft: an evaluation dataset for planning with LLM agents
|
5
5
|
License: MIT License
|
6
6
|
|
@@ -72,6 +72,13 @@ You can install the package by running the following command:
|
|
72
72
|
pip install plancraft
|
73
73
|
```
|
74
74
|
|
75
|
+
Or:
|
76
|
+
|
77
|
+
```bash
|
78
|
+
uv add plancraft
|
79
|
+
```
|
80
|
+
|
81
|
+
|
75
82
|
![gif-example3](docs/images/train_images/TRAIN0010.gif)
|
76
83
|
![gif-example1](docs/images/train_images/TRAIN1133.gif)
|
77
84
|
![gif-example2](docs/images/train_images/TRAIN0383.gif)
|
@@ -117,17 +124,14 @@ The package also provides an `Evaluator` class that can be used to evaluate the
|
|
117
124
|
|
118
125
|
```python
|
119
126
|
from plancraft.evaluator import Evaluator
|
120
|
-
from plancraft.config import EvalConfig
|
121
127
|
|
122
128
|
def main():
|
123
|
-
# Create the config
|
124
|
-
config = EvalConfig(...)
|
125
129
|
# create model -- Note you can create your own model by subclassing PlancraftBaseModel
|
126
|
-
model = get_model(
|
130
|
+
model = get_model("dummy")
|
127
131
|
# Create the evaluator
|
128
|
-
evaluator = Evaluator(
|
132
|
+
evaluator = Evaluator(run_name="dummy", model=model)
|
129
133
|
# Evaluate the agent
|
130
|
-
evaluator.
|
134
|
+
evaluator.eval_all_examples()
|
131
135
|
```
|
132
136
|
|
133
137
|
The evaluator class handles the environment loop and model interaction. The environment is created based on the configuration and the examples are loaded from the dataset. The `Evaluator` uses the dataset examples and initializes the environment with the example's inventory. It is also responsible for early stopping and verifying the target object has been crafted. Finally, it also saves the results of the evaluation and the images generated during the evaluation.
|
@@ -159,7 +163,6 @@ while not history.check_stuck() and history.num_steps < max_steps:
|
|
159
163
|
# Handle invalid case (exceeded non-env action limit)
|
160
164
|
observation = environment.step()
|
161
165
|
else:
|
162
|
-
history.add_action_to_history(action) # Add action to history
|
163
166
|
observation = environment.step(action)
|
164
167
|
|
165
168
|
# Convert observation to message and reset non-env counter
|
@@ -170,19 +173,16 @@ while not history.check_stuck() and history.num_steps < max_steps:
|
|
170
173
|
# Check if episode is complete
|
171
174
|
success = check_done(observation["inventory"], example.target)
|
172
175
|
|
173
|
-
# Update history with observation and message
|
174
|
-
history.add_observation_to_history(observation)
|
175
|
-
history.add_message_to_history(content=observation["message"], role="user")
|
176
|
-
|
177
176
|
if success: # Exit loop if success
|
178
177
|
break
|
179
178
|
|
179
|
+
# Update history with observation and message
|
180
|
+
history.add_observation_to_history(observation)
|
181
|
+
history.add_message_to_history(content=observation["message"], role="user")
|
180
182
|
# Model predicts next action
|
181
183
|
raw_action = model.step(observation, dialogue_history=history)
|
182
|
-
|
183
184
|
# Update history with predicted action
|
184
185
|
history.add_message_to_history(content=raw_action, role="assistant")
|
185
|
-
|
186
186
|
# Parse raw action into a structured format
|
187
187
|
action = parse_raw_model_response(raw_action)
|
188
188
|
|
@@ -194,7 +194,6 @@ return {
|
|
194
194
|
"number_of_steps": history.num_steps,
|
195
195
|
"model_trace": history.trace(),
|
196
196
|
"example_id": example.id,
|
197
|
-
"impossible": example.impossible,
|
198
197
|
}
|
199
198
|
```
|
200
199
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
plancraft/config.py,sha256=Ppkps-E8xDNYEP9prOVxW2zEG9MpWVzcLJi4tmGLjuQ,4285
|
3
|
-
plancraft/evaluator.py,sha256=
|
3
|
+
plancraft/evaluator.py,sha256=dTsE3FiQTJc094TmBvfBvefOpGSYcePIGVT36OEIClU,10910
|
4
4
|
plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
|
5
|
-
plancraft/utils.py,sha256=
|
5
|
+
plancraft/utils.py,sha256=8bO8wrblmIW1aXEJre7ALGbL6GvuFrY38aZDdA_8W-g,6882
|
6
6
|
plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
|
7
7
|
plancraft/data/test.small.easy.json,sha256=5NZEJ2PqIgmHQecJOIVQyM1D6GFKyJq7GVmgRudaqQk,189304
|
8
8
|
plancraft/data/test.small.json,sha256=eULAG1rdolRMXPrecV-7YoDIheKGyIT5MVpWdISV0wg,270089
|
@@ -1915,12 +1915,12 @@ plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,
|
|
1915
1915
|
plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
|
1916
1916
|
plancraft/models/base.py,sha256=uhG1tRmsBerJzW8qHoLyLEYpveDv0co7AAhi4mSfyO4,661
|
1917
1917
|
plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
|
1918
|
-
plancraft/models/dummy.py,sha256=
|
1918
|
+
plancraft/models/dummy.py,sha256=jBxke6VNpyYh_HBcFxCx64djO5F3wr5GbbnC0XePZ20,1015
|
1919
1919
|
plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
|
1920
1920
|
plancraft/models/oracle.py,sha256=jDCE6zVFvbwFpDzQZTkHIlRwMud1yMJ4LVIdfpt5ddU,8449
|
1921
1921
|
plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
|
1922
1922
|
plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
|
1923
|
-
plancraft-0.3.
|
1924
|
-
plancraft-0.3.
|
1925
|
-
plancraft-0.3.
|
1926
|
-
plancraft-0.3.
|
1923
|
+
plancraft-0.3.5.dist-info/METADATA,sha256=QxQSXPXF162We8KwESaZ-nn94gqfz_5PQaXNDWkvV1Y,11147
|
1924
|
+
plancraft-0.3.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
1925
|
+
plancraft-0.3.5.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
|
1926
|
+
plancraft-0.3.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|