plancraft 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- plancraft/environment/prompts.py +13 -2
- plancraft/evaluator.py +14 -10
- plancraft/models/dummy.py +1 -2
- plancraft/utils.py +22 -29
- {plancraft-0.3.3.dist-info → plancraft-0.3.5.dist-info}/METADATA +14 -15
- {plancraft-0.3.3.dist-info → plancraft-0.3.5.dist-info}/RECORD +8 -8
- {plancraft-0.3.3.dist-info → plancraft-0.3.5.dist-info}/WHEEL +0 -0
- {plancraft-0.3.3.dist-info → plancraft-0.3.5.dist-info}/licenses/LICENSE +0 -0
plancraft/environment/prompts.py
CHANGED
@@ -59,7 +59,8 @@ SEARCH_STEPS = [
|
|
59
59
|
|
60
60
|
def get_system_prompt(
|
61
61
|
handlers: list[ActionHandlerBase] = [MoveActionHandler(), SmeltActionHandler()],
|
62
|
-
|
62
|
+
use_multimodal_content_format=False,
|
63
|
+
) -> dict:
|
63
64
|
action_names = [handler.action_name for handler in handlers]
|
64
65
|
assert "move" in action_names, "MoveActionHandler should be one of the handlers"
|
65
66
|
assert "smelt" in action_names, "SmeltActionHandler should be one of the handlers"
|
@@ -72,7 +73,17 @@ def get_system_prompt(
|
|
72
73
|
for handler in handlers:
|
73
74
|
output_format += f"\n\t- {handler.prompt_format_example}"
|
74
75
|
|
75
|
-
|
76
|
+
system_prompt_text = f"{BASE_SYSTEM_PROMPT}\n\nActions:{descriptions}\n\nFormat{output_format}\n\n{BASE_SYSTEM_PROMPT_EXAMPLE}"
|
77
|
+
|
78
|
+
if use_multimodal_content_format:
|
79
|
+
return {
|
80
|
+
"role": "system",
|
81
|
+
"content": [{"text": system_prompt_text, "type": "text"}],
|
82
|
+
}
|
83
|
+
return {
|
84
|
+
"role": "system",
|
85
|
+
"content": system_prompt_text,
|
86
|
+
}
|
76
87
|
|
77
88
|
|
78
89
|
def get_prompt_example(
|
plancraft/evaluator.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import json
|
2
2
|
import os
|
3
|
+
from typing import Optional
|
3
4
|
|
4
5
|
import imageio
|
5
6
|
from loguru import logger
|
@@ -8,18 +9,18 @@ from tqdm import tqdm
|
|
8
9
|
import wandb
|
9
10
|
from plancraft.config import PlancraftExample
|
10
11
|
from plancraft.environment.actions import (
|
11
|
-
StopAction,
|
12
12
|
ActionHandlerBase,
|
13
13
|
MoveActionHandler,
|
14
14
|
SmeltActionHandler,
|
15
|
+
StopAction,
|
15
16
|
)
|
16
17
|
from plancraft.environment.env import (
|
17
18
|
PlancraftEnvironment,
|
18
19
|
get_objective_str,
|
19
20
|
target_and_inventory_to_text_obs,
|
20
21
|
)
|
21
|
-
from plancraft.utils import History
|
22
22
|
from plancraft.models.base import PlancraftBaseModel
|
23
|
+
from plancraft.utils import History
|
23
24
|
|
24
25
|
|
25
26
|
class Evaluator:
|
@@ -48,6 +49,10 @@ class Evaluator:
|
|
48
49
|
use_images: bool = False,
|
49
50
|
use_text_inventory: bool = False,
|
50
51
|
use_fasterrcnn: bool = False,
|
52
|
+
system_prompt: Optional[dict] = None,
|
53
|
+
prompt_examples: list[dict] = [],
|
54
|
+
prompt_images: list[str] = [],
|
55
|
+
few_shot: bool = True,
|
51
56
|
):
|
52
57
|
self.run_name = run_name
|
53
58
|
self.use_multimodal_content_format = use_multimodal_content_format
|
@@ -77,6 +82,10 @@ class Evaluator:
|
|
77
82
|
use_images=use_images,
|
78
83
|
use_text_inventory=use_text_inventory,
|
79
84
|
resolution=resolution,
|
85
|
+
few_shot=few_shot,
|
86
|
+
system_prompt=system_prompt,
|
87
|
+
prompt_examples=prompt_examples,
|
88
|
+
prompt_images=prompt_images,
|
80
89
|
)
|
81
90
|
|
82
91
|
# load model
|
@@ -204,11 +213,9 @@ class Evaluator:
|
|
204
213
|
num_non_env_actions += 1
|
205
214
|
# action is environment action
|
206
215
|
else:
|
207
|
-
# add action to history
|
208
216
|
if isinstance(action, str):
|
209
217
|
observation = self.environment.step()
|
210
218
|
else:
|
211
|
-
self.history.add_action_to_history(action)
|
212
219
|
observation = self.environment.step(action)
|
213
220
|
|
214
221
|
# convert inventory observation to text message
|
@@ -220,6 +227,9 @@ class Evaluator:
|
|
220
227
|
|
221
228
|
# check if the episode is done
|
222
229
|
success = self.check_done(observation["inventory"], example.target)
|
230
|
+
# exit if success
|
231
|
+
if success:
|
232
|
+
break
|
223
233
|
|
224
234
|
# add observation to history
|
225
235
|
self.history.add_observation_to_history(observation)
|
@@ -227,11 +237,6 @@ class Evaluator:
|
|
227
237
|
self.history.add_message_to_history(
|
228
238
|
content=observation["message"], role="user"
|
229
239
|
)
|
230
|
-
|
231
|
-
# exit if success
|
232
|
-
if success:
|
233
|
-
break
|
234
|
-
|
235
240
|
# predict next action
|
236
241
|
raw_action = self.model.step(observation, dialogue_history=self.history)
|
237
242
|
# add message to history
|
@@ -247,7 +252,6 @@ class Evaluator:
|
|
247
252
|
"number_of_steps": self.history.num_steps,
|
248
253
|
"model_trace": self.history.trace(),
|
249
254
|
"example_id": example.id,
|
250
|
-
"impossible": example.impossible,
|
251
255
|
}
|
252
256
|
|
253
257
|
def eval_all_examples(self, progress_bar=False) -> list:
|
plancraft/models/dummy.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
import random
|
2
2
|
|
3
|
-
from plancraft.config import EvalConfig
|
4
3
|
from plancraft.environment.actions import (
|
5
4
|
MoveAction,
|
6
5
|
)
|
@@ -12,7 +11,7 @@ class DummyModel(PlancraftBaseModel):
|
|
12
11
|
Dummy model returns actions that do random action
|
13
12
|
"""
|
14
13
|
|
15
|
-
def __init__(self, cfg: EvalConfig):
|
14
|
+
def __init__(self, cfg=None):
|
16
15
|
pass
|
17
16
|
|
18
17
|
def reset(self):
|
plancraft/utils.py
CHANGED
@@ -2,18 +2,15 @@ import glob
|
|
2
2
|
import pathlib
|
3
3
|
from collections import Counter
|
4
4
|
from copy import copy
|
5
|
+
from typing import Optional
|
5
6
|
|
6
7
|
import torch
|
7
8
|
from loguru import logger
|
8
9
|
|
9
|
-
from plancraft.environment.actions import (
|
10
|
-
ActionHandlerBase,
|
11
|
-
MoveAction,
|
12
|
-
SmeltAction,
|
13
|
-
)
|
10
|
+
from plancraft.environment.actions import ActionHandlerBase
|
14
11
|
from plancraft.environment.prompts import (
|
15
|
-
get_system_prompt,
|
16
12
|
get_prompt_example,
|
13
|
+
get_system_prompt,
|
17
14
|
load_prompt_images,
|
18
15
|
)
|
19
16
|
|
@@ -35,6 +32,9 @@ class History:
|
|
35
32
|
use_images=False,
|
36
33
|
use_text_inventory=False,
|
37
34
|
resolution="high",
|
35
|
+
system_prompt: Optional[dict] = None,
|
36
|
+
prompt_examples: list[dict] = [],
|
37
|
+
prompt_images: list[str] = [],
|
38
38
|
):
|
39
39
|
self.action_handlers = actions
|
40
40
|
self.use_multimodal_content_format = use_multimodal_content_format
|
@@ -49,31 +49,30 @@ class History:
|
|
49
49
|
|
50
50
|
self.tokens_used = 0
|
51
51
|
|
52
|
+
# use system prompt if provided
|
53
|
+
if system_prompt:
|
54
|
+
self.system_prompt_dialogue = system_prompt
|
55
|
+
else:
|
56
|
+
# generate system prompt
|
57
|
+
self.system_prompt_dialogue = get_system_prompt(
|
58
|
+
handlers=self.action_handlers,
|
59
|
+
use_multimodal_content_format=self.use_multimodal_content_format,
|
60
|
+
)
|
61
|
+
|
52
62
|
# set up dialogue history with few-shot prompt
|
63
|
+
self.prompt_examples = prompt_examples
|
64
|
+
self.prompt_images = prompt_images
|
53
65
|
self.set_up_few_shot_prompt()
|
54
|
-
self.system_prompt_dialogue = self.system_prompt()
|
55
66
|
|
56
67
|
self.dialogue_history = copy(self.prompt_examples)
|
57
68
|
self.images = copy(self.prompt_images)
|
58
69
|
self.initial_dialogue_length = len(self.dialogue_history)
|
59
70
|
|
60
|
-
def system_prompt(self):
|
61
|
-
# kept separate from dialogue history because certain models deal with system prompt differently
|
62
|
-
system_prompt_text = get_system_prompt(handlers=self.action_handlers)
|
63
|
-
if self.use_multimodal_content_format:
|
64
|
-
return {
|
65
|
-
"role": "system",
|
66
|
-
"content": [{"text": system_prompt_text, "type": "text"}],
|
67
|
-
}
|
68
|
-
return {
|
69
|
-
"role": "system",
|
70
|
-
"content": system_prompt_text,
|
71
|
-
}
|
72
|
-
|
73
71
|
def set_up_few_shot_prompt(self):
|
74
|
-
|
75
|
-
self.
|
76
|
-
|
72
|
+
# if either prompt_examples or prompt_images are provided, skip
|
73
|
+
if self.prompt_examples or self.prompt_images:
|
74
|
+
return
|
75
|
+
# if few-shot is not enabled, skip
|
77
76
|
if self.few_shot:
|
78
77
|
self.prompt_examples = get_prompt_example(
|
79
78
|
self.action_handlers,
|
@@ -105,10 +104,6 @@ class History:
|
|
105
104
|
else:
|
106
105
|
self.dialogue_history.append({"role": role, "content": content})
|
107
106
|
|
108
|
-
def add_action_to_history(self, action: SmeltAction | MoveAction):
|
109
|
-
if isinstance(action, SmeltAction) or isinstance(action, MoveAction):
|
110
|
-
self.action_history.append(action.model_dump())
|
111
|
-
|
112
107
|
def add_inventory_to_history(self, inventory: dict):
|
113
108
|
self.inventory_history.append(inventory)
|
114
109
|
# count inventory
|
@@ -145,7 +140,6 @@ class History:
|
|
145
140
|
self.images = copy(self.prompt_images)
|
146
141
|
self.initial_dialogue_length = len(self.dialogue_history)
|
147
142
|
|
148
|
-
self.action_history = []
|
149
143
|
self.inventory_history = []
|
150
144
|
self.inventory_counters = []
|
151
145
|
|
@@ -156,7 +150,6 @@ class History:
|
|
156
150
|
"dialogue_history": copy(
|
157
151
|
self.dialogue_history[self.initial_dialogue_length :]
|
158
152
|
),
|
159
|
-
"action_history": copy(self.action_history),
|
160
153
|
"inventory_history": copy(self.inventory_history),
|
161
154
|
"tokens_used": copy(self.tokens_used),
|
162
155
|
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: plancraft
|
3
|
-
Version: 0.3.3
|
3
|
+
Version: 0.3.5
|
4
4
|
Summary: Plancraft: an evaluation dataset for planning with LLM agents
|
5
5
|
License: MIT License
|
6
6
|
|
@@ -72,6 +72,13 @@ You can install the package by running the following command:
|
|
72
72
|
pip install plancraft
|
73
73
|
```
|
74
74
|
|
75
|
+
Or:
|
76
|
+
|
77
|
+
```bash
|
78
|
+
uv add plancraft
|
79
|
+
```
|
80
|
+
|
81
|
+
|
75
82
|
![gif-example3](docs/images/train_images/TRAIN0010.gif)
|
76
83
|
![gif-example1](docs/images/train_images/TRAIN1133.gif)
|
77
84
|
![gif-example2](docs/images/train_images/TRAIN0383.gif)
|
@@ -117,17 +124,14 @@ The package also provides an `Evaluator` class that can be used to evaluate the
|
|
117
124
|
|
118
125
|
```python
|
119
126
|
from plancraft.evaluator import Evaluator
|
120
|
-
from plancraft.config import EvalConfig
|
121
127
|
|
122
128
|
def main():
|
123
|
-
# Create the config
|
124
|
-
config = EvalConfig(...)
|
125
129
|
# create model -- Note you can create your own model by subclassing PlancraftBaseModel
|
126
|
-
model = get_model(
|
130
|
+
model = get_model("dummy")
|
127
131
|
# Create the evaluator
|
128
|
-
evaluator = Evaluator(
|
132
|
+
evaluator = Evaluator(run_name="dummy", model=model)
|
129
133
|
# Evaluate the agent
|
130
|
-
evaluator.
|
134
|
+
evaluator.eval_all_examples()
|
131
135
|
```
|
132
136
|
|
133
137
|
The evaluator class handles the environment loop and model interaction. The environment is created based on the configuration and the examples are loaded from the dataset. The `Evaluator` uses the dataset examples and initializes the environment with the example's inventory. It is also responsible for early stopping and verifying the target object has been crafted. Finally, it also saves the results of the evaluation and the images generated during the evaluation.
|
@@ -159,7 +163,6 @@ while not history.check_stuck() and history.num_steps < max_steps:
|
|
159
163
|
# Handle invalid case (exceeded non-env action limit)
|
160
164
|
observation = environment.step()
|
161
165
|
else:
|
162
|
-
history.add_action_to_history(action) # Add action to history
|
163
166
|
observation = environment.step(action)
|
164
167
|
|
165
168
|
# Convert observation to message and reset non-env counter
|
@@ -170,19 +173,16 @@ while not history.check_stuck() and history.num_steps < max_steps:
|
|
170
173
|
# Check if episode is complete
|
171
174
|
success = check_done(observation["inventory"], example.target)
|
172
175
|
|
173
|
-
# Update history with observation and message
|
174
|
-
history.add_observation_to_history(observation)
|
175
|
-
history.add_message_to_history(content=observation["message"], role="user")
|
176
|
-
|
177
176
|
if success: # Exit loop if success
|
178
177
|
break
|
179
178
|
|
179
|
+
# Update history with observation and message
|
180
|
+
history.add_observation_to_history(observation)
|
181
|
+
history.add_message_to_history(content=observation["message"], role="user")
|
180
182
|
# Model predicts next action
|
181
183
|
raw_action = model.step(observation, dialogue_history=history)
|
182
|
-
|
183
184
|
# Update history with predicted action
|
184
185
|
history.add_message_to_history(content=raw_action, role="assistant")
|
185
|
-
|
186
186
|
# Parse raw action into a structured format
|
187
187
|
action = parse_raw_model_response(raw_action)
|
188
188
|
|
@@ -194,7 +194,6 @@ return {
|
|
194
194
|
"number_of_steps": history.num_steps,
|
195
195
|
"model_trace": history.trace(),
|
196
196
|
"example_id": example.id,
|
197
|
-
"impossible": example.impossible,
|
198
197
|
}
|
199
198
|
```
|
200
199
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
plancraft/config.py,sha256=Ppkps-E8xDNYEP9prOVxW2zEG9MpWVzcLJi4tmGLjuQ,4285
|
3
|
-
plancraft/evaluator.py,sha256=
|
3
|
+
plancraft/evaluator.py,sha256=dTsE3FiQTJc094TmBvfBvefOpGSYcePIGVT36OEIClU,10910
|
4
4
|
plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
|
5
|
-
plancraft/utils.py,sha256=
|
5
|
+
plancraft/utils.py,sha256=8bO8wrblmIW1aXEJre7ALGbL6GvuFrY38aZDdA_8W-g,6882
|
6
6
|
plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
|
7
7
|
plancraft/data/test.small.easy.json,sha256=5NZEJ2PqIgmHQecJOIVQyM1D6GFKyJq7GVmgRudaqQk,189304
|
8
8
|
plancraft/data/test.small.json,sha256=eULAG1rdolRMXPrecV-7YoDIheKGyIT5MVpWdISV0wg,270089
|
@@ -15,7 +15,7 @@ plancraft/environment/actions.py,sha256=D9QqBW7yWsbWCjxNyWp61Xtb0c6EtyXk3PZ1I8SR
|
|
15
15
|
plancraft/environment/env.py,sha256=F5xo1eAJ9MeuoE2IpG_LtbaE0BGd66URPB_rehAWIiU,16372
|
16
16
|
plancraft/environment/items.py,sha256=Z9rhSyVDEoHF1pxRvhyiT94tyQJaWHi3wUHVcamz82o,221
|
17
17
|
plancraft/environment/planner.py,sha256=eJExz3OxSzurIEdH9LOtMwFH9ApqMQ3CokVhmbV6Px0,3953
|
18
|
-
plancraft/environment/prompts.py,sha256=
|
18
|
+
plancraft/environment/prompts.py,sha256=8QXclX0ygpL02uZichE1AVkbdn_0HGteD5bzo0FZGOU,6947
|
19
19
|
plancraft/environment/recipes.py,sha256=0vwzOU86eZmGN2EpZVSIvzxpx0AOBWNPxTtAOFBN2A0,19570
|
20
20
|
plancraft/environment/sampler.py,sha256=IZT-XjmWSZrs0zDyRTMjYytXxewdwYf5YGGdKsR5ll4,7643
|
21
21
|
plancraft/environment/search.py,sha256=uFHpLvW40rMKOxDabcyWrpOrhKLDZqAJOF_jew4_WXk,1837
|
@@ -1915,12 +1915,12 @@ plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,
|
|
1915
1915
|
plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
|
1916
1916
|
plancraft/models/base.py,sha256=uhG1tRmsBerJzW8qHoLyLEYpveDv0co7AAhi4mSfyO4,661
|
1917
1917
|
plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
|
1918
|
-
plancraft/models/dummy.py,sha256=
|
1918
|
+
plancraft/models/dummy.py,sha256=jBxke6VNpyYh_HBcFxCx64djO5F3wr5GbbnC0XePZ20,1015
|
1919
1919
|
plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
|
1920
1920
|
plancraft/models/oracle.py,sha256=jDCE6zVFvbwFpDzQZTkHIlRwMud1yMJ4LVIdfpt5ddU,8449
|
1921
1921
|
plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
|
1922
1922
|
plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
|
1923
|
-
plancraft-0.3.
|
1924
|
-
plancraft-0.3.
|
1925
|
-
plancraft-0.3.
|
1926
|
-
plancraft-0.3.
|
1923
|
+
plancraft-0.3.5.dist-info/METADATA,sha256=QxQSXPXF162We8KwESaZ-nn94gqfz_5PQaXNDWkvV1Y,11147
|
1924
|
+
plancraft-0.3.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
1925
|
+
plancraft-0.3.5.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
|
1926
|
+
plancraft-0.3.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|