plancraft 0.3.31__py3-none-any.whl → 0.3.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- plancraft/evaluator.py +87 -146
- {plancraft-0.3.31.dist-info → plancraft-0.3.32.dist-info}/METADATA +1 -1
- {plancraft-0.3.31.dist-info → plancraft-0.3.32.dist-info}/RECORD +5 -5
- {plancraft-0.3.31.dist-info → plancraft-0.3.32.dist-info}/WHEEL +0 -0
- {plancraft-0.3.31.dist-info → plancraft-0.3.32.dist-info}/licenses/LICENSE +0 -0
plancraft/evaluator.py
CHANGED
@@ -170,86 +170,70 @@ class Evaluator:
|
|
170
170
|
content_list.append({"type": "image"})
|
171
171
|
return {"content": content_list}
|
172
172
|
|
173
|
-
def
|
174
|
-
|
175
|
-
example: PlancraftExample,
|
176
|
-
model: PlancraftBaseModel,
|
177
|
-
) -> dict:
|
178
|
-
"""
|
179
|
-
Given the loaded model and an example from Plancraft
|
180
|
-
run the episode until success or termination.
|
181
|
-
"""
|
182
|
-
|
183
|
-
# start environment
|
173
|
+
def _init_environment(self, example: PlancraftExample) -> tuple:
|
174
|
+
"""Initialize environment and history for an example"""
|
184
175
|
environment = PlancraftEnvironment(
|
185
176
|
inventory=deepcopy(example.slotted_inventory),
|
186
177
|
resolution=self.resolution,
|
187
178
|
)
|
188
|
-
|
189
|
-
# initialise history/dialogue tracking
|
190
179
|
history = self.create_history()
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
180
|
+
obs = environment.step()
|
181
|
+
obs["target"] = example.target
|
182
|
+
obs["message"] = self.convert_observation_to_message(obs)
|
183
|
+
return environment, history, obs
|
184
|
+
|
185
|
+
def _process_model_output(
|
186
|
+
self, raw_action, observation: dict, history: HistoryBase
|
187
|
+
) -> tuple:
|
188
|
+
"""Process model output and update history"""
|
189
|
+
if isinstance(raw_action, PlancraftModelOutput):
|
190
|
+
history.add_message_to_history(
|
191
|
+
content=raw_action.action,
|
192
|
+
role="assistant",
|
193
|
+
**(raw_action.kwargs or {}),
|
194
|
+
)
|
195
|
+
raw_action = raw_action.action
|
196
|
+
else:
|
197
|
+
history.add_message_to_history(content=raw_action, role="assistant")
|
198
|
+
|
199
|
+
action = self.parse_raw_model_response(
|
200
|
+
raw_action,
|
201
|
+
observation=observation,
|
202
|
+
history=history,
|
196
203
|
)
|
204
|
+
return action
|
197
205
|
|
206
|
+
def _execute_action(
|
207
|
+
self, action, example: PlancraftExample, environment, model=None
|
208
|
+
) -> tuple[dict, bool]:
|
209
|
+
"""Execute action and return next observation and success status"""
|
198
210
|
success = False
|
199
|
-
# run episode until stuck or until max steps is reached
|
200
|
-
while history.num_steps < self.max_steps:
|
201
|
-
# add observation to history
|
202
|
-
history.add_observation_to_history(observation)
|
203
|
-
history.add_message_to_history(content=observation["message"], role="user")
|
204
|
-
# predict next action
|
205
|
-
raw_action = model.step(observation, dialogue_history=history)
|
206
211
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
action = self.parse_raw_model_response(
|
224
|
-
raw_action, observation=observation, history=history
|
212
|
+
# stop action
|
213
|
+
if isinstance(action, StopAction):
|
214
|
+
observation = None
|
215
|
+
# success is True if example was truly impossible
|
216
|
+
success = example.impossible
|
217
|
+
# if action is a string, it is a message response
|
218
|
+
elif isinstance(action, str):
|
219
|
+
observation = environment.step()
|
220
|
+
observation["target"] = example.target
|
221
|
+
observation["message"] = action
|
222
|
+
# execute action and check if target is obtained
|
223
|
+
else:
|
224
|
+
observation = environment.step(action)
|
225
|
+
observation["target"] = example.target
|
226
|
+
observation["message"] = self.convert_observation_to_message(
|
227
|
+
observation, model=model
|
225
228
|
)
|
229
|
+
success = self.check_done(observation["inventory"], example.target)
|
226
230
|
|
227
|
-
|
228
|
-
if isinstance(action, StopAction):
|
229
|
-
# if the action is stop and task is impossible then success
|
230
|
-
# otherwise we should not have stopped
|
231
|
-
observation = None
|
232
|
-
success = example.impossible
|
233
|
-
# action is external tool then it is str
|
234
|
-
elif isinstance(action, str):
|
235
|
-
observation = environment.step()
|
236
|
-
observation["target"] = example.target
|
237
|
-
observation["message"] = action
|
238
|
-
# action is environment action
|
239
|
-
else:
|
240
|
-
observation = environment.step(action)
|
241
|
-
observation["target"] = example.target
|
242
|
-
observation["message"] = self.convert_observation_to_message(
|
243
|
-
observation, model=model
|
244
|
-
)
|
245
|
-
# check if the episode is done
|
246
|
-
success = self.check_done(observation["inventory"], example.target)
|
247
|
-
|
248
|
-
# exit if success
|
249
|
-
if success or isinstance(action, StopAction):
|
250
|
-
break
|
231
|
+
return observation, success
|
251
232
|
|
252
|
-
|
233
|
+
def _create_result(
|
234
|
+
self, example: PlancraftExample, success: bool, history: HistoryBase
|
235
|
+
) -> dict:
|
236
|
+
"""Create result dictionary for an example"""
|
253
237
|
return {
|
254
238
|
"success": success,
|
255
239
|
"recipe_type": example.recipe_type,
|
@@ -260,6 +244,29 @@ class Evaluator:
|
|
260
244
|
"images": history.images,
|
261
245
|
}
|
262
246
|
|
247
|
+
def eval_example(
|
248
|
+
self,
|
249
|
+
example: PlancraftExample,
|
250
|
+
model: PlancraftBaseModel,
|
251
|
+
) -> dict:
|
252
|
+
environment, history, observation = self._init_environment(example)
|
253
|
+
success = False
|
254
|
+
|
255
|
+
while history.num_steps < self.max_steps:
|
256
|
+
history.add_observation_to_history(observation)
|
257
|
+
history.add_message_to_history(content=observation["message"], role="user")
|
258
|
+
|
259
|
+
raw_action = model.step(observation, dialogue_history=history)
|
260
|
+
action = self._process_model_output(raw_action, observation, history)
|
261
|
+
|
262
|
+
observation, success = self._execute_action(
|
263
|
+
action, example, environment, model
|
264
|
+
)
|
265
|
+
if success or isinstance(action, StopAction):
|
266
|
+
break
|
267
|
+
|
268
|
+
return self._create_result(example, success, history)
|
269
|
+
|
263
270
|
def batch_eval_examples(
|
264
271
|
self,
|
265
272
|
examples: list[PlancraftExample],
|
@@ -267,15 +274,6 @@ class Evaluator:
|
|
267
274
|
batch_size: int = 4,
|
268
275
|
callback_fn: Optional[callable] = None,
|
269
276
|
) -> list:
|
270
|
-
"""
|
271
|
-
Processes examples in batches with dynamic replacement from a queue.
|
272
|
-
|
273
|
-
Args:
|
274
|
-
examples: List of examples to process
|
275
|
-
model: Model to use for evaluation
|
276
|
-
batch_size: Maximum number of concurrent environments
|
277
|
-
callback_fn: Optional callback function to call after each result
|
278
|
-
"""
|
279
277
|
pending_examples = deque(examples)
|
280
278
|
active_examples = []
|
281
279
|
active_environments = []
|
@@ -286,21 +284,13 @@ class Evaluator:
|
|
286
284
|
# Initialize first batch
|
287
285
|
while len(active_examples) < batch_size and pending_examples:
|
288
286
|
example = pending_examples.popleft()
|
289
|
-
env =
|
290
|
-
inventory=deepcopy(example.slotted_inventory),
|
291
|
-
resolution=self.resolution,
|
292
|
-
)
|
293
|
-
history = self.create_history()
|
294
|
-
obs = env.step()
|
295
|
-
obs["target"] = example.target
|
296
|
-
obs["message"] = self.convert_observation_to_message(obs, model=model)
|
287
|
+
env, history, obs = self._init_environment(example)
|
297
288
|
|
298
289
|
active_examples.append(example)
|
299
290
|
active_environments.append(env)
|
300
291
|
active_histories.append(history)
|
301
292
|
active_observations.append(obs)
|
302
293
|
|
303
|
-
# Process until all examples are done
|
304
294
|
while active_examples:
|
305
295
|
# Add observations to histories
|
306
296
|
for i in range(len(active_examples)):
|
@@ -309,12 +299,10 @@ class Evaluator:
|
|
309
299
|
content=active_observations[i]["message"], role="user"
|
310
300
|
)
|
311
301
|
|
312
|
-
# Get model predictions for current batch
|
313
302
|
raw_actions = model.batch_step(
|
314
303
|
active_observations, dialogue_histories=active_histories
|
315
304
|
)
|
316
305
|
|
317
|
-
# Process each active environment
|
318
306
|
completed_indices = []
|
319
307
|
successes = []
|
320
308
|
actions = []
|
@@ -322,65 +310,28 @@ class Evaluator:
|
|
322
310
|
for i, (example, raw_action) in enumerate(
|
323
311
|
zip(active_examples, raw_actions)
|
324
312
|
):
|
325
|
-
|
326
|
-
|
327
|
-
active_histories[i].add_message_to_history(
|
328
|
-
content=raw_action.action,
|
329
|
-
role="assistant",
|
330
|
-
**(raw_action.kwargs or {}),
|
331
|
-
)
|
332
|
-
raw_action = raw_action.action
|
333
|
-
else:
|
334
|
-
active_histories[i].add_message_to_history(
|
335
|
-
content=raw_action, role="assistant"
|
336
|
-
)
|
337
|
-
|
338
|
-
# Parse and execute action
|
339
|
-
action = self.parse_raw_model_response(
|
340
|
-
raw_action,
|
341
|
-
observation=active_observations[i],
|
342
|
-
history=active_histories[i],
|
313
|
+
action = self._process_model_output(
|
314
|
+
raw_action, active_observations[i], active_histories[i]
|
343
315
|
)
|
344
316
|
actions.append(action)
|
345
|
-
success = False
|
346
|
-
|
347
|
-
if isinstance(action, StopAction):
|
348
|
-
success = example.impossible
|
349
|
-
active_observations[i] = None
|
350
|
-
elif isinstance(action, str):
|
351
|
-
obs = active_environments[i].step()
|
352
|
-
obs["target"] = example.target
|
353
|
-
obs["message"] = action
|
354
|
-
active_observations[i] = obs
|
355
|
-
else:
|
356
|
-
obs = active_environments[i].step(action)
|
357
|
-
obs["target"] = example.target
|
358
|
-
obs["message"] = self.convert_observation_to_message(
|
359
|
-
obs, model=model
|
360
|
-
)
|
361
|
-
active_observations[i] = obs
|
362
|
-
success = self.check_done(obs["inventory"], example.target)
|
363
317
|
|
318
|
+
obs, success = self._execute_action(
|
319
|
+
action, example, active_environments[i], model
|
320
|
+
)
|
321
|
+
active_observations[i] = obs
|
364
322
|
successes.append(success)
|
365
323
|
|
366
|
-
# Check if environment is done
|
367
324
|
if (
|
368
325
|
success
|
369
326
|
or isinstance(action, StopAction)
|
370
327
|
or active_histories[i].num_steps >= self.max_steps
|
371
328
|
):
|
372
|
-
results[example.id] =
|
373
|
-
|
374
|
-
|
375
|
-
"complexity": example.complexity_split,
|
376
|
-
"number_of_steps": active_histories[i].num_steps,
|
377
|
-
"model_trace": active_histories[i].trace(),
|
378
|
-
"example_id": example.id,
|
379
|
-
"images": active_histories[i].images,
|
380
|
-
}
|
329
|
+
results[example.id] = self._create_result(
|
330
|
+
example, success, active_histories[i]
|
331
|
+
)
|
381
332
|
completed_indices.append(i)
|
382
333
|
if callback_fn:
|
383
|
-
callback_fn(results[example.id])
|
334
|
+
callback_fn(example=example, results=results[example.id])
|
384
335
|
|
385
336
|
# Remove completed environments and replace with new ones
|
386
337
|
for i in reversed(completed_indices):
|
@@ -389,19 +340,9 @@ class Evaluator:
|
|
389
340
|
active_histories.pop(i)
|
390
341
|
active_observations.pop(i)
|
391
342
|
|
392
|
-
# Add new environment if there are pending examples
|
393
343
|
if pending_examples:
|
394
344
|
example = pending_examples.popleft()
|
395
|
-
env =
|
396
|
-
inventory=deepcopy(example.slotted_inventory),
|
397
|
-
resolution=self.resolution,
|
398
|
-
)
|
399
|
-
history = self.create_history()
|
400
|
-
obs = env.step()
|
401
|
-
obs["target"] = example.target
|
402
|
-
obs["message"] = self.convert_observation_to_message(
|
403
|
-
obs, model=model
|
404
|
-
)
|
345
|
+
env, history, obs = self._init_environment(example)
|
405
346
|
|
406
347
|
active_examples.append(example)
|
407
348
|
active_environments.append(env)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
|
3
|
-
plancraft/evaluator.py,sha256=
|
3
|
+
plancraft/evaluator.py,sha256=mxzvbGpEDkiKW8u79QgYz5Q4wnZvkQSXiAvi0OVu4Qs,14754
|
4
4
|
plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
|
5
5
|
plancraft/utils.py,sha256=VhnxMihh6pRhNjQTK5HDc0FYWmF9_EcQyRP_a7fbIZA,7156
|
6
6
|
plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
|
@@ -1920,7 +1920,7 @@ plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5w
|
|
1920
1920
|
plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
|
1921
1921
|
plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
|
1922
1922
|
plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
|
1923
|
-
plancraft-0.3.
|
1924
|
-
plancraft-0.3.
|
1925
|
-
plancraft-0.3.
|
1926
|
-
plancraft-0.3.
|
1923
|
+
plancraft-0.3.32.dist-info/METADATA,sha256=vRc_HMJhCvX4LnEPLHIbgKaJCbQP4Gq0qb4xITGFkYQ,11148
|
1924
|
+
plancraft-0.3.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
1925
|
+
plancraft-0.3.32.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
|
1926
|
+
plancraft-0.3.32.dist-info/RECORD,,
|
File without changes
|
File without changes
|