plancraft 0.3.30__py3-none-any.whl → 0.3.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
plancraft/evaluator.py CHANGED
@@ -170,96 +170,70 @@ class Evaluator:
170
170
  content_list.append({"type": "image"})
171
171
  return {"content": content_list}
172
172
 
173
- def eval_example(
174
- self,
175
- example: PlancraftExample,
176
- model: PlancraftBaseModel,
177
- ) -> dict:
178
- """
179
- Given the loaded model and an example from Plancraft
180
- run the episode until success or termination.
181
- """
182
-
183
- # start environment
173
+ def _init_environment(self, example: PlancraftExample) -> tuple:
174
+ """Initialize environment and history for an example"""
184
175
  environment = PlancraftEnvironment(
185
176
  inventory=deepcopy(example.slotted_inventory),
186
177
  resolution=self.resolution,
187
178
  )
188
-
189
- # initialise history/dialogue tracking
190
179
  history = self.create_history()
191
- observation = environment.step()
192
- # add target and first message to history
193
- observation["target"] = example.target
194
- observation["message"] = self.convert_observation_to_message(
195
- observation, model=model
180
+ obs = environment.step()
181
+ obs["target"] = example.target
182
+ obs["message"] = self.convert_observation_to_message(obs)
183
+ return environment, history, obs
184
+
185
+ def _process_model_output(
186
+ self, raw_action, observation: dict, history: HistoryBase
187
+ ) -> tuple:
188
+ """Process model output and update history"""
189
+ if isinstance(raw_action, PlancraftModelOutput):
190
+ history.add_message_to_history(
191
+ content=raw_action.action,
192
+ role="assistant",
193
+ **(raw_action.kwargs or {}),
194
+ )
195
+ raw_action = raw_action.action
196
+ else:
197
+ history.add_message_to_history(content=raw_action, role="assistant")
198
+
199
+ action = self.parse_raw_model_response(
200
+ raw_action,
201
+ observation=observation,
202
+ history=history,
196
203
  )
204
+ return action
197
205
 
206
+ def _execute_action(
207
+ self, action, example: PlancraftExample, environment, model=None
208
+ ) -> tuple[dict, bool]:
209
+ """Execute action and return next observation and success status"""
198
210
  success = False
199
- # run episode until stuck or until max steps is reached
200
- while history.num_steps < self.max_steps:
201
- # add observation to history
202
- history.add_observation_to_history(observation)
203
- history.add_message_to_history(content=observation["message"], role="user")
204
- # predict next action
205
- raw_action = model.step(observation, dialogue_history=history)
206
-
207
- # if the model returns a PlancraftModelOutput, extract the action
208
- if isinstance(raw_action, PlancraftModelOutput):
209
- history.add_message_to_history(
210
- content=raw_action.action,
211
- role="assistant",
212
- **(raw_action.kwargs or {}),
213
- )
214
- raw_action = raw_action.action
215
- elif isinstance(raw_action, str):
216
- history.add_message_to_history(content=raw_action, role="assistant")
217
- else:
218
- raise ValueError(
219
- f"model.step() output must be a string or PlancraftModelOutput, got {type(raw_action)}"
220
- )
221
211
 
222
- # parse the raw action
223
- action = self.parse_raw_model_response(
224
- raw_action, observation=observation, history=history
225
- )
226
-
227
- # if the action is stop then we end the episode
228
- if isinstance(action, StopAction):
229
- # if the action is stop and task is impossible then success
230
- # otherwise we should not have stopped
231
- observation = None
232
- success = example.impossible
233
- # action is external tool then it is str
234
- elif isinstance(action, str):
235
- observation = environment.step()
236
- observation["target"] = example.target
237
- observation["message"] = action
238
- # action is environment action
239
- else:
240
- observation = environment.step(action)
241
- observation["target"] = example.target
242
- observation["message"] = self.convert_observation_to_message(
243
- observation, model=model
244
- )
245
- # check if the episode is done
246
- success = self.check_done(observation["inventory"], example.target)
247
-
248
- # update model with success or failure
249
- # observation is the next state after the action (s1)
250
- # history is the dialogue history
251
- # -- the last message contains the action taken (a0)
252
- # -- the second to last message is the observation (s0)
253
- # success is whether the episode is sucessful (r)
254
- model.update(
255
- observation=observation, history=history, success=success, action=action
212
+ # stop action
213
+ if isinstance(action, StopAction):
214
+ observation = None
215
+ # success is True if example was truly impossible
216
+ success = example.impossible
217
+ # if action is a string, it is a message response
218
+ elif isinstance(action, str):
219
+ observation = environment.step()
220
+ observation["target"] = example.target
221
+ observation["message"] = action
222
+ # execute action and check if target is obtained
223
+ else:
224
+ observation = environment.step(action)
225
+ observation["target"] = example.target
226
+ observation["message"] = self.convert_observation_to_message(
227
+ observation, model=model
256
228
  )
229
+ success = self.check_done(observation["inventory"], example.target)
257
230
 
258
- # exit if success
259
- if success or isinstance(action, StopAction):
260
- break
231
+ return observation, success
261
232
 
262
- # save results and reset
233
+ def _create_result(
234
+ self, example: PlancraftExample, success: bool, history: HistoryBase
235
+ ) -> dict:
236
+ """Create result dictionary for an example"""
263
237
  return {
264
238
  "success": success,
265
239
  "recipe_type": example.recipe_type,
@@ -270,20 +244,36 @@ class Evaluator:
270
244
  "images": history.images,
271
245
  }
272
246
 
247
+ def eval_example(
248
+ self,
249
+ example: PlancraftExample,
250
+ model: PlancraftBaseModel,
251
+ ) -> dict:
252
+ environment, history, observation = self._init_environment(example)
253
+ success = False
254
+
255
+ while history.num_steps < self.max_steps:
256
+ history.add_observation_to_history(observation)
257
+ history.add_message_to_history(content=observation["message"], role="user")
258
+
259
+ raw_action = model.step(observation, dialogue_history=history)
260
+ action = self._process_model_output(raw_action, observation, history)
261
+
262
+ observation, success = self._execute_action(
263
+ action, example, environment, model
264
+ )
265
+ if success or isinstance(action, StopAction):
266
+ break
267
+
268
+ return self._create_result(example, success, history)
269
+
273
270
  def batch_eval_examples(
274
271
  self,
275
272
  examples: list[PlancraftExample],
276
273
  model,
277
274
  batch_size: int = 4,
275
+ callback_fn: Optional[callable] = None,
278
276
  ) -> list:
279
- """
280
- Processes examples in batches with dynamic replacement from a queue.
281
-
282
- Args:
283
- examples: List of examples to process
284
- model: Model to use for evaluation
285
- batch_size: Maximum number of concurrent environments
286
- """
287
277
  pending_examples = deque(examples)
288
278
  active_examples = []
289
279
  active_environments = []
@@ -294,21 +284,13 @@ class Evaluator:
294
284
  # Initialize first batch
295
285
  while len(active_examples) < batch_size and pending_examples:
296
286
  example = pending_examples.popleft()
297
- env = PlancraftEnvironment(
298
- inventory=deepcopy(example.slotted_inventory),
299
- resolution=self.resolution,
300
- )
301
- history = self.create_history()
302
- obs = env.step()
303
- obs["target"] = example.target
304
- obs["message"] = self.convert_observation_to_message(obs, model=model)
287
+ env, history, obs = self._init_environment(example)
305
288
 
306
289
  active_examples.append(example)
307
290
  active_environments.append(env)
308
291
  active_histories.append(history)
309
292
  active_observations.append(obs)
310
293
 
311
- # Process until all examples are done
312
294
  while active_examples:
313
295
  # Add observations to histories
314
296
  for i in range(len(active_examples)):
@@ -317,12 +299,10 @@ class Evaluator:
317
299
  content=active_observations[i]["message"], role="user"
318
300
  )
319
301
 
320
- # Get model predictions for current batch
321
302
  raw_actions = model.batch_step(
322
303
  active_observations, dialogue_histories=active_histories
323
304
  )
324
305
 
325
- # Process each active environment
326
306
  completed_indices = []
327
307
  successes = []
328
308
  actions = []
@@ -330,71 +310,28 @@ class Evaluator:
330
310
  for i, (example, raw_action) in enumerate(
331
311
  zip(active_examples, raw_actions)
332
312
  ):
333
- # Handle model output
334
- if isinstance(raw_action, PlancraftModelOutput):
335
- active_histories[i].add_message_to_history(
336
- content=raw_action.action,
337
- role="assistant",
338
- **(raw_action.kwargs or {}),
339
- )
340
- raw_action = raw_action.action
341
- else:
342
- active_histories[i].add_message_to_history(
343
- content=raw_action, role="assistant"
344
- )
345
-
346
- # Parse and execute action
347
- action = self.parse_raw_model_response(
348
- raw_action,
349
- observation=active_observations[i],
350
- history=active_histories[i],
313
+ action = self._process_model_output(
314
+ raw_action, active_observations[i], active_histories[i]
351
315
  )
352
316
  actions.append(action)
353
- success = False
354
-
355
- if isinstance(action, StopAction):
356
- success = example.impossible
357
- active_observations[i] = None
358
- elif isinstance(action, str):
359
- obs = active_environments[i].step()
360
- obs["target"] = example.target
361
- obs["message"] = action
362
- active_observations[i] = obs
363
- else:
364
- obs = active_environments[i].step(action)
365
- obs["target"] = example.target
366
- obs["message"] = self.convert_observation_to_message(
367
- obs, model=model
368
- )
369
- active_observations[i] = obs
370
- success = self.check_done(obs["inventory"], example.target)
371
317
 
318
+ obs, success = self._execute_action(
319
+ action, example, active_environments[i], model
320
+ )
321
+ active_observations[i] = obs
372
322
  successes.append(success)
373
323
 
374
- # Check if environment is done
375
324
  if (
376
325
  success
377
326
  or isinstance(action, StopAction)
378
327
  or active_histories[i].num_steps >= self.max_steps
379
328
  ):
380
- results[example.id] = {
381
- "success": success,
382
- "recipe_type": example.recipe_type,
383
- "complexity": example.complexity_split,
384
- "number_of_steps": active_histories[i].num_steps,
385
- "model_trace": active_histories[i].trace(),
386
- "example_id": example.id,
387
- "images": active_histories[i].images,
388
- }
329
+ results[example.id] = self._create_result(
330
+ example, success, active_histories[i]
331
+ )
389
332
  completed_indices.append(i)
390
-
391
- # Update model
392
- model.batch_update(
393
- observations=active_observations,
394
- histories=active_histories,
395
- successes=successes,
396
- actions=actions,
397
- )
333
+ if callback_fn:
334
+ callback_fn(example=example, results=results[example.id])
398
335
 
399
336
  # Remove completed environments and replace with new ones
400
337
  for i in reversed(completed_indices):
@@ -403,19 +340,9 @@ class Evaluator:
403
340
  active_histories.pop(i)
404
341
  active_observations.pop(i)
405
342
 
406
- # Add new environment if there are pending examples
407
343
  if pending_examples:
408
344
  example = pending_examples.popleft()
409
- env = PlancraftEnvironment(
410
- inventory=deepcopy(example.slotted_inventory),
411
- resolution=self.resolution,
412
- )
413
- history = self.create_history()
414
- obs = env.step()
415
- obs["target"] = example.target
416
- obs["message"] = self.convert_observation_to_message(
417
- obs, model=model
418
- )
345
+ env, history, obs = self._init_environment(example)
419
346
 
420
347
  active_examples.append(example)
421
348
  active_environments.append(env)
plancraft/models/act.py CHANGED
@@ -72,6 +72,3 @@ class ActModel(PlancraftBaseModel):
72
72
  dialogue_history.tokens_used += action_token_used
73
73
  # return raw action message
74
74
  return action_messages[0].split("\n")[0].strip()
75
-
76
- def update(self, **kwargs):
77
- pass
plancraft/models/base.py CHANGED
@@ -33,10 +33,3 @@ class PlancraftBaseModel(abc.ABC):
33
33
  Reset the model state - ready for a new episode
34
34
  """
35
35
  raise NotImplementedError()
36
-
37
- @abc.abstractmethod
38
- def update(self, **kwargs) -> None:
39
- """
40
- Update the model state based on the dialogue history
41
- """
42
- raise NotImplementedError()
plancraft/models/dummy.py CHANGED
@@ -45,9 +45,3 @@ class DummyModel(PlancraftBaseModel):
45
45
  self, observations: list[dict], **kwargs
46
46
  ) -> list[PlancraftModelOutput]:
47
47
  return [self.step(observation) for observation in observations]
48
-
49
- def update(self, **kwargs):
50
- pass
51
-
52
- def batch_update(self, **kwargs):
53
- pass
@@ -47,9 +47,3 @@ class OracleModel(PlancraftBaseModel):
47
47
  action = self.step(observation)
48
48
  actions.append(action)
49
49
  return actions
50
-
51
- def update(self, **kwargs):
52
- pass
53
-
54
- def batch_update(self, **kwargs):
55
- pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: plancraft
3
- Version: 0.3.30
3
+ Version: 0.3.32
4
4
  Summary: Plancraft: an evaluation dataset for planning with LLM agents
5
5
  License: MIT License
6
6
 
@@ -1,6 +1,6 @@
1
1
  plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
3
- plancraft/evaluator.py,sha256=VFJnfitixU2Y4RTxp0lDALoCSFMMwMJPgSQC0Y0tmH8,18121
3
+ plancraft/evaluator.py,sha256=mxzvbGpEDkiKW8u79QgYz5Q4wnZvkQSXiAvi0OVu4Qs,14754
4
4
  plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
5
5
  plancraft/utils.py,sha256=VhnxMihh6pRhNjQTK5HDc0FYWmF9_EcQyRP_a7fbIZA,7156
6
6
  plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
@@ -1912,15 +1912,15 @@ plancraft/environment/tags/wooden_stairs.json,sha256=GCr2_5UGPMYZECqQ_5NYSvbwuwt
1912
1912
  plancraft/environment/tags/wooden_trapdoors.json,sha256=DbjfwoHJL8VuYWV61A1uDqW7LJsGlOP4eoxcGIQVYr4,303
1913
1913
  plancraft/environment/tags/wool.json,sha256=Z59l4mdPztVZBFaglJ4mV9H2OnyCVzhqQRi2dduak78,496
1914
1914
  plancraft/models/__init__.py,sha256=TBrarn93qt4IFJRNqtzOfaA8jGMPCgD7DFs-M84ipmk,510
1915
- plancraft/models/act.py,sha256=_OZo9a_6R0wajdR7axZarjI3IJP7glFrWeDIrbcHDmw,2737
1916
- plancraft/models/base.py,sha256=Krm6MdOjU-qlps1WSX7pxdnqXLiyI3qsI9Na7Xk8r1c,1038
1915
+ plancraft/models/act.py,sha256=6Xb8rylg3OngOraVFgduH_hQR62VcoyTeFntN4q3hsQ,2691
1916
+ plancraft/models/base.py,sha256=S8EdkqWpn8nE1WcrqDoA4Hx4p52qEttGxnqjIPWvl3Q,852
1917
1917
  plancraft/models/bbox_model.py,sha256=3b1IEspoHiVUR6GOWjEbp4YoxRhGkzKt-eOiwaN8NXo,17091
1918
- plancraft/models/dummy.py,sha256=UWbW3bjrQr_0UYYrNf_D0jWpUq6e50vAp21F0zi8iFM,1593
1918
+ plancraft/models/dummy.py,sha256=_NUTviv5ye6KGzODRt0Zykk8shsek0QBqWCeZW3ldSQ,1495
1919
1919
  plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5wY,13939
1920
- plancraft/models/oracle.py,sha256=jmt_kBBNXt0VWUX7q6OHkJoRZWItCMy4qGH5qbLSc1c,1755
1920
+ plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
1921
1921
  plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
1922
1922
  plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
1923
- plancraft-0.3.30.dist-info/METADATA,sha256=tltUHYqXhfDXfsQGU5NLhEp6TjR41g6X0OWFn5dpttg,11148
1924
- plancraft-0.3.30.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
1925
- plancraft-0.3.30.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
1926
- plancraft-0.3.30.dist-info/RECORD,,
1923
+ plancraft-0.3.32.dist-info/METADATA,sha256=vRc_HMJhCvX4LnEPLHIbgKaJCbQP4Gq0qb4xITGFkYQ,11148
1924
+ plancraft-0.3.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
1925
+ plancraft-0.3.32.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
1926
+ plancraft-0.3.32.dist-info/RECORD,,