plancraft 0.3.31__py3-none-any.whl → 0.3.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
plancraft/evaluator.py CHANGED
@@ -170,86 +170,70 @@ class Evaluator:
170
170
  content_list.append({"type": "image"})
171
171
  return {"content": content_list}
172
172
 
173
- def eval_example(
174
- self,
175
- example: PlancraftExample,
176
- model: PlancraftBaseModel,
177
- ) -> dict:
178
- """
179
- Given the loaded model and an example from Plancraft
180
- run the episode until success or termination.
181
- """
182
-
183
- # start environment
173
+ def _init_environment(self, example: PlancraftExample) -> tuple:
174
+ """Initialize environment and history for an example"""
184
175
  environment = PlancraftEnvironment(
185
176
  inventory=deepcopy(example.slotted_inventory),
186
177
  resolution=self.resolution,
187
178
  )
188
-
189
- # initialise history/dialogue tracking
190
179
  history = self.create_history()
191
- observation = environment.step()
192
- # add target and first message to history
193
- observation["target"] = example.target
194
- observation["message"] = self.convert_observation_to_message(
195
- observation, model=model
180
+ obs = environment.step()
181
+ obs["target"] = example.target
182
+ obs["message"] = self.convert_observation_to_message(obs)
183
+ return environment, history, obs
184
+
185
+ def _process_model_output(
186
+ self, raw_action, observation: dict, history: HistoryBase
187
+ ) -> tuple:
188
+ """Process model output and update history"""
189
+ if isinstance(raw_action, PlancraftModelOutput):
190
+ history.add_message_to_history(
191
+ content=raw_action.action,
192
+ role="assistant",
193
+ **(raw_action.kwargs or {}),
194
+ )
195
+ raw_action = raw_action.action
196
+ else:
197
+ history.add_message_to_history(content=raw_action, role="assistant")
198
+
199
+ action = self.parse_raw_model_response(
200
+ raw_action,
201
+ observation=observation,
202
+ history=history,
196
203
  )
204
+ return action
197
205
 
206
+ def _execute_action(
207
+ self, action, example: PlancraftExample, environment, model=None
208
+ ) -> tuple[dict, bool]:
209
+ """Execute action and return next observation and success status"""
198
210
  success = False
199
- # run episode until stuck or until max steps is reached
200
- while history.num_steps < self.max_steps:
201
- # add observation to history
202
- history.add_observation_to_history(observation)
203
- history.add_message_to_history(content=observation["message"], role="user")
204
- # predict next action
205
- raw_action = model.step(observation, dialogue_history=history)
206
211
 
207
- # if the model returns a PlancraftModelOutput, extract the action
208
- if isinstance(raw_action, PlancraftModelOutput):
209
- history.add_message_to_history(
210
- content=raw_action.action,
211
- role="assistant",
212
- **(raw_action.kwargs or {}),
213
- )
214
- raw_action = raw_action.action
215
- elif isinstance(raw_action, str):
216
- history.add_message_to_history(content=raw_action, role="assistant")
217
- else:
218
- raise ValueError(
219
- f"model.step() output must be a string or PlancraftModelOutput, got {type(raw_action)}"
220
- )
221
-
222
- # parse the raw action
223
- action = self.parse_raw_model_response(
224
- raw_action, observation=observation, history=history
212
+ # stop action
213
+ if isinstance(action, StopAction):
214
+ observation = None
215
+ # success is True if example was truly impossible
216
+ success = example.impossible
217
+ # if action is a string, it is a message response
218
+ elif isinstance(action, str):
219
+ observation = environment.step()
220
+ observation["target"] = example.target
221
+ observation["message"] = action
222
+ # execute action and check if target is obtained
223
+ else:
224
+ observation = environment.step(action)
225
+ observation["target"] = example.target
226
+ observation["message"] = self.convert_observation_to_message(
227
+ observation, model=model
225
228
  )
229
+ success = self.check_done(observation["inventory"], example.target)
226
230
 
227
- # if the action is stop then we end the episode
228
- if isinstance(action, StopAction):
229
- # if the action is stop and task is impossible then success
230
- # otherwise we should not have stopped
231
- observation = None
232
- success = example.impossible
233
- # action is external tool then it is str
234
- elif isinstance(action, str):
235
- observation = environment.step()
236
- observation["target"] = example.target
237
- observation["message"] = action
238
- # action is environment action
239
- else:
240
- observation = environment.step(action)
241
- observation["target"] = example.target
242
- observation["message"] = self.convert_observation_to_message(
243
- observation, model=model
244
- )
245
- # check if the episode is done
246
- success = self.check_done(observation["inventory"], example.target)
247
-
248
- # exit if success
249
- if success or isinstance(action, StopAction):
250
- break
231
+ return observation, success
251
232
 
252
- # save results and reset
233
+ def _create_result(
234
+ self, example: PlancraftExample, success: bool, history: HistoryBase
235
+ ) -> dict:
236
+ """Create result dictionary for an example"""
253
237
  return {
254
238
  "success": success,
255
239
  "recipe_type": example.recipe_type,
@@ -260,6 +244,29 @@ class Evaluator:
260
244
  "images": history.images,
261
245
  }
262
246
 
247
+ def eval_example(
248
+ self,
249
+ example: PlancraftExample,
250
+ model: PlancraftBaseModel,
251
+ ) -> dict:
252
+ environment, history, observation = self._init_environment(example)
253
+ success = False
254
+
255
+ while history.num_steps < self.max_steps:
256
+ history.add_observation_to_history(observation)
257
+ history.add_message_to_history(content=observation["message"], role="user")
258
+
259
+ raw_action = model.step(observation, dialogue_history=history)
260
+ action = self._process_model_output(raw_action, observation, history)
261
+
262
+ observation, success = self._execute_action(
263
+ action, example, environment, model
264
+ )
265
+ if success or isinstance(action, StopAction):
266
+ break
267
+
268
+ return self._create_result(example, success, history)
269
+
263
270
  def batch_eval_examples(
264
271
  self,
265
272
  examples: list[PlancraftExample],
@@ -267,15 +274,6 @@ class Evaluator:
267
274
  batch_size: int = 4,
268
275
  callback_fn: Optional[callable] = None,
269
276
  ) -> list:
270
- """
271
- Processes examples in batches with dynamic replacement from a queue.
272
-
273
- Args:
274
- examples: List of examples to process
275
- model: Model to use for evaluation
276
- batch_size: Maximum number of concurrent environments
277
- callback_fn: Optional callback function to call after each result
278
- """
279
277
  pending_examples = deque(examples)
280
278
  active_examples = []
281
279
  active_environments = []
@@ -286,21 +284,13 @@ class Evaluator:
286
284
  # Initialize first batch
287
285
  while len(active_examples) < batch_size and pending_examples:
288
286
  example = pending_examples.popleft()
289
- env = PlancraftEnvironment(
290
- inventory=deepcopy(example.slotted_inventory),
291
- resolution=self.resolution,
292
- )
293
- history = self.create_history()
294
- obs = env.step()
295
- obs["target"] = example.target
296
- obs["message"] = self.convert_observation_to_message(obs, model=model)
287
+ env, history, obs = self._init_environment(example)
297
288
 
298
289
  active_examples.append(example)
299
290
  active_environments.append(env)
300
291
  active_histories.append(history)
301
292
  active_observations.append(obs)
302
293
 
303
- # Process until all examples are done
304
294
  while active_examples:
305
295
  # Add observations to histories
306
296
  for i in range(len(active_examples)):
@@ -309,12 +299,10 @@ class Evaluator:
309
299
  content=active_observations[i]["message"], role="user"
310
300
  )
311
301
 
312
- # Get model predictions for current batch
313
302
  raw_actions = model.batch_step(
314
303
  active_observations, dialogue_histories=active_histories
315
304
  )
316
305
 
317
- # Process each active environment
318
306
  completed_indices = []
319
307
  successes = []
320
308
  actions = []
@@ -322,65 +310,28 @@ class Evaluator:
322
310
  for i, (example, raw_action) in enumerate(
323
311
  zip(active_examples, raw_actions)
324
312
  ):
325
- # Handle model output
326
- if isinstance(raw_action, PlancraftModelOutput):
327
- active_histories[i].add_message_to_history(
328
- content=raw_action.action,
329
- role="assistant",
330
- **(raw_action.kwargs or {}),
331
- )
332
- raw_action = raw_action.action
333
- else:
334
- active_histories[i].add_message_to_history(
335
- content=raw_action, role="assistant"
336
- )
337
-
338
- # Parse and execute action
339
- action = self.parse_raw_model_response(
340
- raw_action,
341
- observation=active_observations[i],
342
- history=active_histories[i],
313
+ action = self._process_model_output(
314
+ raw_action, active_observations[i], active_histories[i]
343
315
  )
344
316
  actions.append(action)
345
- success = False
346
-
347
- if isinstance(action, StopAction):
348
- success = example.impossible
349
- active_observations[i] = None
350
- elif isinstance(action, str):
351
- obs = active_environments[i].step()
352
- obs["target"] = example.target
353
- obs["message"] = action
354
- active_observations[i] = obs
355
- else:
356
- obs = active_environments[i].step(action)
357
- obs["target"] = example.target
358
- obs["message"] = self.convert_observation_to_message(
359
- obs, model=model
360
- )
361
- active_observations[i] = obs
362
- success = self.check_done(obs["inventory"], example.target)
363
317
 
318
+ obs, success = self._execute_action(
319
+ action, example, active_environments[i], model
320
+ )
321
+ active_observations[i] = obs
364
322
  successes.append(success)
365
323
 
366
- # Check if environment is done
367
324
  if (
368
325
  success
369
326
  or isinstance(action, StopAction)
370
327
  or active_histories[i].num_steps >= self.max_steps
371
328
  ):
372
- results[example.id] = {
373
- "success": success,
374
- "recipe_type": example.recipe_type,
375
- "complexity": example.complexity_split,
376
- "number_of_steps": active_histories[i].num_steps,
377
- "model_trace": active_histories[i].trace(),
378
- "example_id": example.id,
379
- "images": active_histories[i].images,
380
- }
329
+ results[example.id] = self._create_result(
330
+ example, success, active_histories[i]
331
+ )
381
332
  completed_indices.append(i)
382
333
  if callback_fn:
383
- callback_fn(results[example.id])
334
+ callback_fn(example=example, results=results[example.id])
384
335
 
385
336
  # Remove completed environments and replace with new ones
386
337
  for i in reversed(completed_indices):
@@ -389,19 +340,9 @@ class Evaluator:
389
340
  active_histories.pop(i)
390
341
  active_observations.pop(i)
391
342
 
392
- # Add new environment if there are pending examples
393
343
  if pending_examples:
394
344
  example = pending_examples.popleft()
395
- env = PlancraftEnvironment(
396
- inventory=deepcopy(example.slotted_inventory),
397
- resolution=self.resolution,
398
- )
399
- history = self.create_history()
400
- obs = env.step()
401
- obs["target"] = example.target
402
- obs["message"] = self.convert_observation_to_message(
403
- obs, model=model
404
- )
345
+ env, history, obs = self._init_environment(example)
405
346
 
406
347
  active_examples.append(example)
407
348
  active_environments.append(env)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: plancraft
3
- Version: 0.3.31
3
+ Version: 0.3.32
4
4
  Summary: Plancraft: an evaluation dataset for planning with LLM agents
5
5
  License: MIT License
6
6
 
@@ -1,6 +1,6 @@
1
1
  plancraft/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  plancraft/config.py,sha256=ShsFRlJ7plsl3ToD9fiO_4LDQuXdbjNV6Xp6o3Yk2Yg,4315
3
- plancraft/evaluator.py,sha256=VteLAT_rPogw8NYZos7jEuuakyfE_3CsFuv6A39Geyw,17614
3
+ plancraft/evaluator.py,sha256=mxzvbGpEDkiKW8u79QgYz5Q4wnZvkQSXiAvi0OVu4Qs,14754
4
4
  plancraft/generate_dataset.py,sha256=DlrU-PmvWqSNJD1g1-8Lpb8n3N-Ogw3rje1nrRzjGKs,2382
5
5
  plancraft/utils.py,sha256=VhnxMihh6pRhNjQTK5HDc0FYWmF9_EcQyRP_a7fbIZA,7156
6
6
  plancraft/data/test.json,sha256=4jWfYMAVuZCFmGB4iZJAjlh9_8jXECdaGp8xn7_tAM4,1317131
@@ -1920,7 +1920,7 @@ plancraft/models/generators.py,sha256=F76_iPiqxUjDIrQwF58tzM0bLM91OkZJ0sBqBuki5w
1920
1920
  plancraft/models/oracle.py,sha256=f-0KWlBuHy6wcxmDsxM3MQ_QwfBstzfbA26mlk1MgLA,1657
1921
1921
  plancraft/models/utils.py,sha256=E-sZohvolWgGbpHQKgAgkgIfUJoVnT5pMt6JP8xLHKg,4034
1922
1922
  plancraft/train/dataset.py,sha256=oFqEd4LG9oEQ-71teh0Wf7-jJbtybT2ZibfM2bBdBkM,5474
1923
- plancraft-0.3.31.dist-info/METADATA,sha256=gU6j3SQEGdXIeW1pab_Pz6hspDhl_g0vaPIkIXRScYo,11148
1924
- plancraft-0.3.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
1925
- plancraft-0.3.31.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
1926
- plancraft-0.3.31.dist-info/RECORD,,
1923
+ plancraft-0.3.32.dist-info/METADATA,sha256=vRc_HMJhCvX4LnEPLHIbgKaJCbQP4Gq0qb4xITGFkYQ,11148
1924
+ plancraft-0.3.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
1925
+ plancraft-0.3.32.dist-info/licenses/LICENSE,sha256=YGR8ehDB4t-T-lOQKMfKNR-2zsOU7E3E5NA8t25HKE0,1070
1926
+ plancraft-0.3.32.dist-info/RECORD,,