openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -6,25 +6,24 @@ and convert them to the Episode/Step format used by openadapt-ml for training.
 
 from __future__ import annotations
 
-import uuid
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from openadapt_ml.schemas.sessions import Action, Episode, Observation, Session, Step
+from openadapt_ml.schema import Action, ActionType, Episode, Observation, Step
 
 if TYPE_CHECKING:
     from PIL import Image
 
-# Event type mapping from openadapt-capture to openadapt-ml
+# Event type mapping from openadapt-capture to openadapt-ml ActionType
 EVENT_TYPE_MAP = {
-    "mouse.singleclick": "click",
-    "mouse.click": "click",
-    "mouse.doubleclick": "double_click",
-    "mouse.drag": "drag",
-    "mouse.scroll": "scroll",
-    "key.type": "type",
-    "key.down": "key_press",
-    "key.up": "key_press",
+    "mouse.singleclick": ActionType.CLICK,
+    "mouse.click": ActionType.CLICK,
+    "mouse.doubleclick": ActionType.DOUBLE_CLICK,
+    "mouse.drag": ActionType.DRAG,
+    "mouse.scroll": ActionType.SCROLL,
+    "key.type": ActionType.TYPE,
+    "key.down": ActionType.KEY,
+    "key.up": ActionType.KEY,
 }
 
 
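Note: the map now resolves capture event strings to ActionType enum members instead of bare strings. A minimal sketch of the lookup, mirroring the `.get(event_type, ActionType.CLICK)` fallback that appears later in this file (EVENT_TYPE_MAP is module-internal):

    from openadapt_ml.schema import ActionType

    EVENT_TYPE_MAP.get("key.down", ActionType.CLICK)    # ActionType.KEY
    EVENT_TYPE_MAP.get("mouse.move", ActionType.CLICK)  # unmapped -> ActionType.CLICK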
@@ -33,7 +32,7 @@ def _normalize_coords(
     y: float | None,
     screen_width: int,
     screen_height: int,
-) -> tuple[float | None, float | None]:
+) -> tuple[float, float] | None:
     """Normalize pixel coordinates to [0, 1] range.
 
     Args:
@@ -43,11 +42,11 @@ def _normalize_coords(
         screen_height: Screen height in pixels.
 
     Returns:
-        Tuple of (normalized_x, normalized_y).
+        Tuple of (normalized_x, normalized_y) or None if coords are None.
     """
     if x is None or y is None:
-        return None, None
-    return x / screen_width, y / screen_height
+        return None
+    return (x / screen_width, y / screen_height)
 
 
 def _save_screenshot(
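Note: the return-type change is behavioral, not cosmetic. Callers previously unpacked a `(None, None)` pair; the new version returns a single `None` when either coordinate is missing. A minimal sketch of the new contract (the 1920x1080 screen size is illustrative):

    _normalize_coords(960, 540, 1920, 1080)    # (0.5, 0.5)
    _normalize_coords(None, None, 1920, 1080)  # None (was (None, None) in 0.1.0)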
@@ -77,7 +76,7 @@ def _save_screenshot(
 def capture_to_episode(
     capture_path: str | Path,
     output_dir: str | Path | None = None,
-    goal: str | None = None,
+    instruction: str | None = None,
     episode_id: str | None = None,
     include_moves: bool = False,
 ) -> Episode:
@@ -87,8 +86,8 @@ def capture_to_episode(
         capture_path: Path to the capture directory.
         output_dir: Directory to save extracted screenshots. If None, uses
             capture_path/screenshots.
-        goal: Task description/goal for the episode. If None, uses capture's
-            task_description or a generic message.
+        instruction: Task description/instruction for the episode. If None, uses
+            capture's task_description or a generic message.
         episode_id: Identifier for the episode. If None, generates a UUID.
         include_moves: Whether to include mouse move events.
 
@@ -101,7 +100,7 @@ def capture_to_episode(
     """
     try:
         from openadapt_capture import Capture
-        from openadapt_capture.events import (
+        from openadapt_capture.events import (  # noqa: F401
             EventType,
             KeyTypeEvent,
             MouseClickEvent,
@@ -126,18 +125,20 @@ def capture_to_episode(
     if episode_id is None:
         episode_id = f"capture_{capture.id}"
 
-    # Get goal from capture or derive from context
-    if goal is None:
+    # Get instruction from capture or derive from context
+    if instruction is None:
         if capture.task_description:
-            goal = capture.task_description
+            instruction = capture.task_description
         else:
-            # Try to derive goal from directory name (e.g., "turn-off-nightshift" -> "Turn off nightshift")
+            # Try to derive instruction from directory name (e.g., "turn-off-nightshift" -> "Turn off nightshift")
             dir_name = capture_path.name
             if dir_name and dir_name != "capture":
                 # Convert kebab-case/snake_case to readable text
-                goal = dir_name.replace("-", " ").replace("_", " ").strip().capitalize()
+                instruction = (
+                    dir_name.replace("-", " ").replace("_", " ").strip().capitalize()
+                )
             else:
-                goal = "Complete the recorded workflow"
+                instruction = "Complete the recorded workflow"
 
     # Get screen dimensions for coordinate normalization
     screen_width, screen_height = capture.screen_size
@@ -152,22 +153,19 @@ def capture_to_episode(
             continue
 
         # Save screenshot
-        image_path = _save_screenshot(screenshot, output_dir, episode_id, idx)
+        screenshot_path = _save_screenshot(screenshot, output_dir, episode_id, idx)
 
         # Normalize coordinates
-        norm_x, norm_y = _normalize_coords(
-            action.x, action.y, screen_width, screen_height
-        )
+        norm_coords = _normalize_coords(action.x, action.y, screen_width, screen_height)
 
-        # Map event type to openadapt-ml action type
+        # Map event type to openadapt-ml ActionType
        event_type = action.type
-        action_type = EVENT_TYPE_MAP.get(event_type, "click")
+        action_type = EVENT_TYPE_MAP.get(event_type, ActionType.CLICK)
 
         # Build Action object
         ml_action = Action(
             type=action_type,
-            x=norm_x,
-            y=norm_y,
+            normalized_coordinates=norm_coords,
             text=action.text,
         )
 
@@ -175,34 +173,52 @@ def capture_to_episode(
         if isinstance(action.event, MouseDragEvent):
             end_x = action.event.x + action.event.dx
             end_y = action.event.y + action.event.dy
-            norm_end_x, norm_end_y = _normalize_coords(
-                end_x, end_y, screen_width, screen_height
+            norm_end = _normalize_coords(end_x, end_y, screen_width, screen_height)
+            ml_action = ml_action.model_copy(
+                update={
+                    "normalized_end": norm_end,
+                    "raw": {
+                        "button": action.event.button,
+                    },
+                }
             )
-            ml_action.raw = {
-                "end_x": norm_end_x,
-                "end_y": norm_end_y,
-                "button": action.event.button,
-            }
 
         # Handle scroll events
         if isinstance(action.event, MouseScrollEvent):
-            ml_action.raw = {
-                "dx": action.event.dx,
-                "dy": action.event.dy,
-            }
+            # Determine scroll direction from dx/dy
+            scroll_direction = None
+            if action.event.dy > 0:
+                scroll_direction = "down"
+            elif action.event.dy < 0:
+                scroll_direction = "up"
+            elif action.event.dx > 0:
+                scroll_direction = "right"
+            elif action.event.dx < 0:
+                scroll_direction = "left"
+
+            ml_action = ml_action.model_copy(
+                update={
+                    "scroll_direction": scroll_direction,
+                    "raw": {
+                        "dx": action.event.dx,
+                        "dy": action.event.dy,
+                    },
+                }
+            )
 
         # Handle keyboard events - include key names for special keys
         if action.keys:
-            if ml_action.raw is None:
-                ml_action.raw = {}
-            ml_action.raw["keys"] = action.keys
+            raw = ml_action.raw or {}
+            raw["keys"] = action.keys
+            ml_action = ml_action.model_copy(update={"raw": raw})
 
         # Create Step
         step = Step(
-            t=action.timestamp - start_time,
-            observation=Observation(image_path=image_path),
+            step_index=idx,
+            observation=Observation(screenshot_path=screenshot_path),
             action=ml_action,
-            thought=None,  # Real recordings don't have thoughts
+            reasoning=None,  # Real recordings don't have reasoning
+            timestamp=action.timestamp - start_time,
         )
         steps.append(step)
 
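Note the switch from mutating `ml_action.raw` in place to rebuilding the action via `model_copy(update=...)`, which suggests the new `Action` is an immutable (frozen) pydantic v2 model. A hedged sketch of the pattern — the frozen-model assumption is ours:

    from openadapt_ml.schema import Action, ActionType

    action = Action(type=ActionType.SCROLL)
    # model_copy returns a new instance with the given fields replaced,
    # so it works even if the model config forbids field assignment.
    action = action.model_copy(
        update={"scroll_direction": "down", "raw": {"dx": 0, "dy": 3}}
    )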
@@ -211,69 +227,64 @@ def capture_to_episode(
         # Use the last screenshot for the done action
         last_step = steps[-1]
         done_step = Step(
-            t=last_step.t + 0.1,
-            observation=Observation(image_path=last_step.observation.image_path),
-            action=Action(type="done"),
-            thought="Workflow complete.",
+            step_index=len(steps),
+            observation=Observation(
+                screenshot_path=last_step.observation.screenshot_path
+            ),
+            action=Action(type=ActionType.DONE),
+            reasoning="Workflow complete.",
+            timestamp=(last_step.timestamp or 0) + 0.1,
         )
         steps.append(done_step)
 
     capture.close()
 
     return Episode(
-        id=episode_id,
-        goal=goal,
+        episode_id=episode_id,
+        instruction=instruction,
         steps=steps,
-        summary=f"Real recording with {len(steps)} steps",
         success=True,
-        workflow_id=capture.id,
+        metadata={
+            "summary": f"Real recording with {len(steps)} steps",
+            "workflow_id": capture.id,
+        },
     )
 
 
-def capture_to_session(
+def capture_to_episodes(
     capture_path: str | Path,
     output_dir: str | Path | None = None,
-    goal: str | None = None,
-    session_id: str | None = None,
+    instruction: str | None = None,
     include_moves: bool = False,
-) -> Session:
-    """Convert an openadapt-capture recording to a Session.
+) -> list[Episode]:
+    """Convert an openadapt-capture recording to a list with one Episode.
+
+    This is a convenience function that returns episodes as a list for consistency
+    with the new schema (which uses list[Episode] instead of Session).
 
     Args:
         capture_path: Path to the capture directory.
         output_dir: Directory to save extracted screenshots.
-        goal: Task description/goal for the episode.
-        session_id: Identifier for the session. If None, generates a UUID.
+        instruction: Task description/instruction for the episode.
         include_moves: Whether to include mouse move events.
 
     Returns:
-        Session containing a single Episode.
+        List containing a single Episode.
     """
     episode = capture_to_episode(
         capture_path=capture_path,
         output_dir=output_dir,
-        goal=goal,
+        instruction=instruction,
         include_moves=include_moves,
     )
-
-    if session_id is None:
-        session_id = f"session_{uuid.uuid4().hex[:8]}"
-
-    return Session(
-        id=session_id,
-        episodes=[episode],
-        meta={
-            "source": "openadapt-capture",
-            "capture_path": str(capture_path),
-        },
-    )
+    return [episode]
 
 
-def load_captures_as_sessions(
+def load_captures_as_episodes(
     captures_dir: str | Path,
     output_dir: str | Path | None = None,
     include_moves: bool = False,
-) -> list[Session]:
+) -> list[Episode]:
     """Load multiple captures from a directory.
 
     Scans for subdirectories containing capture.db files.
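For callers migrating from 0.1.0, the Session wrapper is gone: `capture_to_session` becomes `capture_to_episodes`, which returns a plain `list[Episode]`, and the `goal`/`session_id` arguments are replaced by `instruction`. A before/after sketch (the capture path is illustrative):

    # 0.1.0
    # session = capture_to_session("captures/turn-off-nightshift")
    # episode = session.episodes[0]

    # 0.2.1
    episodes = capture_to_episodes("captures/turn-off-nightshift")
    episode = episodes[0]
    print(episode.instruction)  # derived from the directory name if not supplied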
@@ -284,10 +295,10 @@ def load_captures_as_sessions(
         include_moves: Whether to include mouse move events.
 
     Returns:
-        List of Sessions, one per capture.
+        List of Episodes, one per capture.
     """
     captures_dir = Path(captures_dir)
-    sessions = []
+    episodes = []
 
     # Find all capture.db files
     for db_path in captures_dir.glob("**/capture.db"):
@@ -300,13 +311,13 @@ def load_captures_as_sessions(
             capture_output = None
 
         try:
-            session = capture_to_session(
+            episode = capture_to_episode(
                 capture_path=capture_path,
                 output_dir=capture_output,
                 include_moves=include_moves,
             )
-            sessions.append(session)
+            episodes.append(episode)
         except Exception as e:
             print(f"Warning: Failed to load {capture_path}: {e}")
 
-    return sessions
+    return episodes
@@ -8,10 +8,10 @@ from __future__ import annotations
 
 import json
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Union
 
-from openadapt_ml.schemas.sessions import Action, Episode, Observation, Step
-from openadapt_ml.schemas.validation import validate_episodes, summarize_episodes
+
+from openadapt_ml.schema import Action, ActionType, Episode, Observation, Step
 
 
 def load_episodes(
@@ -52,7 +52,7 @@ def load_episodes(
 
     if path.is_file():
         # Single JSON file
-        episodes = _load_episodes_from_file(path)
+        episodes = _load_episodes_from_file(path, validate=validate)
     elif path.is_dir():
         # Directory of JSON files
         json_files = sorted(path.glob("*.json"))
@@ -60,15 +60,15 @@ def load_episodes(
             raise ValueError(f"No JSON files found in {path}")
 
         for json_file in json_files:
-            file_episodes = _load_episodes_from_file(json_file)
+            file_episodes = _load_episodes_from_file(json_file, validate=validate)
             episodes.extend(file_episodes)
     else:
         raise ValueError(f"Path must be a file or directory: {path}")
 
-    if validate:
-        warnings = validate_episodes(episodes, check_images=check_images)
+    if check_images:
+        warnings = _check_episode_images(episodes)
         if warnings:
-            print(f"Validation warnings ({len(warnings)}):")
+            print(f"Image warnings ({len(warnings)}):")
             for w in warnings[:10]:  # Show first 10
                 print(f"  - {w}")
             if len(warnings) > 10:
@@ -77,7 +77,21 @@ def load_episodes(
     return episodes
 
 
-def _load_episodes_from_file(path: Path) -> List[Episode]:
+def _check_episode_images(episodes: List[Episode]) -> List[str]:
+    """Check that all referenced images exist on disk."""
+    warnings = []
+    for ep in episodes:
+        for step in ep.steps:
+            if step.observation.screenshot_path:
+                if not Path(step.observation.screenshot_path).exists():
+                    warnings.append(
+                        f"Episode {ep.episode_id}, step {step.step_index}: "
+                        f"Image not found: {step.observation.screenshot_path}"
+                    )
+    return warnings
+
+
+def _load_episodes_from_file(path: Path, validate: bool = True) -> List[Episode]:
     """Load episodes from a single JSON file."""
     with open(path, "r") as f:
         data = json.load(f)
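With validation now handled by the schema itself, `validate` controls pydantic validation per episode and `check_images` triggers the new on-disk image check. A sketch of the call, using the parameters shown above (the file path is illustrative):

    episodes = load_episodes(
        "data/episodes.json",
        validate=True,      # Episode.model_validate on each parsed episode
        check_images=True,  # warn about missing screenshot files
    )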
@@ -85,75 +99,123 @@ def _load_episodes_from_file(path: Path) -> List[Episode]:
     # Handle different JSON structures
     if isinstance(data, list):
         # List of episodes
-        return [_dict_to_episode(ep) for ep in data]
+        return [_dict_to_episode(ep, validate=validate) for ep in data]
     elif isinstance(data, dict):
         # Single episode or wrapped format
         if "episodes" in data:
-            return [_dict_to_episode(ep) for ep in data["episodes"]]
-        elif "id" in data and "goal" in data:
-            # Single episode
-            return [_dict_to_episode(data)]
+            return [_dict_to_episode(ep, validate=validate) for ep in data["episodes"]]
+        elif "episode_id" in data or "id" in data:
+            # Single episode (support both old and new field names)
+            return [_dict_to_episode(data, validate=validate)]
         else:
             raise ValueError(f"Unrecognized JSON format in {path}")
     else:
         raise ValueError(f"Expected list or dict in {path}, got {type(data)}")
 
 
-def _dict_to_episode(data: Dict[str, Any]) -> Episode:
+def _parse_action_type(type_str: str) -> ActionType:
+    """Parse action type string to ActionType enum."""
+    # Handle common mappings from old format
+    type_map = {
+        "unknown": ActionType.CLICK,
+        "double_click": ActionType.DOUBLE_CLICK,
+        "right_click": ActionType.RIGHT_CLICK,
+        "key_press": ActionType.KEY,
+    }
+
+    type_lower = type_str.lower()
+    if type_lower in type_map:
+        return type_map[type_lower]
+
+    # Try direct enum lookup
+    try:
+        return ActionType(type_lower)
+    except ValueError:
+        # Default to CLICK for unknown types
+        return ActionType.CLICK
+
+
+def _dict_to_episode(data: Dict[str, Any], validate: bool = True) -> Episode:
     """Convert a dictionary to an Episode object."""
     steps = []
-    for step_data in data.get("steps", []):
+    for step_idx, step_data in enumerate(data.get("steps", [])):
         # Parse observation
         obs_data = step_data.get("observation", {})
         observation = Observation(
-            image_path=obs_data.get("image_path"),
-            meta=obs_data.get("meta"),
-            accessibility_tree=obs_data.get("accessibility_tree"),
-            dom_html=obs_data.get("dom_html"),
-            url=obs_data.get("url"),
+            screenshot_path=obs_data.get("screenshot_path")
+            or obs_data.get("image_path"),
+            raw=obs_data.get("raw") or obs_data.get("meta"),
+            a11y_tree=obs_data.get("a11y_tree") or obs_data.get("accessibility_tree"),
+            dom=obs_data.get("dom") or obs_data.get("dom_html"),
             window_title=obs_data.get("window_title"),
-            app_name=obs_data.get("app_name"),
             focused_element=obs_data.get("focused_element"),
         )
 
         # Parse action
         action_data = step_data.get("action", {})
+
+        # Handle action type (string -> enum)
+        action_type_raw = action_data.get("type", "click")
+        action_type = _parse_action_type(action_type_raw)
+
+        # Handle coordinates: convert x,y to normalized_coordinates tuple
+        normalized_coords = None
+        if action_data.get("normalized_coordinates"):
+            normalized_coords = tuple(action_data["normalized_coordinates"])
+        elif action_data.get("x") is not None and action_data.get("y") is not None:
+            normalized_coords = (action_data["x"], action_data["y"])
+
+        # Handle end coordinates for drag actions
+        normalized_end = None
+        if action_data.get("normalized_end"):
+            normalized_end = tuple(action_data["normalized_end"])
+        elif (
+            action_data.get("end_x") is not None
+            and action_data.get("end_y") is not None
+        ):
+            normalized_end = (action_data["end_x"], action_data["end_y"])
+
         action = Action(
-            type=action_data.get("type", "unknown"),
-            x=action_data.get("x"),
-            y=action_data.get("y"),
+            type=action_type,
+            normalized_coordinates=normalized_coords,
+            normalized_end=normalized_end,
             text=action_data.get("text"),
             raw=action_data.get("raw"),
-            bbox=tuple(action_data["bbox"]) if action_data.get("bbox") else None,
-            element_index=action_data.get("element_index"),
-            target_node_id=action_data.get("target_node_id"),
-            target_role=action_data.get("target_role"),
-            target_name=action_data.get("target_name"),
             key=action_data.get("key"),
             modifiers=action_data.get("modifiers"),
             scroll_direction=action_data.get("scroll_direction"),
             scroll_amount=action_data.get("scroll_amount"),
-            end_x=action_data.get("end_x"),
-            end_y=action_data.get("end_y"),
-            answer=action_data.get("answer"),
         )
 
+        # Handle step index and timestamp
+        step_index = step_data.get("step_index", step_idx)
+        timestamp = step_data.get("timestamp") or step_data.get("t")
+
         step = Step(
-            t=step_data.get("t", 0.0),
+            step_index=step_index,
             observation=observation,
             action=action,
-            thought=step_data.get("thought"),
+            reasoning=step_data.get("reasoning") or step_data.get("thought"),
+            timestamp=timestamp,
         )
         steps.append(step)
 
-    return Episode(
-        id=data.get("id", "unknown"),
-        goal=data.get("goal", ""),
-        steps=steps,
-        summary=data.get("summary"),
-        success=data.get("success"),
-        workflow_id=data.get("workflow_id"),
-    )
+    # Build episode with field mapping (old -> new)
+    episode_data = {
+        "episode_id": data.get("episode_id") or data.get("id", "unknown"),
+        "instruction": data.get("instruction") or data.get("goal", ""),
+        "steps": steps,
+        "success": data.get("success"),
+        "metadata": {
+            "summary": data.get("summary"),
+            "workflow_id": data.get("workflow_id"),
+        },
+    }
+
+    if validate:
+        return Episode.model_validate(episode_data)
+    else:
+        return Episode(**episode_data)
 
 
 def save_episodes(
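The loader keeps 0.1.0 JSON loadable: old field names (`id`, `goal`, `t`, `thought`, `x`/`y`, `image_path`) are mapped onto the new schema during parsing. A minimal sketch of an old-format dict surviving into the new Episode (field values are illustrative, and this assumes the new schema accepts the mapped fields as shown):

    old = {
        "id": "ep-001",                # -> episode_id
        "goal": "Open settings",       # -> instruction
        "steps": [{
            "t": 0.0,                  # -> timestamp; step_index falls back to enumerate order
            "observation": {"image_path": "shots/0.png"},          # -> screenshot_path
            "action": {"type": "key_press", "x": 0.4, "y": 0.6},
            "thought": "Press Enter",  # -> reasoning
        }],
    }
    ep = _dict_to_episode(old)
    assert ep.episode_id == "ep-001"
    assert ep.steps[0].action.type is ActionType.KEY                # "key_press" remapped
    assert ep.steps[0].action.normalized_coordinates == (0.4, 0.6)  # x,y folded into a tuple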
@@ -178,9 +240,9 @@ def save_episodes(
 
     with open(path, "w") as f:
         if pretty:
-            json.dump(data, f, indent=2)
+            json.dump(data, f, indent=2, default=str)
         else:
-            json.dump(data, f)
+            json.dump(data, f, default=str)
 
 
 def _episode_to_dict(episode: Episode) -> Dict[str, Any]:
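The added `default=str` makes `json.dump` stringify any value it cannot serialize natively — relevant now that `metadata` can carry arbitrary objects such as datetimes or Paths. A standalone illustration:

    import datetime
    import json

    json.dumps({"saved_at": datetime.datetime(2025, 1, 1)}, default=str)
    # '{"saved_at": "2025-01-01 00:00:00"}'  (raises TypeError without default=str)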
@@ -188,45 +250,34 @@ def _episode_to_dict(episode: Episode) -> Dict[str, Any]:
     steps = []
     for step in episode.steps:
         step_dict = {
-            "t": step.t,
+            "step_index": step.step_index,
+            "timestamp": step.timestamp,
             "observation": {
-                "image_path": step.observation.image_path,
-                "meta": step.observation.meta,
-                "accessibility_tree": step.observation.accessibility_tree,
-                "dom_html": step.observation.dom_html,
-                "url": step.observation.url,
+                "screenshot_path": step.observation.screenshot_path,
+                "raw": step.observation.raw,
+                "a11y_tree": step.observation.a11y_tree,
+                "dom": step.observation.dom,
                 "window_title": step.observation.window_title,
-                "app_name": step.observation.app_name,
-                "focused_element": step.observation.focused_element,
             },
             "action": {
-                "type": step.action.type,
-                "x": step.action.x,
-                "y": step.action.y,
+                "type": step.action.type.value,
+                "normalized_coordinates": step.action.normalized_coordinates,
+                "normalized_end": step.action.normalized_end,
                 "text": step.action.text,
                 "raw": step.action.raw,
-                "bbox": list(step.action.bbox) if step.action.bbox else None,
-                "element_index": step.action.element_index,
-                "target_node_id": step.action.target_node_id,
-                "target_role": step.action.target_role,
-                "target_name": step.action.target_name,
                 "key": step.action.key,
                 "modifiers": step.action.modifiers,
                 "scroll_direction": step.action.scroll_direction,
                 "scroll_amount": step.action.scroll_amount,
-                "end_x": step.action.end_x,
-                "end_y": step.action.end_y,
-                "answer": step.action.answer,
             },
-            "thought": step.thought,
+            "reasoning": step.reasoning,
         }
         steps.append(step_dict)
 
     return {
-        "id": episode.id,
-        "goal": episode.goal,
+        "episode_id": episode.episode_id,
+        "instruction": episode.instruction,
         "steps": steps,
-        "summary": episode.summary,
         "success": episode.success,
-        "workflow_id": episode.workflow_id,
+        "metadata": episode.metadata,
     }
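Taken together, the serializer and loader should round-trip the new field names (enum types are written via `.value` and re-parsed by `_parse_action_type`). A hedged sketch — the `save_episodes` argument order and names beyond `pretty` are assumed, since the full signature is not shown in this diff:

    save_episodes(episodes, "out/episodes.json", pretty=True)
    reloaded = load_episodes("out/episodes.json")
    assert reloaded[0].episode_id == episodes[0].episode_id
    assert reloaded[0].steps[0].action.type is episodes[0].steps[0].action.type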