openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -107
- openadapt_ml/benchmarks/agent.py +297 -374
- openadapt_ml/benchmarks/azure.py +62 -24
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1874 -751
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +1236 -0
- openadapt_ml/benchmarks/vm_monitor.py +1111 -0
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +3194 -89
- openadapt_ml/cloud/ssh_tunnel.py +595 -0
- openadapt_ml/datasets/next_action.py +125 -96
- openadapt_ml/evals/grounding.py +32 -9
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +120 -57
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +732 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +277 -0
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +11 -10
- openadapt_ml/ingest/capture.py +97 -86
- openadapt_ml/ingest/loader.py +120 -69
- openadapt_ml/ingest/synthetic.py +344 -193
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +843 -0
- openadapt_ml/retrieval/embeddings.py +630 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +162 -0
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +27 -14
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +113 -0
- openadapt_ml/schema/converters.py +588 -0
- openadapt_ml/schema/episode.py +470 -0
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +102 -61
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +19 -14
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +16 -17
- openadapt_ml/scripts/train.py +98 -75
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +3255 -19
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +255 -441
- openadapt_ml/training/trl_trainer.py +403 -0
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/runner.py +0 -381
- openadapt_ml/benchmarks/waa.py +0 -704
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/ingest/capture.py
CHANGED
@@ -6,25 +6,24 @@ and convert them to the Episode/Step format used by openadapt-ml for training.
 
 from __future__ import annotations
 
-import uuid
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from openadapt_ml.
+from openadapt_ml.schema import Action, ActionType, Episode, Observation, Step
 
 if TYPE_CHECKING:
     from PIL import Image
 
-# Event type mapping from openadapt-capture to openadapt-ml
+# Event type mapping from openadapt-capture to openadapt-ml ActionType
 EVENT_TYPE_MAP = {
-    "mouse.singleclick":
-    "mouse.click":
-    "mouse.doubleclick":
-    "mouse.drag":
-    "mouse.scroll":
-    "key.type":
-    "key.down":
-    "key.up":
+    "mouse.singleclick": ActionType.CLICK,
+    "mouse.click": ActionType.CLICK,
+    "mouse.doubleclick": ActionType.DOUBLE_CLICK,
+    "mouse.drag": ActionType.DRAG,
+    "mouse.scroll": ActionType.SCROLL,
+    "key.type": ActionType.TYPE,
+    "key.down": ActionType.KEY,
+    "key.up": ActionType.KEY,
 }
@@ -33,7 +32,7 @@ def _normalize_coords(
     y: float | None,
     screen_width: int,
     screen_height: int,
-) -> tuple[float
+) -> tuple[float, float] | None:
     """Normalize pixel coordinates to [0, 1] range.
 
     Args:
@@ -43,11 +42,11 @@ def _normalize_coords(
         screen_height: Screen height in pixels.
 
     Returns:
-        Tuple of (normalized_x, normalized_y).
+        Tuple of (normalized_x, normalized_y) or None if coords are None.
     """
     if x is None or y is None:
-        return None
-    return x / screen_width, y / screen_height
+        return None
+    return (x / screen_width, y / screen_height)
 
 
 def _save_screenshot(
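The widened return type makes the None pass-through explicit. A standalone sketch of the contract implied above (illustrative only, not code from the package):

    # Sketch of the _normalize_coords contract: pixels -> [0, 1] fractions,
    # with missing coordinates propagating as None.
    def normalize(x, y, screen_width, screen_height):
        if x is None or y is None:
            return None
        return (x / screen_width, y / screen_height)

    assert normalize(960, 540, 1920, 1080) == (0.5, 0.5)
    assert normalize(None, 540, 1920, 1080) is None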
@@ -77,7 +76,7 @@ def _save_screenshot(
 def capture_to_episode(
     capture_path: str | Path,
     output_dir: str | Path | None = None,
-
+    instruction: str | None = None,
     episode_id: str | None = None,
     include_moves: bool = False,
 ) -> Episode:
@@ -87,8 +86,8 @@ def capture_to_episode(
         capture_path: Path to the capture directory.
         output_dir: Directory to save extracted screenshots. If None, uses
             capture_path/screenshots.
-
-
+        instruction: Task description/instruction for the episode. If None, uses
+            capture's task_description or a generic message.
         episode_id: Identifier for the episode. If None, generates a UUID.
         include_moves: Whether to include mouse move events.
@@ -101,7 +100,7 @@ def capture_to_episode(
     """
     try:
         from openadapt_capture import Capture
-        from openadapt_capture.events import (
+        from openadapt_capture.events import (  # noqa: F401
             EventType,
             KeyTypeEvent,
             MouseClickEvent,
@@ -126,18 +125,20 @@ def capture_to_episode(
     if episode_id is None:
         episode_id = f"capture_{capture.id}"
 
-    # Get
-    if
+    # Get instruction from capture or derive from context
+    if instruction is None:
         if capture.task_description:
-
+            instruction = capture.task_description
         else:
-            # Try to derive
+            # Try to derive instruction from directory name (e.g., "turn-off-nightshift" -> "Turn off nightshift")
             dir_name = capture_path.name
             if dir_name and dir_name != "capture":
                 # Convert kebab-case/snake_case to readable text
-
+                instruction = (
+                    dir_name.replace("-", " ").replace("_", " ").strip().capitalize()
+                )
             else:
-
+                instruction = "Complete the recorded workflow"
 
     # Get screen dimensions for coordinate normalization
     screen_width, screen_height = capture.screen_size
@@ -152,22 +153,19 @@ def capture_to_episode(
             continue
 
         # Save screenshot
-
+        screenshot_path = _save_screenshot(screenshot, output_dir, episode_id, idx)
 
         # Normalize coordinates
-
-            action.x, action.y, screen_width, screen_height
-        )
+        norm_coords = _normalize_coords(action.x, action.y, screen_width, screen_height)
 
-        # Map event type to openadapt-ml
+        # Map event type to openadapt-ml ActionType
         event_type = action.type
-        action_type = EVENT_TYPE_MAP.get(event_type,
+        action_type = EVENT_TYPE_MAP.get(event_type, ActionType.CLICK)
 
         # Build Action object
         ml_action = Action(
             type=action_type,
-
-            y=norm_y,
+            normalized_coordinates=norm_coords,
             text=action.text,
         )
@@ -175,34 +173,52 @@ def capture_to_episode(
         if isinstance(action.event, MouseDragEvent):
             end_x = action.event.x + action.event.dx
             end_y = action.event.y + action.event.dy
-
-
+            norm_end = _normalize_coords(end_x, end_y, screen_width, screen_height)
+            ml_action = ml_action.model_copy(
+                update={
+                    "normalized_end": norm_end,
+                    "raw": {
+                        "button": action.event.button,
+                    },
+                }
             )
-            ml_action.raw = {
-                "end_x": norm_end_x,
-                "end_y": norm_end_y,
-                "button": action.event.button,
-            }
 
         # Handle scroll events
         if isinstance(action.event, MouseScrollEvent):
-
-
-
-
+            # Determine scroll direction from dx/dy
+            scroll_direction = None
+            if action.event.dy > 0:
+                scroll_direction = "down"
+            elif action.event.dy < 0:
+                scroll_direction = "up"
+            elif action.event.dx > 0:
+                scroll_direction = "right"
+            elif action.event.dx < 0:
+                scroll_direction = "left"
+
+            ml_action = ml_action.model_copy(
+                update={
+                    "scroll_direction": scroll_direction,
+                    "raw": {
+                        "dx": action.event.dx,
+                        "dy": action.event.dy,
+                    },
+                }
+            )
 
         # Handle keyboard events - include key names for special keys
         if action.keys:
-
-
-            ml_action.raw
+            raw = ml_action.raw or {}
+            raw["keys"] = action.keys
+            ml_action = ml_action.model_copy(update={"raw": raw})
 
         # Create Step
         step = Step(
-
-            observation=Observation(
+            step_index=idx,
+            observation=Observation(screenshot_path=screenshot_path),
             action=ml_action,
-
+            reasoning=None,  # Real recordings don't have reasoning
+            timestamp=action.timestamp - start_time,
         )
         steps.append(step)
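Note the shift from mutating ml_action.raw in place to ml_action.model_copy(update={...}), which suggests Action is now an immutable (likely frozen Pydantic v2) model. A minimal sketch of that pattern using a stand-in model, not the real Action class:

    from pydantic import BaseModel, ConfigDict

    class FrozenAction(BaseModel):  # stand-in; the real class is openadapt_ml.schema.Action
        model_config = ConfigDict(frozen=True)  # attribute assignment raises
        type: str
        raw: dict | None = None

    a = FrozenAction(type="scroll")
    b = a.model_copy(update={"raw": {"dx": 0, "dy": -3}})  # returns a new instance
    assert a.raw is None and b.raw == {"dx": 0, "dy": -3}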
@@ -211,69 +227,64 @@ def capture_to_episode(
     # Use the last screenshot for the done action
     last_step = steps[-1]
     done_step = Step(
-
-        observation=Observation(
-
-
+        step_index=len(steps),
+        observation=Observation(
+            screenshot_path=last_step.observation.screenshot_path
+        ),
+        action=Action(type=ActionType.DONE),
+        reasoning="Workflow complete.",
+        timestamp=(last_step.timestamp or 0) + 0.1,
     )
     steps.append(done_step)
 
     capture.close()
 
     return Episode(
-
-
+        episode_id=episode_id,
+        instruction=instruction,
         steps=steps,
-        summary=f"Real recording with {len(steps)} steps",
         success=True,
-
+        metadata={
+            "summary": f"Real recording with {len(steps)} steps",
+            "workflow_id": capture.id,
+        },
     )
 
 
-def
+def capture_to_episodes(
     capture_path: str | Path,
     output_dir: str | Path | None = None,
-
-    session_id: str | None = None,
+    instruction: str | None = None,
     include_moves: bool = False,
-) ->
-    """Convert an openadapt-capture recording to a
+) -> list[Episode]:
+    """Convert an openadapt-capture recording to a list with one Episode.
+
+    This is a convenience function that returns episodes as a list for consistency
+    with the new schema (which uses list[Episode] instead of Session).
 
     Args:
         capture_path: Path to the capture directory.
         output_dir: Directory to save extracted screenshots.
-
-        session_id: Identifier for the session. If None, generates a UUID.
+        instruction: Task description/instruction for the episode.
        include_moves: Whether to include mouse move events.
 
     Returns:
-
+        List containing a single Episode.
     """
     episode = capture_to_episode(
         capture_path=capture_path,
         output_dir=output_dir,
-
+        instruction=instruction,
         include_moves=include_moves,
     )
-
-    if session_id is None:
-        session_id = f"session_{uuid.uuid4().hex[:8]}"
-
-    return Session(
-        id=session_id,
-        episodes=[episode],
-        meta={
-            "source": "openadapt-capture",
-            "capture_path": str(capture_path),
-        },
-    )
+    return [episode]
 
 
-def
+def load_captures_as_episodes(
     captures_dir: str | Path,
     output_dir: str | Path | None = None,
     include_moves: bool = False,
-) -> list[
+) -> list[Episode]:
     """Load multiple captures from a directory.
 
     Scans for subdirectories containing capture.db files.
@@ -284,10 +295,10 @@ def load_captures_as_sessions(
         include_moves: Whether to include mouse move events.
 
     Returns:
-        List of
+        List of Episodes, one per capture.
     """
     captures_dir = Path(captures_dir)
-
+    episodes = []
 
     # Find all capture.db files
     for db_path in captures_dir.glob("**/capture.db"):
@@ -300,13 +311,13 @@ def load_captures_as_sessions(
         capture_output = None
 
         try:
-
+            episode = capture_to_episode(
                 capture_path=capture_path,
                 output_dir=capture_output,
                 include_moves=include_moves,
             )
-
+            episodes.append(episode)
         except Exception as e:
             print(f"Warning: Failed to load {capture_path}: {e}")
 
-    return
+    return episodes
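Taken together, these changes replace the Session-returning helpers (load_captures_as_sessions and friends) with Episode-based equivalents. A hedged usage sketch of the 0.2.1 surface; the capture paths are placeholders:

    from openadapt_ml.ingest.capture import (
        capture_to_episode,
        capture_to_episodes,
        load_captures_as_episodes,
    )

    # One recording -> Episode; instruction falls back to task_description,
    # then to a readable form of the directory name ("Turn off nightshift").
    episode = capture_to_episode("captures/turn-off-nightshift")

    # Same recording as a one-element list, matching the list[Episode] convention.
    episodes = capture_to_episodes("captures/turn-off-nightshift")

    # Every capture.db found under a directory, one Episode per capture.
    all_episodes = load_captures_as_episodes("captures/")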
openadapt_ml/ingest/loader.py
CHANGED
@@ -8,10 +8,10 @@ from __future__ import annotations
 
 import json
 from pathlib import Path
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Union
 
-from openadapt_ml.
+from openadapt_ml.schema import Action, ActionType, Episode, Observation, Step
 
 
 def load_episodes(
@@ -52,7 +52,7 @@ def load_episodes(
 
     if path.is_file():
         # Single JSON file
-        episodes = _load_episodes_from_file(path)
+        episodes = _load_episodes_from_file(path, validate=validate)
     elif path.is_dir():
         # Directory of JSON files
         json_files = sorted(path.glob("*.json"))
@@ -60,15 +60,15 @@ def load_episodes(
             raise ValueError(f"No JSON files found in {path}")
 
         for json_file in json_files:
-            file_episodes = _load_episodes_from_file(json_file)
+            file_episodes = _load_episodes_from_file(json_file, validate=validate)
             episodes.extend(file_episodes)
     else:
         raise ValueError(f"Path must be a file or directory: {path}")
 
-    if
-        warnings =
+    if check_images:
+        warnings = _check_episode_images(episodes)
         if warnings:
-            print(f"
+            print(f"Image warnings ({len(warnings)}):")
             for w in warnings[:10]:  # Show first 10
                 print(f"  - {w}")
             if len(warnings) > 10:
@@ -77,7 +77,21 @@ def load_episodes(
     return episodes
 
 
-def
+def _check_episode_images(episodes: List[Episode]) -> List[str]:
+    """Check that all referenced images exist on disk."""
+    warnings = []
+    for ep in episodes:
+        for step in ep.steps:
+            if step.observation.screenshot_path:
+                if not Path(step.observation.screenshot_path).exists():
+                    warnings.append(
+                        f"Episode {ep.episode_id}, step {step.step_index}: "
+                        f"Image not found: {step.observation.screenshot_path}"
+                    )
+    return warnings
+
+
+def _load_episodes_from_file(path: Path, validate: bool = True) -> List[Episode]:
     """Load episodes from a single JSON file."""
     with open(path, "r") as f:
         data = json.load(f)
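With the helper above, load_episodes(..., check_images=True) reports missing screenshots without failing the load. A sketch of the output shape, assuming the path is accepted positionally; paths and counts are illustrative:

    from openadapt_ml.ingest.loader import load_episodes

    episodes = load_episodes("episodes.json", check_images=True)
    # Image warnings (2):
    #   - Episode ep-001, step 0: Image not found: shots/0.png
    #   - Episode ep-001, step 3: Image not found: shots/3.png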
@@ -85,75 +99,123 @@ def _load_episodes_from_file(path: Path) -> List[Episode]:
     # Handle different JSON structures
     if isinstance(data, list):
         # List of episodes
-        return [_dict_to_episode(ep) for ep in data]
+        return [_dict_to_episode(ep, validate=validate) for ep in data]
     elif isinstance(data, dict):
         # Single episode or wrapped format
         if "episodes" in data:
-            return [_dict_to_episode(ep) for ep in data["episodes"]]
-        elif "
-            # Single episode
-            return [_dict_to_episode(data)]
+            return [_dict_to_episode(ep, validate=validate) for ep in data["episodes"]]
+        elif "episode_id" in data or "id" in data:
+            # Single episode (support both old and new field names)
+            return [_dict_to_episode(data, validate=validate)]
         else:
             raise ValueError(f"Unrecognized JSON format in {path}")
     else:
         raise ValueError(f"Expected list or dict in {path}, got {type(data)}")
 
 
-def
+def _parse_action_type(type_str: str) -> ActionType:
+    """Parse action type string to ActionType enum."""
+    # Handle common mappings from old format
+    type_map = {
+        "unknown": ActionType.CLICK,
+        "double_click": ActionType.DOUBLE_CLICK,
+        "right_click": ActionType.RIGHT_CLICK,
+        "key_press": ActionType.KEY,
+    }
+
+    type_lower = type_str.lower()
+    if type_lower in type_map:
+        return type_map[type_lower]
+
+    # Try direct enum lookup
+    try:
+        return ActionType(type_lower)
+    except ValueError:
+        # Default to CLICK for unknown types
+        return ActionType.CLICK
+
+
+def _dict_to_episode(data: Dict[str, Any], validate: bool = True) -> Episode:
     """Convert a dictionary to an Episode object."""
     steps = []
-    for step_data in data.get("steps", []):
+    for step_idx, step_data in enumerate(data.get("steps", [])):
         # Parse observation
         obs_data = step_data.get("observation", {})
         observation = Observation(
-
-
-
-
-
+            screenshot_path=obs_data.get("screenshot_path")
+            or obs_data.get("image_path"),
+            raw=obs_data.get("raw") or obs_data.get("meta"),
+            a11y_tree=obs_data.get("a11y_tree") or obs_data.get("accessibility_tree"),
+            dom=obs_data.get("dom") or obs_data.get("dom_html"),
             window_title=obs_data.get("window_title"),
-            app_name=obs_data.get("app_name"),
             focused_element=obs_data.get("focused_element"),
         )
 
         # Parse action
         action_data = step_data.get("action", {})
+
+        # Handle action type (string -> enum)
+        action_type_raw = action_data.get("type", "click")
+        action_type = _parse_action_type(action_type_raw)
+
+        # Handle coordinates: convert x,y to normalized_coordinates tuple
+        normalized_coords = None
+        if action_data.get("normalized_coordinates"):
+            normalized_coords = tuple(action_data["normalized_coordinates"])
+        elif action_data.get("x") is not None and action_data.get("y") is not None:
+            normalized_coords = (action_data["x"], action_data["y"])
+
+        # Handle end coordinates for drag actions
+        normalized_end = None
+        if action_data.get("normalized_end"):
+            normalized_end = tuple(action_data["normalized_end"])
+        elif (
+            action_data.get("end_x") is not None
+            and action_data.get("end_y") is not None
+        ):
+            normalized_end = (action_data["end_x"], action_data["end_y"])
+
         action = Action(
-            type=
-
-
+            type=action_type,
+            normalized_coordinates=normalized_coords,
+            normalized_end=normalized_end,
             text=action_data.get("text"),
             raw=action_data.get("raw"),
-            bbox=tuple(action_data["bbox"]) if action_data.get("bbox") else None,
-            element_index=action_data.get("element_index"),
-            target_node_id=action_data.get("target_node_id"),
-            target_role=action_data.get("target_role"),
-            target_name=action_data.get("target_name"),
             key=action_data.get("key"),
             modifiers=action_data.get("modifiers"),
             scroll_direction=action_data.get("scroll_direction"),
             scroll_amount=action_data.get("scroll_amount"),
-            end_x=action_data.get("end_x"),
-            end_y=action_data.get("end_y"),
-            answer=action_data.get("answer"),
         )
 
+        # Handle step index and timestamp
+        step_index = step_data.get("step_index", step_idx)
+        timestamp = step_data.get("timestamp") or step_data.get("t")
+
         step = Step(
-
+            step_index=step_index,
             observation=observation,
             action=action,
-
+            reasoning=step_data.get("reasoning") or step_data.get("thought"),
+            timestamp=timestamp,
         )
         steps.append(step)
 
-
-
-
-
-
-    success
-
-
+    # Build episode with field mapping (old -> new)
+    episode_data = {
+        "episode_id": data.get("episode_id") or data.get("id", "unknown"),
+        "instruction": data.get("instruction") or data.get("goal", ""),
+        "steps": steps,
+        "success": data.get("success"),
+        "metadata": {
+            "summary": data.get("summary"),
+            "workflow_id": data.get("workflow_id"),
+        },
+    }
+
+    if validate:
+        return Episode.model_validate(episode_data)
+    else:
+        return Episode(**episode_data)
 
 
 def save_episodes(
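The rewritten _dict_to_episode accepts both old and new field names, so pre-0.2 JSON should still load. An illustrative old-format record exercising the fallbacks (values are made up, and load_episodes is assumed to take the path positionally):

    import json, tempfile
    from openadapt_ml.ingest.loader import load_episodes

    old_style = {
        "id": "ep-001",                    # old name for episode_id
        "goal": "Turn off nightshift",     # old name for instruction
        "steps": [{
            "observation": {"image_path": "shots/0.png"},  # old name for screenshot_path
            "action": {"type": "double_click", "x": 0.5, "y": 0.25},
            "thought": "Open settings",    # old name for reasoning
        }],
    }
    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
        json.dump(old_style, f)

    (ep,) = load_episodes(f.name, check_images=False)
    assert ep.episode_id == "ep-001"
    assert ep.steps[0].action.type.name == "DOUBLE_CLICK"
    assert ep.steps[0].action.normalized_coordinates == (0.5, 0.25)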
@@ -178,9 +240,9 @@ def save_episodes(
 
     with open(path, "w") as f:
         if pretty:
-            json.dump(data, f, indent=2)
+            json.dump(data, f, indent=2, default=str)
         else:
-            json.dump(data, f)
+            json.dump(data, f, default=str)
 
 
 def _episode_to_dict(episode: Episode) -> Dict[str, Any]:
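The default=str fallback matters because episode dicts can now carry values json cannot encode natively, such as ActionType enums or Path objects left inside metadata or raw (the action type field itself is already written as .value in the next hunk). A standalone sketch of the behavior, using a stand-in enum:

    import json
    from enum import Enum
    from pathlib import Path

    class ActionType(Enum):  # stand-in for the enum in openadapt_ml.schema
        CLICK = "click"

    payload = {"type": ActionType.CLICK, "path": Path("shots/0.png")}
    # json.dumps(payload) would raise TypeError: Object of type ActionType is not JSON serializable
    print(json.dumps(payload, default=str))
    # -> {"type": "ActionType.CLICK", "path": "shots/0.png"}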
@@ -188,45 +250,34 @@ def _episode_to_dict(episode: Episode) -> Dict[str, Any]:
     steps = []
     for step in episode.steps:
         step_dict = {
-            "
+            "step_index": step.step_index,
+            "timestamp": step.timestamp,
             "observation": {
-                "
-                "
-                "
-                "
-                "url": step.observation.url,
+                "screenshot_path": step.observation.screenshot_path,
+                "raw": step.observation.raw,
+                "a11y_tree": step.observation.a11y_tree,
+                "dom": step.observation.dom,
                 "window_title": step.observation.window_title,
-                "app_name": step.observation.app_name,
-                "focused_element": step.observation.focused_element,
             },
             "action": {
-                "type": step.action.type,
-                "
-                "
+                "type": step.action.type.value,
+                "normalized_coordinates": step.action.normalized_coordinates,
+                "normalized_end": step.action.normalized_end,
                 "text": step.action.text,
                 "raw": step.action.raw,
-                "bbox": list(step.action.bbox) if step.action.bbox else None,
-                "element_index": step.action.element_index,
-                "target_node_id": step.action.target_node_id,
-                "target_role": step.action.target_role,
-                "target_name": step.action.target_name,
                 "key": step.action.key,
                 "modifiers": step.action.modifiers,
                 "scroll_direction": step.action.scroll_direction,
                 "scroll_amount": step.action.scroll_amount,
-                "end_x": step.action.end_x,
-                "end_y": step.action.end_y,
-                "answer": step.action.answer,
             },
-            "
+            "reasoning": step.reasoning,
         }
         steps.append(step_dict)
 
     return {
-        "
-        "
+        "episode_id": episode.episode_id,
+        "instruction": episode.instruction,
         "steps": steps,
-        "summary": episode.summary,
         "success": episode.success,
-        "
+        "metadata": episode.metadata,
     }