openadapt_ml-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/benchmarks/base.py
@@ -0,0 +1,366 @@
+ """Base classes for benchmark integration.
+
+ This module provides the core abstractions for integrating GUI agent benchmarks
+ into openadapt-ml. It supports both interactive environments (WAA, OSWorld) and
+ static trajectory datasets (Mind2Web).
+
+ Example:
+     from openadapt_ml.benchmarks import WAAAdapter, evaluate_agent_on_benchmark
+
+     adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
+     results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)
+ """
+
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Any, Iterator
+
+ if TYPE_CHECKING:
+     pass
+
+
+ @dataclass
+ class BenchmarkTask:
+     """Canonical task representation.
+
+     Attributes:
+         task_id: Unique identifier for the task.
+         instruction: Natural language task instruction.
+         domain: Task domain ("web", "desktop", "mobile").
+         initial_state_ref: Reference to initial state (VM snapshot, URL, etc.).
+         time_limit_steps: Maximum steps allowed for the task.
+         raw_config: Original benchmark config (lossless preservation).
+         evaluation_spec: Benchmark-native evaluation specification.
+     """
+
+     task_id: str
+     instruction: str
+     domain: str  # "web", "desktop", "mobile"
+
+     # Environment setup
+     initial_state_ref: str | None = None  # VM snapshot, storage_state, start URL
+     time_limit_steps: int | None = None
+
+     # Preserve original config losslessly
+     raw_config: dict[str, Any] = field(default_factory=dict)
+
+     # Evaluation spec (benchmark-native)
+     evaluation_spec: dict[str, Any] | None = None
+
+
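For reference, a minimal sketch of constructing a task by hand; the task ID, instruction, and config values below are invented for illustration and do not come from any benchmark shipped with the package.

    from openadapt_ml.benchmarks.base import BenchmarkTask

    # Hypothetical values for illustration only.
    task = BenchmarkTask(
        task_id="notepad_save_file",
        instruction="Open Notepad and save a new file named notes.txt",
        domain="desktop",
        initial_state_ref="snapshot-win11-clean",  # e.g. a VM snapshot name
        time_limit_steps=30,
        raw_config={"source": "example"},  # original benchmark config, preserved losslessly
    )
    assert task.evaluation_spec is None  # stays None until an adapter supplies a spec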
+ @dataclass
+ class BenchmarkObservation:
+     """Canonical observation at each step.
+
+     Supports multiple observation modalities:
+     - Visual: screenshots with viewport info
+     - Structured UI: accessibility tree (UIA/AXTree/DOM)
+     - Context: URL, window title, focused element
+
+     Attributes:
+         screenshot: PNG image bytes.
+         screenshot_path: Path to saved screenshot.
+         viewport: (width, height) of the viewport.
+         accessibility_tree: Platform-specific UI tree (UIA/AXTree/DOM).
+         dom_html: Raw HTML for web tasks.
+         url: Current URL for web tasks.
+         window_title: Active window title for desktop tasks.
+         focused_element: Currently focused UI element.
+         raw_observation: Original benchmark observation (lossless).
+     """
+
+     # Visual
+     screenshot: bytes | None = None  # PNG image bytes
+     screenshot_path: str | None = None
+     viewport: tuple[int, int] | None = None  # (width, height)
+
+     # Structured UI (format varies by platform)
+     accessibility_tree: dict | None = None  # UIA (Windows), AXTree (macOS), DOM (web)
+     dom_html: str | None = None  # Raw HTML for web
+
+     # Context
+     url: str | None = None  # For web tasks
+     window_title: str | None = None  # For desktop tasks
+     app_name: str | None = None  # Active application
+     focused_element: dict | None = None  # {node_id, bbox, text}
+
+     # Raw benchmark-specific data (lossless)
+     raw_observation: dict[str, Any] | None = None
+
+
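A single observation can mix the modalities listed above. A sketch of a web-style observation, using placeholder values rather than output from any real adapter:

    from openadapt_ml.benchmarks.base import BenchmarkObservation

    # Placeholder values; a real adapter populates these from its environment.
    obs = BenchmarkObservation(
        screenshot=b"...png bytes...",
        viewport=(1920, 1080),
        url="https://example.com/login",
        dom_html="<html>...</html>",
        focused_element={"node_id": "input-42", "bbox": [0.40, 0.30, 0.60, 0.35], "text": ""},
        raw_observation={"backend": "example"},
    )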
+ @dataclass
+ class BenchmarkAction:
+     """Canonical action representation.
+
+     Supports multiple action types with both coordinate-based and element-based
+     grounding. The "grounding-first" approach stores both when available.
+
+     Attributes:
+         type: Action type ("click", "type", "scroll", "key", "drag", "answer", "done").
+         x: X coordinate (normalized [0,1] or pixels).
+         y: Y coordinate (normalized [0,1] or pixels).
+         target_node_id: Element ID from accessibility tree.
+         target_bbox: Element bounding box.
+         target_role: Element role (button, textfield, etc.).
+         target_name: Element accessible name.
+         text: Text to type (for "type" action).
+         key: Single key (for "key" action, e.g., "Enter", "Tab").
+         modifiers: Key modifiers (["ctrl", "shift", "alt"]).
+         scroll_direction: Scroll direction ("up", "down", "left", "right").
+         scroll_amount: Scroll amount (pixels or normalized).
+         end_x: Drag end X coordinate.
+         end_y: Drag end Y coordinate.
+         answer: Answer string (for benchmarks that score by answer).
+         raw_action: Original benchmark action (lossless).
+     """
+
+     type: str  # "click", "type", "scroll", "key", "drag", "answer", "done"
+
+     # Pointer actions - coordinates
+     x: float | None = None  # Normalized [0,1] or pixel
+     y: float | None = None
+
+     # Element grounding (when available)
+     target_node_id: str | None = None  # DOM/AX/UIA node ID
+     target_bbox: tuple[float, float, float, float] | None = None
+     target_role: str | None = None  # "button", "textfield", etc.
+     target_name: str | None = None  # Accessible name
+
+     # Keyboard actions
+     text: str | None = None  # For "type" action - text to type
+     key: str | None = None  # For "key" action - single key
+     modifiers: list[str] | None = None  # ["ctrl", "shift", "alt"]
+
+     # Scroll actions
+     scroll_direction: str | None = None  # "up", "down", "left", "right"
+     scroll_amount: float | None = None  # Pixels or normalized
+
+     # Drag actions
+     end_x: float | None = None
+     end_y: float | None = None
+
+     # Answer action (some benchmarks score by final answer)
+     answer: str | None = None
+
+     # Raw benchmark-specific format (lossless)
+     raw_action: dict[str, Any] | None = None
+
+
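The "grounding-first" convention means a pointer action can carry pixel (or normalized) coordinates and the resolved element at the same time. A sketch with invented coordinates and node IDs:

    from openadapt_ml.benchmarks.base import BenchmarkAction

    # Click grounded both by coordinates and by the accessibility-tree element.
    click = BenchmarkAction(
        type="click",
        x=0.52,
        y=0.31,
        target_node_id="button-7",
        target_bbox=(0.48, 0.28, 0.56, 0.34),
        target_role="button",
        target_name="Sign in",
    )

    # Keyboard actions reuse the same dataclass.
    shortcut = BenchmarkAction(type="key", key="s", modifiers=["ctrl"])
    typed = BenchmarkAction(type="type", text="hello world")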
+ @dataclass
+ class BenchmarkResult:
+     """Result of a single task evaluation.
+
+     Attributes:
+         task_id: ID of the evaluated task.
+         success: Whether the task was completed successfully.
+         score: Score between 0.0 and 1.0.
+         steps: List of (observation, action) pairs from the trajectory.
+         num_steps: Number of steps taken.
+         error: Error message if task failed due to error.
+         reason: Explanation of success/failure.
+         total_time_seconds: Total time taken for the task.
+     """
+
+     task_id: str
+     success: bool
+     score: float  # 0.0 to 1.0
+
+     # Trajectory
+     steps: list[tuple[BenchmarkObservation, BenchmarkAction]] = field(
+         default_factory=list
+     )
+     num_steps: int = 0
+
+     # Diagnostics
+     error: str | None = None
+     reason: str | None = None  # Why success/fail
+
+     # Timing
+     total_time_seconds: float = 0.0
+
+
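Per-task results are typically rolled up into run-level metrics. A sketch of such a helper; it is not part of this module:

    from openadapt_ml.benchmarks.base import BenchmarkResult

    def summarize(results: list[BenchmarkResult]) -> dict[str, float]:
        # Aggregate per-task results into success rate, mean score, and mean steps.
        if not results:
            return {"success_rate": 0.0, "mean_score": 0.0, "mean_steps": 0.0}
        n = len(results)
        return {
            "success_rate": sum(r.success for r in results) / n,
            "mean_score": sum(r.score for r in results) / n,
            "mean_steps": sum(r.num_steps for r in results) / n,
        }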
+ @dataclass
+ class UIElement:
+     """Normalized UI element for cross-platform use.
+
+     Provides a common representation for UI elements across platforms
+     (Windows UIA, macOS AXTree, web DOM).
+
+     Attributes:
+         node_id: Unique identifier for the element.
+         role: Element role (button, textfield, link, etc.).
+         name: Accessible name/label.
+         bbox: Bounding box (normalized [0,1] or pixels).
+         text: Text content.
+         value: Current value (for inputs).
+         children: Child elements.
+         attributes: Additional platform-specific attributes.
+     """
+
+     node_id: str
+     role: str  # "button", "textfield", "link", etc.
+     name: str | None = None  # Accessible name/label
+     bbox: tuple[float, float, float, float] | None = None  # (x1, y1, x2, y2)
+     text: str | None = None  # Text content
+     value: str | None = None  # Current value (for inputs)
+     children: list[UIElement] | None = None
+     attributes: dict[str, Any] | None = None  # Platform-specific
+
+
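Because children nests recursively, adapters and agents often need a tree walk. A sketch of a depth-first traversal helper (not part of this module), applied to an invented two-node tree:

    from openadapt_ml.benchmarks.base import UIElement

    def iter_elements(root: UIElement):
        # Depth-first walk over a UIElement tree.
        yield root
        for child in root.children or []:
            yield from iter_elements(child)

    root = UIElement(
        node_id="window-1",
        role="window",
        name="Login",
        children=[UIElement(node_id="button-7", role="button", name="Sign in")],
    )
    buttons = [el for el in iter_elements(root) if el.role == "button"]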
+ class BenchmarkAdapter(ABC):
+     """Abstract interface for benchmark integration.
+
+     Subclasses implement this interface to integrate specific benchmarks
+     (WAA, OSWorld, WebArena, etc.) with openadapt-ml.
+
+     Two types of adapters:
+     - Interactive: Run environment, step through tasks (WAA, OSWorld)
+     - Static: Load trajectories for offline training/eval (Mind2Web)
+     """
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Benchmark name (e.g., 'waa', 'osworld', 'webarena')."""
+         pass
+
+     @property
+     @abstractmethod
+     def benchmark_type(self) -> str:
+         """Benchmark type: 'interactive' or 'static'."""
+         pass
+
+     @property
+     def supports_parallel(self) -> bool:
+         """Whether the adapter supports parallel task execution."""
+         return False
+
+     @abstractmethod
+     def list_tasks(self, domain: str | None = None) -> list[BenchmarkTask]:
+         """List available tasks, optionally filtered by domain.
+
+         Args:
+             domain: Optional domain filter (e.g., "browser", "office").
+
+         Returns:
+             List of BenchmarkTask objects.
+         """
+         pass
+
+     @abstractmethod
+     def load_task(self, task_id: str) -> BenchmarkTask:
+         """Load a specific task by ID.
+
+         Args:
+             task_id: Task identifier.
+
+         Returns:
+             BenchmarkTask object.
+
+         Raises:
+             KeyError: If task_id not found.
+         """
+         pass
+
+     @abstractmethod
+     def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
+         """Reset environment to task's initial state.
+
+         Args:
+             task: Task to initialize.
+
+         Returns:
+             Initial observation.
+         """
+         pass
+
+     @abstractmethod
+     def step(
+         self, action: BenchmarkAction
+     ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]:
+         """Execute action and return new observation.
+
+         Args:
+             action: Action to execute.
+
+         Returns:
+             Tuple of (observation, done, info).
+         """
+         pass
+
+     @abstractmethod
+     def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
+         """Run benchmark's native evaluation on current state.
+
+         Args:
+             task: Task to evaluate.
+
+         Returns:
+             BenchmarkResult with success/score.
+         """
+         pass
+
+     def close(self) -> None:
+         """Clean up resources (VMs, browser, etc.)."""
+         pass
+
+     def __enter__(self) -> BenchmarkAdapter:
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+         """Context manager exit."""
+         self.close()
+
+
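The evaluate_agent_on_benchmark helper mentioned in the module docstring is defined elsewhere in the package; against this interface, a single interactive rollout plausibly looks like the sketch below. The agent object and its act() method are assumptions made for illustration, not the package's actual agent API.

    from openadapt_ml.benchmarks.base import BenchmarkAdapter, BenchmarkResult

    def run_task(adapter: BenchmarkAdapter, agent, task_id: str, max_steps: int = 50) -> BenchmarkResult:
        # One rollout: load the task, reset, step until done, then score natively.
        task = adapter.load_task(task_id)
        with adapter:  # __exit__ calls close() even if the rollout fails
            obs = adapter.reset(task)
            for _ in range(task.time_limit_steps or max_steps):
                action = agent.act(obs, task)  # assumed agent interface
                obs, done, info = adapter.step(action)
                if done or action.type == "done":
                    break
            return adapter.evaluate(task)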
+ class StaticDatasetAdapter(BenchmarkAdapter):
+     """Base for static trajectory datasets (Mind2Web, demos).
+
+     Static adapters load pre-recorded trajectories for offline training
+     or evaluation, rather than running an interactive environment.
+     """
+
+     @property
+     def benchmark_type(self) -> str:
+         """Static datasets are not interactive."""
+         return "static"
+
+     @abstractmethod
+     def load_trajectories(
+         self, split: str = "test"
+     ) -> Iterator[tuple[BenchmarkTask, list[tuple[BenchmarkObservation, BenchmarkAction]]]]:
+         """Iterate over expert trajectories.
+
+         Args:
+             split: Dataset split ("train", "val", "test").
+
+         Yields:
+             Tuples of (task, trajectory) where trajectory is a list of
+             (observation, action) pairs.
+         """
+         pass
+
+     def reset(self, task: BenchmarkTask) -> BenchmarkObservation:
+         """Not supported for static datasets."""
+         raise NotImplementedError(
+             "Static datasets don't support interactive reset. "
+             "Use load_trajectories() instead."
+         )
+
+     def step(
+         self, action: BenchmarkAction
+     ) -> tuple[BenchmarkObservation, bool, dict[str, Any]]:
+         """Not supported for static datasets."""
+         raise NotImplementedError(
+             "Static datasets don't support interactive stepping. "
+             "Use load_trajectories() instead."
+         )
+
+     def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
+         """Not supported for static datasets."""
+         raise NotImplementedError(
+             "Static datasets don't support execution-based evaluation. "
+             "Use offline metrics instead."
+ )
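For static adapters, the equivalent entry point is load_trajectories(). A sketch of flattening expert trajectories into supervised (instruction, observation, action) examples; any concrete StaticDatasetAdapter subclass (for example a Mind2Web adapter) would be passed in as adapter:

    def collect_pairs(adapter, split: str = "train"):
        # Flatten (task, trajectory) pairs into per-step training examples.
        examples = []
        for task, trajectory in adapter.load_trajectories(split=split):
            for obs, action in trajectory:
                examples.append((task.instruction, obs, action))
        return examples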