openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/__init__.py
@@ -54,6 +54,7 @@ from openadapt_ml.benchmarks.agent import (
     PolicyAgent,
     RandomAgent,
     ScriptedAgent,
+    SmartMockAgent,
 )
 from openadapt_ml.benchmarks.base import (
     BenchmarkAction,
@@ -71,6 +72,8 @@ from openadapt_ml.benchmarks.runner import (
     evaluate_agent_on_benchmark,
 )
 from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
+from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
+from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
 
 # Azure orchestration (lazy import to avoid requiring azure-ai-ml)
 def _get_azure_classes():
@@ -97,6 +100,7 @@ __all__ = [
     "APIBenchmarkAgent",
     "ScriptedAgent",
     "RandomAgent",
+    "SmartMockAgent",
     # Evaluation
     "EvaluationConfig",
     "evaluate_agent_on_benchmark",
@@ -106,6 +110,10 @@ __all__ = [
     "WAAAdapter",
     "WAAConfig",
     "WAAMockAdapter",
+    "WAALiveAdapter",
+    "WAALiveConfig",
+    # Viewer
+    "generate_benchmark_viewer",
     # Azure (lazy-loaded)
     "AzureConfig",
     "AzureWAAOrchestrator",
openadapt_ml/benchmarks/agent.py
@@ -36,7 +36,7 @@ from openadapt_ml.benchmarks.base import (
 if TYPE_CHECKING:
     from openadapt_ml.models.api_adapter import ApiVLMAdapter
     from openadapt_ml.runtime.policy import AgentPolicy
-    from openadapt_ml.schemas.sessions import Action
+    from openadapt_ml.schema import Action, ActionType
 
 
 class BenchmarkAgent(ABC):
@@ -259,22 +259,51 @@ class PolicyAgent(BenchmarkAgent):
         Returns:
             BenchmarkAction.
         """
+        # Extract normalized coordinates
+        x, y = None, None
+        if action.normalized_coordinates is not None:
+            x, y = action.normalized_coordinates
+
+        # Extract end coordinates for drag
+        end_x, end_y = None, None
+        if action.normalized_end is not None:
+            end_x, end_y = action.normalized_end
+
+        # Extract action type value (enum -> string)
+        action_type = action.type.value if hasattr(action.type, 'value') else action.type
+
+        # Extract element info if available
+        target_node_id = None
+        target_role = None
+        target_name = None
+        target_bbox = None
+        if action.element is not None:
+            target_node_id = action.element.element_id
+            target_role = action.element.role
+            target_name = action.element.name
+            if action.element.bounds is not None:
+                target_bbox = (
+                    action.element.bounds.x,
+                    action.element.bounds.y,
+                    action.element.bounds.x + action.element.bounds.width,
+                    action.element.bounds.y + action.element.bounds.height,
+                )
+
         return BenchmarkAction(
-            type=action.type,
-            x=action.x,
-            y=action.y,
+            type=action_type,
+            x=x,
+            y=y,
             text=action.text,
-            target_bbox=action.bbox,
-            # Map additional fields if present
-            target_node_id=getattr(action, "target_node_id", None),
-            target_role=getattr(action, "target_role", None),
-            target_name=getattr(action, "target_name", None),
+            target_bbox=target_bbox,
+            target_node_id=target_node_id,
+            target_role=target_role,
+            target_name=target_name,
             key=getattr(action, "key", None),
             modifiers=getattr(action, "modifiers", None),
             scroll_direction=getattr(action, "scroll_direction", None),
             scroll_amount=getattr(action, "scroll_amount", None),
-            end_x=getattr(action, "end_x", None),
-            end_y=getattr(action, "end_y", None),
+            end_x=end_x,
+            end_y=end_y,
             answer=getattr(action, "answer", None),
             raw_action={"thought": thought} if thought else None,
         )
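The element mapping above converts a bounds rectangle stored as top-left corner plus size into the `(x1, y1, x2, y2)` corner form that `target_bbox` expects. The same arithmetic in isolation (the free-standing function is illustrative, not part of the package):

```python
def bounds_to_bbox(x: float, y: float, width: float, height: float) -> tuple[float, float, float, float]:
    """Convert a (x, y, width, height) rectangle to corner form (x1, y1, x2, y2)."""
    return (x, y, x + width, y + height)

# A 100x50 element whose top-left corner sits at (10, 20):
assert bounds_to_bbox(10, 20, 100, 50) == (10, 20, 110, 70)
```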
@@ -391,6 +420,56 @@ class RandomAgent(BenchmarkAgent):
         pass
 
 
+class SmartMockAgent(BenchmarkAgent):
+    """Agent designed to pass WAAMockAdapter evaluation.
+
+    Performs a fixed sequence of actions that satisfy the mock adapter's
+    success criteria. Use for validating the benchmark pipeline locally.
+
+    The mock adapter evaluates success based on:
+    - Clicking Submit (ID 4) - primary success path
+    - Typing something AND clicking OK (ID 1) - form submission path
+    - Calling DONE after at least 2 actions - reasonable completion
+
+    This agent clicks Submit (ID 4) which is the simplest success path.
+    """
+
+    def __init__(self):
+        """Initialize the agent."""
+        self._step = 0
+        # Simple action sequence: click Submit button (ID 4), then done
+        self._actions = [
+            BenchmarkAction(type="click", target_node_id="4"),  # Click Submit
+            BenchmarkAction(type="done"),
+        ]
+
+    def act(
+        self,
+        observation: BenchmarkObservation,
+        task: BenchmarkTask,
+        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
+    ) -> BenchmarkAction:
+        """Return the next scripted action.
+
+        Args:
+            observation: Ignored.
+            task: Ignored.
+            history: Ignored.
+
+        Returns:
+            Next action from script, or DONE if script exhausted.
+        """
+        if self._step < len(self._actions):
+            action = self._actions[self._step]
+            self._step += 1
+            return action
+        return BenchmarkAction(type="done")
+
+    def reset(self) -> None:
+        """Reset step counter."""
+        self._step = 0
+
+
 class APIBenchmarkAgent(BenchmarkAgent):
     """Agent that uses hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation.
 
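Because `act()` ignores its inputs and simply replays the two-step script, the agent's behavior is easy to trace. A hedged dry run (the `observation` and `task` objects are placeholders whose construction is not shown in this diff):

```python
agent = SmartMockAgent()

a1 = agent.act(observation, task)  # BenchmarkAction(type="click", target_node_id="4")
a2 = agent.act(observation, task)  # BenchmarkAction(type="done")
a3 = agent.act(observation, task)  # script exhausted -> another "done"

agent.reset()  # step counter back to 0; the script replays from the start
```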
openadapt_ml/benchmarks/azure.py
@@ -355,6 +355,7 @@ class AzureMLClient:
         command: str,
         environment_variables: dict[str, str] | None = None,
         display_name: str | None = None,
+        timeout_hours: float = 4.0,
     ) -> str:
         """Submit a job to a compute instance.
 
@@ -363,6 +364,8 @@
             command: Command to run.
             environment_variables: Environment variables.
             display_name: Job display name.
+            timeout_hours: Maximum job duration in hours (default: 4). The job
+                will be automatically canceled after this duration.
 
         Returns:
             Job name/ID.
@@ -376,16 +379,27 @@
             name="waa-agent-env",
         )
 
+        import time
+        import uuid
+        timestamp = int(time.time())
+        unique_id = str(uuid.uuid4())[:8]
+        job_name = f"waa-{compute_name}-{timestamp}-{unique_id}"
+
+        # Convert hours to seconds for Azure ML timeout
+        timeout_seconds = int(timeout_hours * 3600)
+
         job = ml_command(
             command=command,
             environment=env,
             compute=compute_name,
+            name=job_name,  # Unique job name for Azure ML
             display_name=display_name or f"waa-job-{compute_name}",
             environment_variables=environment_variables or {},
+            limits={"timeout": timeout_seconds},
         )
 
         submitted = self.client.jobs.create_or_update(job)
-        logger.info(f"Job submitted: {submitted.name}")
+        logger.info(f"Job submitted: {submitted.name} (timeout: {timeout_hours}h)")
         return submitted.name
 
     def wait_for_job(self, job_name: str, timeout_seconds: int = 3600) -> dict:
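The job-name scheme and the hour-to-second conversion introduced here are easy to verify in isolation (the compute name below is hypothetical):

```python
import time
import uuid

compute_name = "waa-worker-0"  # hypothetical compute instance name
timeout_hours = 4.0

timestamp = int(time.time())
unique_id = str(uuid.uuid4())[:8]
job_name = f"waa-{compute_name}-{timestamp}-{unique_id}"
timeout_seconds = int(timeout_hours * 3600)

print(job_name)         # e.g. waa-waa-worker-0-1735689600-1a2b3c4d
print(timeout_seconds)  # 14400
```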
@@ -458,6 +472,7 @@ class AzureWAAOrchestrator:
         max_steps_per_task: int = 15,
         on_worker_complete: Callable[[WorkerState], None] | None = None,
         cleanup_on_complete: bool = True,
+        timeout_hours: float = 4.0,
     ) -> list[BenchmarkResult]:
         """Run evaluation across multiple Azure VMs.
 
@@ -468,6 +483,8 @@
             max_steps_per_task: Maximum steps per task.
             on_worker_complete: Callback when a worker finishes.
             cleanup_on_complete: Whether to delete VMs after completion.
+            timeout_hours: Maximum job duration in hours (default: 4). Jobs are
+                auto-canceled after this duration to prevent runaway costs.
 
         Returns:
             List of BenchmarkResult for all tasks.
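From the caller's side, the new parameter just threads through the orchestrator entry point. A sketch under assumptions (the method name `run` and the construction of `orchestrator`, `tasks`, and `agent` are not shown in this diff; only the keyword parameters appear above):

```python
# Everything except the keyword names documented above is assumed here.
results = orchestrator.run(
    tasks,
    agent,
    max_steps_per_task=15,
    cleanup_on_complete=True,
    timeout_hours=2.0,  # cancel each worker job after 2 hours
)
```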
@@ -519,7 +536,7 @@
 
         # Submit jobs to workers
         print(f"[3/4] Submitting evaluation jobs...")
-        self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task)
+        self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task, timeout_hours)
         print(f" Jobs submitted")
 
         # Wait for completion and collect results
@@ -577,8 +594,17 @@
         task_batches: list[list[BenchmarkTask]],
         agent: BenchmarkAgent,
         max_steps: int,
+        timeout_hours: float = 4.0,
     ) -> None:
-        """Submit evaluation jobs to workers."""
+        """Submit evaluation jobs to workers.
+
+        Args:
+            workers: List of worker states.
+            task_batches: Task batches for each worker.
+            agent: Agent to run.
+            max_steps: Maximum steps per task.
+            timeout_hours: Maximum job duration in hours.
+        """
         for worker, tasks in zip(workers, task_batches):
             if worker.status == "failed":
                 continue
@@ -591,7 +617,7 @@
             # Build command
             command = self._build_worker_command(task_ids_json, max_steps, agent)
 
-            # Submit job
+            # Submit job with timeout
             self.ml_client.submit_job(
                 compute_name=worker.compute_name,
                 command=command,
@@ -600,6 +626,7 @@
                     "WAA_MAX_STEPS": str(max_steps),
                 },
                 display_name=f"waa-worker-{worker.worker_id}",
+                timeout_hours=timeout_hours,
             )
             worker.status = "running"
             worker.start_time = time.time()
@@ -625,9 +652,11 @@
         # TODO: Serialize agent config and pass to remote worker
         # For now, workers use a default agent configuration
         _ = agent  # Reserved for agent serialization
+        # WAA Docker image has client at /client (see Dockerfile-WinArena)
+        # The run.py script is at /client/run.py (not a module, so use python run.py)
         return f"""
-cd /workspace/WindowsAgentArena && \
-python -m client.run \
+cd /client && \
+python run.py \
     --task_ids '{task_ids_json}' \
     --max_steps {max_steps} \
     --output_dir /outputs
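Since backslash-newline inside a non-raw Python string is a line continuation, the f-string above renders as a single shell command. A small sketch with illustrative values (the task IDs are made up):

```python
task_ids_json = '["notepad_1", "browser_2"]'  # illustrative task IDs
max_steps = 15

command = (
    f"cd /client && python run.py "
    f"--task_ids '{task_ids_json}' "
    f"--max_steps {max_steps} "
    f"--output_dir /outputs"
)
print(command)
# cd /client && python run.py --task_ids '["notepad_1", "browser_2"]' --max_steps 15 --output_dir /outputs
```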