openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/__init__.py +8 -0
- openadapt_ml/benchmarks/agent.py +90 -11
- openadapt_ml/benchmarks/azure.py +35 -6
- openadapt_ml/benchmarks/cli.py +4449 -201
- openadapt_ml/benchmarks/live_tracker.py +180 -0
- openadapt_ml/benchmarks/runner.py +41 -4
- openadapt_ml/benchmarks/viewer.py +1219 -0
- openadapt_ml/benchmarks/vm_monitor.py +610 -0
- openadapt_ml/benchmarks/waa.py +61 -4
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/benchmarks/waa_live.py +619 -0
- openadapt_ml/cloud/local.py +1555 -1
- openadapt_ml/cloud/ssh_tunnel.py +553 -0
- openadapt_ml/datasets/next_action.py +87 -68
- openadapt_ml/evals/grounding.py +26 -8
- openadapt_ml/evals/trajectory_matching.py +84 -36
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +717 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +265 -0
- openadapt_ml/ingest/__init__.py +3 -4
- openadapt_ml/ingest/capture.py +89 -81
- openadapt_ml/ingest/loader.py +116 -68
- openadapt_ml/ingest/synthetic.py +221 -159
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +817 -0
- openadapt_ml/retrieval/embeddings.py +629 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +160 -0
- openadapt_ml/runtime/policy.py +10 -10
- openadapt_ml/schema/__init__.py +104 -0
- openadapt_ml/schema/converters.py +541 -0
- openadapt_ml/schema/episode.py +457 -0
- openadapt_ml/scripts/compare.py +26 -16
- openadapt_ml/scripts/eval_policy.py +4 -5
- openadapt_ml/scripts/prepare_synthetic.py +14 -17
- openadapt_ml/scripts/train.py +81 -70
- openadapt_ml/training/benchmark_viewer.py +3225 -0
- openadapt_ml/training/trainer.py +120 -363
- openadapt_ml/training/trl_trainer.py +354 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
- openadapt_ml-0.2.0.dist-info/RECORD +86 -0
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -54,6 +54,7 @@ from openadapt_ml.benchmarks.agent import (
|
|
|
54
54
|
PolicyAgent,
|
|
55
55
|
RandomAgent,
|
|
56
56
|
ScriptedAgent,
|
|
57
|
+
SmartMockAgent,
|
|
57
58
|
)
|
|
58
59
|
from openadapt_ml.benchmarks.base import (
|
|
59
60
|
BenchmarkAction,
|
|
@@ -71,6 +72,8 @@ from openadapt_ml.benchmarks.runner import (
|
|
|
71
72
|
evaluate_agent_on_benchmark,
|
|
72
73
|
)
|
|
73
74
|
from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
|
|
75
|
+
from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
|
|
76
|
+
from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
|
|
74
77
|
|
|
75
78
|
# Azure orchestration (lazy import to avoid requiring azure-ai-ml)
|
|
76
79
|
def _get_azure_classes():
|
|
@@ -97,6 +100,7 @@ __all__ = [
|
|
|
97
100
|
"APIBenchmarkAgent",
|
|
98
101
|
"ScriptedAgent",
|
|
99
102
|
"RandomAgent",
|
|
103
|
+
"SmartMockAgent",
|
|
100
104
|
# Evaluation
|
|
101
105
|
"EvaluationConfig",
|
|
102
106
|
"evaluate_agent_on_benchmark",
|
|
@@ -106,6 +110,10 @@ __all__ = [
|
|
|
106
110
|
"WAAAdapter",
|
|
107
111
|
"WAAConfig",
|
|
108
112
|
"WAAMockAdapter",
|
|
113
|
+
"WAALiveAdapter",
|
|
114
|
+
"WAALiveConfig",
|
|
115
|
+
# Viewer
|
|
116
|
+
"generate_benchmark_viewer",
|
|
109
117
|
# Azure (lazy-loaded)
|
|
110
118
|
"AzureConfig",
|
|
111
119
|
"AzureWAAOrchestrator",
|
openadapt_ml/benchmarks/agent.py
CHANGED
|
@@ -36,7 +36,7 @@ from openadapt_ml.benchmarks.base import (
|
|
|
36
36
|
if TYPE_CHECKING:
|
|
37
37
|
from openadapt_ml.models.api_adapter import ApiVLMAdapter
|
|
38
38
|
from openadapt_ml.runtime.policy import AgentPolicy
|
|
39
|
-
from openadapt_ml.
|
|
39
|
+
from openadapt_ml.schema import Action, ActionType
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
class BenchmarkAgent(ABC):
|
|
@@ -259,22 +259,51 @@ class PolicyAgent(BenchmarkAgent):
|
|
|
259
259
|
Returns:
|
|
260
260
|
BenchmarkAction.
|
|
261
261
|
"""
|
|
262
|
+
# Extract normalized coordinates
|
|
263
|
+
x, y = None, None
|
|
264
|
+
if action.normalized_coordinates is not None:
|
|
265
|
+
x, y = action.normalized_coordinates
|
|
266
|
+
|
|
267
|
+
# Extract end coordinates for drag
|
|
268
|
+
end_x, end_y = None, None
|
|
269
|
+
if action.normalized_end is not None:
|
|
270
|
+
end_x, end_y = action.normalized_end
|
|
271
|
+
|
|
272
|
+
# Extract action type value (enum -> string)
|
|
273
|
+
action_type = action.type.value if hasattr(action.type, 'value') else action.type
|
|
274
|
+
|
|
275
|
+
# Extract element info if available
|
|
276
|
+
target_node_id = None
|
|
277
|
+
target_role = None
|
|
278
|
+
target_name = None
|
|
279
|
+
target_bbox = None
|
|
280
|
+
if action.element is not None:
|
|
281
|
+
target_node_id = action.element.element_id
|
|
282
|
+
target_role = action.element.role
|
|
283
|
+
target_name = action.element.name
|
|
284
|
+
if action.element.bounds is not None:
|
|
285
|
+
target_bbox = (
|
|
286
|
+
action.element.bounds.x,
|
|
287
|
+
action.element.bounds.y,
|
|
288
|
+
action.element.bounds.x + action.element.bounds.width,
|
|
289
|
+
action.element.bounds.y + action.element.bounds.height,
|
|
290
|
+
)
|
|
291
|
+
|
|
262
292
|
return BenchmarkAction(
|
|
263
|
-
type=
|
|
264
|
-
x=
|
|
265
|
-
y=
|
|
293
|
+
type=action_type,
|
|
294
|
+
x=x,
|
|
295
|
+
y=y,
|
|
266
296
|
text=action.text,
|
|
267
|
-
target_bbox=
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
target_name=getattr(action, "target_name", None),
|
|
297
|
+
target_bbox=target_bbox,
|
|
298
|
+
target_node_id=target_node_id,
|
|
299
|
+
target_role=target_role,
|
|
300
|
+
target_name=target_name,
|
|
272
301
|
key=getattr(action, "key", None),
|
|
273
302
|
modifiers=getattr(action, "modifiers", None),
|
|
274
303
|
scroll_direction=getattr(action, "scroll_direction", None),
|
|
275
304
|
scroll_amount=getattr(action, "scroll_amount", None),
|
|
276
|
-
end_x=
|
|
277
|
-
end_y=
|
|
305
|
+
end_x=end_x,
|
|
306
|
+
end_y=end_y,
|
|
278
307
|
answer=getattr(action, "answer", None),
|
|
279
308
|
raw_action={"thought": thought} if thought else None,
|
|
280
309
|
)
|
|
@@ -391,6 +420,56 @@ class RandomAgent(BenchmarkAgent):
|
|
|
391
420
|
pass
|
|
392
421
|
|
|
393
422
|
|
|
423
|
+
class SmartMockAgent(BenchmarkAgent):
    """Scripted agent intended to pass WAAMockAdapter evaluation.

    Replays a fixed list of actions so the benchmark pipeline can be
    validated locally without a real model.

    According to the original notes for this class, the mock adapter
    reports success when any of the following happens:
    - Submit (ID 4) is clicked - primary success path
    - Something is typed AND OK (ID 1) is clicked - form submission path
    - DONE is issued after at least 2 actions - reasonable completion

    This agent takes the first route: click Submit (ID 4), then finish.
    """

    def __init__(self):
        """Build the action script and position the cursor at its start."""
        # Script: click the Submit button (ID 4), then signal completion.
        self._actions = [
            BenchmarkAction(type="click", target_node_id="4"),  # Click Submit
            BenchmarkAction(type="done"),
        ]
        # Index of the next scripted action to hand out.
        self._step = 0

    def act(
        self,
        observation: BenchmarkObservation,
        task: BenchmarkTask,
        history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
    ) -> BenchmarkAction:
        """Hand out the next action in the script.

        Args:
            observation: Unused.
            task: Unused.
            history: Unused.

        Returns:
            The scripted action at the current position; once the script
            has been fully consumed, a fresh "done" action on every call.
        """
        # Guard clause: nothing left in the script, keep answering "done".
        if self._step >= len(self._actions):
            return BenchmarkAction(type="done")

        position = self._step
        self._step = position + 1
        return self._actions[position]

    def reset(self) -> None:
        """Rewind the script so the next call to act() starts from the top."""
        self._step = 0
|
|
471
|
+
|
|
472
|
+
|
|
394
473
|
class APIBenchmarkAgent(BenchmarkAgent):
|
|
395
474
|
"""Agent that uses hosted VLM APIs (Claude, GPT-5.1) for benchmark evaluation.
|
|
396
475
|
|
openadapt_ml/benchmarks/azure.py
CHANGED
|
@@ -355,6 +355,7 @@ class AzureMLClient:
|
|
|
355
355
|
command: str,
|
|
356
356
|
environment_variables: dict[str, str] | None = None,
|
|
357
357
|
display_name: str | None = None,
|
|
358
|
+
timeout_hours: float = 4.0,
|
|
358
359
|
) -> str:
|
|
359
360
|
"""Submit a job to a compute instance.
|
|
360
361
|
|
|
@@ -363,6 +364,8 @@ class AzureMLClient:
|
|
|
363
364
|
command: Command to run.
|
|
364
365
|
environment_variables: Environment variables.
|
|
365
366
|
display_name: Job display name.
|
|
367
|
+
timeout_hours: Maximum job duration in hours (default: 4). The job
|
|
368
|
+
will be automatically canceled after this duration.
|
|
366
369
|
|
|
367
370
|
Returns:
|
|
368
371
|
Job name/ID.
|
|
@@ -376,16 +379,27 @@ class AzureMLClient:
|
|
|
376
379
|
name="waa-agent-env",
|
|
377
380
|
)
|
|
378
381
|
|
|
382
|
+
import time
|
|
383
|
+
import uuid
|
|
384
|
+
timestamp = int(time.time())
|
|
385
|
+
unique_id = str(uuid.uuid4())[:8]
|
|
386
|
+
job_name = f"waa-{compute_name}-{timestamp}-{unique_id}"
|
|
387
|
+
|
|
388
|
+
# Convert hours to seconds for Azure ML timeout
|
|
389
|
+
timeout_seconds = int(timeout_hours * 3600)
|
|
390
|
+
|
|
379
391
|
job = ml_command(
|
|
380
392
|
command=command,
|
|
381
393
|
environment=env,
|
|
382
394
|
compute=compute_name,
|
|
395
|
+
name=job_name, # Unique job name for Azure ML
|
|
383
396
|
display_name=display_name or f"waa-job-{compute_name}",
|
|
384
397
|
environment_variables=environment_variables or {},
|
|
398
|
+
limits={"timeout": timeout_seconds},
|
|
385
399
|
)
|
|
386
400
|
|
|
387
401
|
submitted = self.client.jobs.create_or_update(job)
|
|
388
|
-
logger.info(f"Job submitted: {submitted.name}")
|
|
402
|
+
logger.info(f"Job submitted: {submitted.name} (timeout: {timeout_hours}h)")
|
|
389
403
|
return submitted.name
|
|
390
404
|
|
|
391
405
|
def wait_for_job(self, job_name: str, timeout_seconds: int = 3600) -> dict:
|
|
@@ -458,6 +472,7 @@ class AzureWAAOrchestrator:
|
|
|
458
472
|
max_steps_per_task: int = 15,
|
|
459
473
|
on_worker_complete: Callable[[WorkerState], None] | None = None,
|
|
460
474
|
cleanup_on_complete: bool = True,
|
|
475
|
+
timeout_hours: float = 4.0,
|
|
461
476
|
) -> list[BenchmarkResult]:
|
|
462
477
|
"""Run evaluation across multiple Azure VMs.
|
|
463
478
|
|
|
@@ -468,6 +483,8 @@ class AzureWAAOrchestrator:
|
|
|
468
483
|
max_steps_per_task: Maximum steps per task.
|
|
469
484
|
on_worker_complete: Callback when a worker finishes.
|
|
470
485
|
cleanup_on_complete: Whether to delete VMs after completion.
|
|
486
|
+
timeout_hours: Maximum job duration in hours (default: 4). Jobs are
|
|
487
|
+
auto-canceled after this duration to prevent runaway costs.
|
|
471
488
|
|
|
472
489
|
Returns:
|
|
473
490
|
List of BenchmarkResult for all tasks.
|
|
@@ -519,7 +536,7 @@ class AzureWAAOrchestrator:
|
|
|
519
536
|
|
|
520
537
|
# Submit jobs to workers
|
|
521
538
|
print(f"[3/4] Submitting evaluation jobs...")
|
|
522
|
-
self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task)
|
|
539
|
+
self._submit_worker_jobs(workers, task_batches, agent, max_steps_per_task, timeout_hours)
|
|
523
540
|
print(f" Jobs submitted")
|
|
524
541
|
|
|
525
542
|
# Wait for completion and collect results
|
|
@@ -577,8 +594,17 @@ class AzureWAAOrchestrator:
|
|
|
577
594
|
task_batches: list[list[BenchmarkTask]],
|
|
578
595
|
agent: BenchmarkAgent,
|
|
579
596
|
max_steps: int,
|
|
597
|
+
timeout_hours: float = 4.0,
|
|
580
598
|
) -> None:
|
|
581
|
-
"""Submit evaluation jobs to workers.
|
|
599
|
+
"""Submit evaluation jobs to workers.
|
|
600
|
+
|
|
601
|
+
Args:
|
|
602
|
+
workers: List of worker states.
|
|
603
|
+
task_batches: Task batches for each worker.
|
|
604
|
+
agent: Agent to run.
|
|
605
|
+
max_steps: Maximum steps per task.
|
|
606
|
+
timeout_hours: Maximum job duration in hours.
|
|
607
|
+
"""
|
|
582
608
|
for worker, tasks in zip(workers, task_batches):
|
|
583
609
|
if worker.status == "failed":
|
|
584
610
|
continue
|
|
@@ -591,7 +617,7 @@ class AzureWAAOrchestrator:
|
|
|
591
617
|
# Build command
|
|
592
618
|
command = self._build_worker_command(task_ids_json, max_steps, agent)
|
|
593
619
|
|
|
594
|
-
# Submit job
|
|
620
|
+
# Submit job with timeout
|
|
595
621
|
self.ml_client.submit_job(
|
|
596
622
|
compute_name=worker.compute_name,
|
|
597
623
|
command=command,
|
|
@@ -600,6 +626,7 @@ class AzureWAAOrchestrator:
|
|
|
600
626
|
"WAA_MAX_STEPS": str(max_steps),
|
|
601
627
|
},
|
|
602
628
|
display_name=f"waa-worker-{worker.worker_id}",
|
|
629
|
+
timeout_hours=timeout_hours,
|
|
603
630
|
)
|
|
604
631
|
worker.status = "running"
|
|
605
632
|
worker.start_time = time.time()
|
|
@@ -625,9 +652,11 @@ class AzureWAAOrchestrator:
|
|
|
625
652
|
# TODO: Serialize agent config and pass to remote worker
|
|
626
653
|
# For now, workers use a default agent configuration
|
|
627
654
|
_ = agent # Reserved for agent serialization
|
|
655
|
+
# WAA Docker image has client at /client (see Dockerfile-WinArena)
|
|
656
|
+
# The run.py script is at /client/run.py (not a module, so use python run.py)
|
|
628
657
|
return f"""
|
|
629
|
-
cd /
|
|
630
|
-
python
|
|
658
|
+
cd /client && \
|
|
659
|
+
python run.py \
|
|
631
660
|
--task_ids '{task_ids_json}' \
|
|
632
661
|
--max_steps {max_steps} \
|
|
633
662
|
--output_dir /outputs
|