openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,133 +1,31 @@
1
1
  """Benchmark integration for openadapt-ml.
2
2
 
3
- This module provides interfaces and utilities for evaluating GUI agents
4
- on standardized benchmarks like Windows Agent Arena (WAA), OSWorld,
5
- WebArena, and others.
3
+ This module provides ML-specific agents for benchmark evaluation.
4
+ These agents wrap openadapt-ml internals (trained policies, API adapters).
6
5
 
7
- Core classes:
8
- - BenchmarkAdapter: Abstract interface for benchmark integration
9
- - BenchmarkAgent: Abstract interface for agents to be evaluated
10
- - BenchmarkTask, BenchmarkObservation, BenchmarkAction: Data classes
11
-
12
- Agent implementations:
13
- - PolicyAgent: Wraps openadapt-ml AgentPolicy
14
- - APIBenchmarkAgent: Uses hosted VLM APIs (Claude, GPT-5.1)
15
- - ScriptedAgent: Follows predefined action sequence
16
- - RandomAgent: Takes random actions (baseline)
17
-
18
- Evaluation:
19
- - evaluate_agent_on_benchmark: Run agent on benchmark tasks
20
- - compute_metrics: Compute aggregate metrics from results
21
-
22
- Example:
6
+ For benchmark infrastructure (adapters, runners, viewers), use openadapt-evals:
23
7
  ```python
24
- from openadapt_ml.benchmarks import (
25
- BenchmarkAdapter,
26
- PolicyAgent,
27
- APIBenchmarkAgent,
8
+ from openadapt_evals import (
9
+ WAAMockAdapter,
10
+ WAALiveAdapter,
28
11
  evaluate_agent_on_benchmark,
29
- compute_metrics,
30
12
  )
31
-
32
- # Create adapter for specific benchmark
33
- adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
34
-
35
- # Wrap policy as benchmark agent
36
- agent = PolicyAgent(policy)
37
-
38
- # Or use API-backed agent for baselines
39
- agent = APIBenchmarkAgent(provider="anthropic") # Claude
40
- agent = APIBenchmarkAgent(provider="openai") # GPT-5.1
41
-
42
- # Run evaluation
43
- results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)
44
-
45
- # Compute metrics
46
- metrics = compute_metrics(results)
47
- print(f"Success rate: {metrics['success_rate']:.1%}")
48
13
  ```
14
+
15
+ ML-specific agents (only available in openadapt-ml):
16
+ - PolicyAgent: Wraps openadapt_ml.runtime.policy.AgentPolicy
17
+ - APIBenchmarkAgent: Uses openadapt_ml.models.api_adapter.ApiVLMAdapter
18
+ - UnifiedBaselineAgent: Uses openadapt_ml.baselines adapters
49
19
  """
50
20
 
51
21
  from openadapt_ml.benchmarks.agent import (
52
22
  APIBenchmarkAgent,
53
- BenchmarkAgent,
54
23
  PolicyAgent,
55
- RandomAgent,
56
- ScriptedAgent,
57
- SmartMockAgent,
58
- )
59
- from openadapt_ml.benchmarks.base import (
60
- BenchmarkAction,
61
- BenchmarkAdapter,
62
- BenchmarkObservation,
63
- BenchmarkResult,
64
- BenchmarkTask,
65
- StaticDatasetAdapter,
66
- UIElement,
24
+ UnifiedBaselineAgent,
67
25
  )
68
- from openadapt_ml.benchmarks.runner import (
69
- EvaluationConfig,
70
- compute_domain_metrics,
71
- compute_metrics,
72
- evaluate_agent_on_benchmark,
73
- )
74
- from openadapt_ml.benchmarks.waa import WAAAdapter, WAAConfig, WAAMockAdapter
75
- from openadapt_ml.benchmarks.waa_live import WAALiveAdapter, WAALiveConfig
76
- from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
77
-
78
- # Azure orchestration (lazy import to avoid requiring azure-ai-ml)
79
- def _get_azure_classes():
80
- from openadapt_ml.benchmarks.azure import (
81
- AzureConfig,
82
- AzureWAAOrchestrator,
83
- estimate_cost,
84
- )
85
- return AzureConfig, AzureWAAOrchestrator, estimate_cost
86
-
87
26
 
88
27
  __all__ = [
89
- # Base classes
90
- "BenchmarkAdapter",
91
- "BenchmarkTask",
92
- "BenchmarkObservation",
93
- "BenchmarkAction",
94
- "BenchmarkResult",
95
- "StaticDatasetAdapter",
96
- "UIElement",
97
- # Agents
98
- "BenchmarkAgent",
99
28
  "PolicyAgent",
100
29
  "APIBenchmarkAgent",
101
- "ScriptedAgent",
102
- "RandomAgent",
103
- "SmartMockAgent",
104
- # Evaluation
105
- "EvaluationConfig",
106
- "evaluate_agent_on_benchmark",
107
- "compute_metrics",
108
- "compute_domain_metrics",
109
- # WAA
110
- "WAAAdapter",
111
- "WAAConfig",
112
- "WAAMockAdapter",
113
- "WAALiveAdapter",
114
- "WAALiveConfig",
115
- # Viewer
116
- "generate_benchmark_viewer",
117
- # Azure (lazy-loaded)
118
- "AzureConfig",
119
- "AzureWAAOrchestrator",
120
- "estimate_cost",
30
+ "UnifiedBaselineAgent",
121
31
  ]
122
-
123
-
124
- # Lazy loading for Azure classes (avoids requiring azure-ai-ml for basic usage)
125
- def __getattr__(name: str):
126
- if name in ("AzureConfig", "AzureWAAOrchestrator", "estimate_cost"):
127
- from openadapt_ml.benchmarks.azure import (
128
- AzureConfig,
129
- AzureWAAOrchestrator,
130
- estimate_cost,
131
- )
132
- return {"AzureConfig": AzureConfig, "AzureWAAOrchestrator": AzureWAAOrchestrator, "estimate_cost": estimate_cost}[name]
133
- raise AttributeError(f"module {__name__!r} has no attribute {name!r}")