openadapt-ml 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332)
  1. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/CHANGELOG.md +38 -0
  2. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/PKG-INFO +141 -4
  3. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/README.md +140 -3
  4. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/cli.py +37 -42
  5. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/pyproject.toml +1 -1
  6. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/.env.example +0 -0
  7. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/.github/workflows/release.yml +0 -0
  8. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/.github/workflows/test.yml +0 -0
  9. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/.gitignore +0 -0
  10. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/.gitmodules +0 -0
  11. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/.python-version +0 -0
  12. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/CLAUDE.md +0 -0
  13. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/LICENSE +0 -0
  14. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/RETRIEVAL_QUICKSTART.md +0 -0
  15. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen2_5vl_synthetic.yaml +0 -0
  16. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_capture.yaml +0 -0
  17. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_capture_4bit.yaml +0 -0
  18. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_capture_batched.yaml +0 -0
  19. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_synthetic.yaml +0 -0
  20. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_synthetic_coord_v2.yaml +0 -0
  21. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_synthetic_dev.yaml +0 -0
  22. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_synthetic_registration_som.yaml +0 -0
  23. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/configs/qwen3vl_synthetic_som.yaml +0 -0
  24. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/Dockerfile.simple +0 -0
  25. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/README.md +0 -0
  26. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/WAA_ACR_DESIGN.md +0 -0
  27. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/WAA_APPROACH_REVIEW.md +0 -0
  28. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/WAA_EVAL_ATTEMPTS.md +0 -0
  29. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/WAA_RELIABILITY_ANALYSIS.md +0 -0
  30. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/WINDOWS_PRODUCT_KEY_RCA.md +0 -0
  31. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/azure_waa_setup.md +0 -0
  32. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/docs/waa_setup.md +0 -0
  33. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/tmp_dockerfile_winarena.txt +0 -0
  34. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/waa_deploy/Dockerfile +0 -0
  35. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/waa_deploy/Dockerfile.backup +0 -0
  36. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/waa_deploy/Dockerfile.simplified +0 -0
  37. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/waa_deploy/__init__.py +0 -0
  38. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/waa_deploy/api_agent.py +0 -0
  39. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/deprecated/waa_deploy/start_waa_server.bat +0 -0
  40. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/ARCHITECTURE_DECISIONS.md +0 -0
  41. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/AZURE_DASHBOARD_SPEC.md +0 -0
  42. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/AZURE_ML_COST_TRACKING.md +0 -0
  43. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/AZURE_ML_LIVE_LOGGING.md +0 -0
  44. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/CLEANUP_NOTES.md +0 -0
  45. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/GEMINI_GROUNDING_QUICKSTART.md +0 -0
  46. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/IMPLEMENTATION_SUMMARY_GEMINI_GROUNDING.md +0 -0
  47. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/NEXT_STEPS_GROUNDING_ARCHITECTURE.md +0 -0
  48. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/PRIORITY_2_COMPLETION_SUMMARY.md +0 -0
  49. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/PRIVACY_IMPLEMENTATION_PLAN.md +0 -0
  50. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/RECORD_IMPLEMENTATION_PLAN.md +0 -0
  51. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/REPOSITORY_HISTORY.md +0 -0
  52. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/REPO_CONSOLIDATION_PLAN.md +0 -0
  53. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/SEGMENTATION_TEST_PLAN.md +0 -0
  54. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/SEGMENTATION_TEST_RESULTS.md +0 -0
  55. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/VM_MONITOR_SCREENSHOT_IMPLEMENTATION.md +0 -0
  56. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/WAA_PARALLELIZATION_DESIGN.md +0 -0
  57. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/WAA_UNATTENDED_SCALABLE.md +0 -0
  58. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/analysis_jan2026.md +0 -0
  59. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/architecture_diagram.md +0 -0
  60. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/auto_shutoff_design.md +0 -0
  61. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/background_task_visibility.md +0 -0
  62. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/batching_and_schedulers.md +0 -0
  63. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/benchmark_integration_plan.md +0 -0
  64. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/benchmark_next_steps.md +0 -0
  65. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/benchmark_run_ui_design.md +0 -0
  66. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/benchmark_viewer_integration.md +0 -0
  67. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/benchmark_viewer_phase2.md +0 -0
  68. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/benchmark_viewer_ux_improvements.md +0 -0
  69. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/capture_format_decision.md +0 -0
  70. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/chrome_extension_design.md +0 -0
  71. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/cloud_gpu_training.md +0 -0
  72. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/current_state_dec2024.md +0 -0
  73. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/dashboard_architecture.md +0 -0
  74. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/demo_prompt_experiment.md +0 -0
  75. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/demo_retrieval_design.md +0 -0
  76. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/design.md +0 -0
  77. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/desktop_app_plan.md +0 -0
  78. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/early_termination.md +0 -0
  79. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/enterprise/COORDS_VS_MARKS_ABLATION.md +0 -0
  80. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/enterprise/README.md +0 -0
  81. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/enterprise_integration.md +0 -0
  82. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/eval_json_schema.md +0 -0
  83. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/experiments/demo_conditioned_prompting_results.md +0 -0
  84. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/experiments/multi_step_experiment_design.md +0 -0
  85. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/experiments/representation_shootout_design.md +0 -0
  86. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/experiments/waa_benchmark_results_jan2026.md +0 -0
  87. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/experiments/waa_demo_experiment_design.md +0 -0
  88. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/gemini_grounding.md +0 -0
  89. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/github_org_profile_content.md +0 -0
  90. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/github_org_update_plan.md +0 -0
  91. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/grpo_training_report.md +0 -0
  92. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/gui_actor_integration.md +0 -0
  93. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/benchmark_viewer.png +0 -0
  94. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/dashboard/training_bottom.png +0 -0
  95. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/dashboard/training_top.png +0 -0
  96. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/dashboard/viewer_bottom.png +0 -0
  97. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/dashboard/viewer_top.png +0 -0
  98. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/grounding_demo.png +0 -0
  99. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/grounding_demo_full.png +0 -0
  100. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/training-dashboard.png +0 -0
  101. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/images/viewer-comparison.png +0 -0
  102. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/infra_refactor_design.md +0 -0
  103. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/live_benchmark_monitoring_fix.md +0 -0
  104. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/live_inference_design.md +0 -0
  105. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/mock_adapter_evaluation_fix.md +0 -0
  106. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/new_openadapt_architecture.md +0 -0
  107. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/openadapt_capture_migration_detailed.md +0 -0
  108. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/openadapt_capture_migration_plan.md +0 -0
  109. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/openadapt_integration_plan.md +0 -0
  110. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/opencua_integration.md +0 -0
  111. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/output_artifacts_and_media.md +0 -0
  112. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/parallelization_implementation.md +0 -0
  113. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/parquet_export_design.md +0 -0
  114. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/perception_integration.md +0 -0
  115. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/prediction_loading_architecture.md +0 -0
  116. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/qwen3_vl_embedding_design.md +0 -0
  117. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/qwen3_vl_embedding_literature_review.md +0 -0
  118. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/qwen3_vl_embedding_research.md +0 -0
  119. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/qwen_login_experiment.md +0 -0
  120. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/research/cua_waa_comparison.md +0 -0
  121. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/research_thesis.md +0 -0
  122. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/roadmap.md +0 -0
  123. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/safety_gate_design.md +0 -0
  124. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/schema/README.md +0 -0
  125. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/schema/episode.schema.json +0 -0
  126. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/schema_consolidation_plan.md +0 -0
  127. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/screenshots/vm_monitor_dashboard_full.png +0 -0
  128. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/screenshots/vm_monitor_details.png +0 -0
  129. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/screenshots/vm_monitor_terminal.png +0 -0
  130. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/semantic_element_capture.md +0 -0
  131. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/set_of_marks_implementation.md +0 -0
  132. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/smart_mock_agent_design.md +0 -0
  133. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/som_implementation_verification.md +0 -0
  134. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/sse_architecture.md +0 -0
  135. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/sse_benchmark_endpoint.md +0 -0
  136. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/sse_frontend_integration.md +0 -0
  137. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/sse_quick_reference.md +0 -0
  138. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/sse_usage_examples.md +0 -0
  139. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/state_and_next_steps_qwen_login.md +0 -0
  140. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/stub_training_adapter.md +0 -0
  141. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/synthetic_login_jitter_and_ablation.md +0 -0
  142. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/training_feedback_ux.md +0 -0
  143. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/trl_unsloth_integration_analysis.md +0 -0
  144. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/unified_compute_architecture.md +0 -0
  145. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/viewer_architecture_survey.md +0 -0
  146. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/viewer_consolidation_design.md +0 -0
  147. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/viewer_eval_integration.md +0 -0
  148. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/viewer_layout_redesign.md +0 -0
  149. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/viewer_redesign_proposal.md +0 -0
  150. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/vision.md +0 -0
  151. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/vm_monitor_screenshot_analysis.md +0 -0
  152. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/waa_demo_recording_guide.md +0 -0
  153. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/waa_live_adapter_design.md +0 -0
  154. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/waa_network_architecture.md +0 -0
  155. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/waa_parallelization_plan.md +0 -0
  156. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/waa_speedup_options.md +0 -0
  157. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/waa_vanilla_automation.md +0 -0
  158. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/wandb_integration.md +0 -0
  159. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/docs/website_redesign_plan.md +0 -0
  160. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/examples/README.md +0 -0
  161. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/examples/demo_retrieval_example.py +0 -0
  162. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/examples/retrieval_with_capture.py +0 -0
  163. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/examples/sample_data.json +0 -0
  164. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/examples/test_gemini_grounding.py +0 -0
  165. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/examples/train_from_json.py +0 -0
  166. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiment_results/representation_shootout/results_20260116_142335.json +0 -0
  167. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/2b_dev/media/qwen3_2b_login_demo.gif +0 -0
  168. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/2b_dev/media/qwen3_2b_login_demo_session_0001.gif +0 -0
  169. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/2b_dev/plots/base_vs_ft.png +0 -0
  170. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/2b_dev/plots/qwen3_2b_base_vs_ft_hardened_v2.png +0 -0
  171. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/2b_dev/plots/qwen_vs_apis.png +0 -0
  172. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/2b_dev_fixed/plots/qwen_base_vs_ft.png +0 -0
  173. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/8b_hero/plots/qwen3_8b_base_vs_ft_hardened_v2.png +0 -0
  174. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/SOM_INVESTIGATION_REPORT.md +0 -0
  175. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/comprehensive_comparison.png +0 -0
  176. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/login_demo.gif +0 -0
  177. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/registration_demo.gif +0 -0
  178. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/experiments/qwen_login/registration_som_eval.json +0 -0
  179. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/negative_control_results/NEGATIVE_CONTROL_REPORT.md +0 -0
  180. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/negative_control_results/RESULTS_SUMMARY.txt +0 -0
  181. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/negative_control_results/negative_control_20251231_005135.json +0 -0
  182. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/__init__.py +0 -0
  183. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/baselines/__init__.py +0 -0
  184. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/baselines/adapter.py +0 -0
  185. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/baselines/cli.py +0 -0
  186. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/baselines/config.py +0 -0
  187. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/baselines/parser.py +0 -0
  188. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/baselines/prompts.py +0 -0
  189. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/__init__.py +0 -0
  190. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/agent.py +0 -0
  191. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/azure.py +0 -0
  192. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/azure_ops_tracker.py +0 -0
  193. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/trace_export.py +0 -0
  194. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/viewer.py +0 -0
  195. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/vm_monitor.py +0 -0
  196. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/waa_deploy/Dockerfile +0 -0
  197. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/waa_deploy/__init__.py +0 -0
  198. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/waa_deploy/api_agent.py +0 -0
  199. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +0 -0
  200. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/cloud/__init__.py +0 -0
  201. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/cloud/azure_inference.py +0 -0
  202. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/cloud/lambda_labs.py +0 -0
  203. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/cloud/local.py +0 -0
  204. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/cloud/ssh_tunnel.py +0 -0
  205. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/config.py +0 -0
  206. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/datasets/__init__.py +0 -0
  207. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/datasets/next_action.py +0 -0
  208. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/evals/__init__.py +0 -0
  209. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/evals/grounding.py +0 -0
  210. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/evals/plot_eval_metrics.py +0 -0
  211. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/evals/trajectory_matching.py +0 -0
  212. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/demo_prompt/__init__.py +0 -0
  213. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/demo_prompt/format_demo.py +0 -0
  214. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +0 -0
  215. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +0 -0
  216. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +0 -0
  217. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/demo_prompt/run_experiment.py +0 -0
  218. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/representation_shootout/__init__.py +0 -0
  219. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/representation_shootout/conditions.py +0 -0
  220. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/representation_shootout/config.py +0 -0
  221. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/representation_shootout/evaluator.py +0 -0
  222. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/representation_shootout/runner.py +0 -0
  223. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/waa_demo/__init__.py +0 -0
  224. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/waa_demo/demos.py +0 -0
  225. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/waa_demo/runner.py +0 -0
  226. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/experiments/waa_demo/tasks.py +0 -0
  227. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/export/__init__.py +0 -0
  228. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/export/__main__.py +0 -0
  229. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/export/cli.py +0 -0
  230. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/export/parquet.py +0 -0
  231. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/grounding/__init__.py +0 -0
  232. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/grounding/base.py +0 -0
  233. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/grounding/detector.py +0 -0
  234. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/ingest/__init__.py +0 -0
  235. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/ingest/capture.py +0 -0
  236. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/ingest/loader.py +0 -0
  237. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/ingest/synthetic.py +0 -0
  238. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/__init__.py +0 -0
  239. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/api_adapter.py +0 -0
  240. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/base_adapter.py +0 -0
  241. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/dummy_adapter.py +0 -0
  242. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/providers/__init__.py +0 -0
  243. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/providers/anthropic.py +0 -0
  244. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/providers/base.py +0 -0
  245. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/providers/google.py +0 -0
  246. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/providers/openai.py +0 -0
  247. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/models/qwen_vl.py +0 -0
  248. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/perception/__init__.py +0 -0
  249. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/perception/integration.py +0 -0
  250. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/README.md +0 -0
  251. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/USAGE.md +0 -0
  252. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/__init__.py +0 -0
  253. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/demo_retriever.py +0 -0
  254. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/embeddings.py +0 -0
  255. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/index.py +0 -0
  256. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/retrieval/retriever.py +0 -0
  257. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/runtime/__init__.py +0 -0
  258. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/runtime/policy.py +0 -0
  259. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/runtime/safety_gate.py +0 -0
  260. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/schema/__init__.py +0 -0
  261. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/schema/converters.py +0 -0
  262. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/schema/episode.py +0 -0
  263. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/__init__.py +0 -0
  264. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/capture_screenshots.py +0 -0
  265. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/compare.py +0 -0
  266. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/demo_policy.py +0 -0
  267. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/eval_policy.py +0 -0
  268. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/make_gif.py +0 -0
  269. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/prepare_synthetic.py +0 -0
  270. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/run_qwen_login_benchmark.py +0 -0
  271. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/scripts/train.py +0 -0
  272. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/README.md +0 -0
  273. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/__init__.py +0 -0
  274. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/adapters/__init__.py +0 -0
  275. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/adapters/capture_adapter.py +0 -0
  276. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/annotator.py +0 -0
  277. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/cache.py +0 -0
  278. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/cli.py +0 -0
  279. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/deduplicator.py +0 -0
  280. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/frame_describer.py +0 -0
  281. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/pipeline.py +0 -0
  282. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/schemas.py +0 -0
  283. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/segmentation/segment_extractor.py +0 -0
  284. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/__init__.py +0 -0
  285. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/azure_ops_viewer.py +0 -0
  286. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/benchmark_viewer.py +0 -0
  287. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/shared_ui.py +0 -0
  288. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/stub_provider.py +0 -0
  289. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/trainer.py +0 -0
  290. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/trl_trainer.py +0 -0
  291. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/viewer.py +0 -0
  292. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/openadapt_ml/training/viewer_components.py +0 -0
  293. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/generate_vm_screenshots.py +0 -0
  294. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/generate_vm_screenshots_simple.py +0 -0
  295. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/p0_validate_demo_persistence.py +0 -0
  296. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/p1_episode_success_ab_test.py +0 -0
  297. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/run_demo_experiment.py +0 -0
  298. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/run_demo_experiment_n30.py +0 -0
  299. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/run_multistep_experiment.py +0 -0
  300. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/setup_azure.py +0 -0
  301. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/waa_bootstrap_helper.sh +0 -0
  302. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/scripts/waa_bootstrap_local.sh +0 -0
  303. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/__init__.py +0 -0
  304. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/benchmarks/__init__.py +0 -0
  305. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/benchmarks/test_api_agent.py +0 -0
  306. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/benchmarks/test_waa.py +0 -0
  307. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/integration/__init__.py +0 -0
  308. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/integration/test_benchmark_viewer.py +0 -0
  309. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/integration/test_data_collection.py +0 -0
  310. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/integration/test_live_eval.py +0 -0
  311. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/integration/test_sse_endpoint.py +0 -0
  312. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_action_parsing.py +0 -0
  313. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_api_adapter.py +0 -0
  314. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_baselines.py +0 -0
  315. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_batching.py +0 -0
  316. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_capture_adapter.py +0 -0
  317. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_demo_persistence.py +0 -0
  318. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_demo_retrieval.py +0 -0
  319. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_gemini_grounding_imports.py +0 -0
  320. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_local_cli.py +0 -0
  321. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_mock_labeling.py +0 -0
  322. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_negative_control.py +0 -0
  323. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_parquet_export.py +0 -0
  324. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_providers.py +0 -0
  325. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_retrieval.py +0 -0
  326. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_segmentation_pipeline.py +0 -0
  327. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_terminal_output.py +0 -0
  328. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_training_dummy.py +0 -0
  329. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_trl_trainer.py +0 -0
  330. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_viewer_screenshots.py +0 -0
  331. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/tests/test_waa_demo.py +0 -0
  332. {openadapt_ml-0.3.0 → openadapt_ml-0.3.1}/uv.lock +0 -0
@@ -1,6 +1,44 @@
1
1
  # CHANGELOG
2
2
 
3
3
 
4
+ ## v0.3.1 (2026-02-05)
5
+
6
+ ### Bug Fixes
7
+
8
+ - **cli**: Resolve ruff linter errors
9
+ ([`210a31f`](https://github.com/OpenAdaptAI/openadapt-ml/commit/210a31fcc054238a08e609520bdf57c312600d72))
10
+
11
+ - Replace bare `except:` with `except Exception:`; remove unused f-string prefixes; remove unused
12
+ variable assignments; remove unused imports
13
+
14
+ Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
15
+
16
+ ### Documentation
17
+
18
+ - **readme**: Add parallel WAA evaluation, fix build badge
19
+ ([#19](https://github.com/OpenAdaptAI/openadapt-ml/pull/19),
20
+ [`fea0a10`](https://github.com/OpenAdaptAI/openadapt-ml/commit/fea0a10c514b87a8a73310a142acb73a6b31146e))
21
+
22
+ * docs(readme): add parallel WAA evaluation section, fix build badge
23
+
24
+ - Fix broken build badge (publish.yml → release.yml); add prominent "Parallel WAA Benchmark
25
+ Evaluation" section near top; add detailed "WAA Benchmark Workflow" section (#14) with single
26
+ VM and parallel pool workflows, VNC access instructions, architecture diagram, and cost estimates;
27
+ update section numbering (Limitations → 15, Roadmap → 16)
28
+
29
+ Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
30
+
31
+ * fix(readme): address self-review feedback
32
+
33
+ - Fix anchor placement (move before heading for proper navigation); correct pool-delete →
34
+ pool-cleanup (actual command name); add pool-status example for getting worker IPs; add "prices
35
+ vary by region" caveat
36
+
37
+ ---------
38
+
39
+ Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
40
+
41
+
4
42
  ## v0.3.0 (2026-02-05)
5
43
 
6
44
  ### Bug Fixes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openadapt-ml
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Model-agnostic, domain-agnostic ML engine for GUI automation agents
5
5
  Project-URL: Homepage, https://github.com/OpenAdaptAI/openadapt-ml
6
6
  Project-URL: Repository, https://github.com/OpenAdaptAI/openadapt-ml
@@ -58,7 +58,7 @@ Description-Content-Type: text/markdown
58
58
 
59
59
  # OpenAdapt-ML
60
60
 
61
- [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml)
61
+ [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml)
62
62
  [![PyPI version](https://img.shields.io/pypi/v/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
63
63
  [![Downloads](https://img.shields.io/pypi/dm/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
64
64
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -88,6 +88,38 @@ The design is described in detail in [`docs/design.md`](docs/design.md).
88
88
 
89
89
  ---
90
90
 
91
+ ## Parallel WAA Benchmark Evaluation (New in v0.3.0)
92
+
93
+ Run Windows Agent Arena benchmarks across multiple Azure VMs in parallel for faster evaluation:
94
+
95
+ ```bash
96
+ # Create a pool of 5 workers
97
+ uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
98
+
99
+ # Wait for all workers to be ready
100
+ uv run python -m openadapt_ml.benchmarks.cli pool-wait
101
+
102
+ # Run 154 tasks distributed across workers (~5x faster)
103
+ uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
104
+ ```
105
+
106
+ **Key features:**
107
+ - **Parallel execution**: Distribute 154 WAA tasks across N workers
108
+ - **Automatic task distribution**: Uses WAA's native `--worker_id`/`--num_workers` for round-robin assignment
109
+ - **VNC access**: View each Windows VM via SSH tunnels (`localhost:8006`, `localhost:8007`, etc.)
110
+ - **Cost tracking**: Monitor Azure VM costs in real-time
111
+
112
+ **Performance:**
113
+ | Workers | Estimated Time (154 tasks) |
114
+ |---------|---------------------------|
115
+ | 1 | ~50-80 hours |
116
+ | 5 | ~10-16 hours |
117
+ | 10 | ~5-8 hours |
118
+
119
+ See [WAA Benchmark Workflow](#waa-benchmark-workflow) for complete setup instructions.
120
+
121
+ ---
122
+
91
123
  ## 1. Installation
92
124
 
93
125
  ### 1.1 From PyPI (recommended)
@@ -1029,7 +1061,112 @@ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal --no-t
1029
1061
 
1030
1062
  ---
1031
1063
 
1032
- ## 14. Limitations & Notes
1064
+ <a id="waa-benchmark-workflow"></a>
1065
+
1066
+ ## 14. WAA Benchmark Workflow
1067
+
1068
+ Windows Agent Arena (WAA) is a benchmark of 154 tasks across 11 Windows domains. OpenAdapt-ML provides infrastructure to run WAA evaluations on Azure VMs with parallel execution.
1069
+
1070
+ ### 14.1 Prerequisites
1071
+
1072
+ 1. **Azure CLI**: `brew install azure-cli && az login`
1073
+ 2. **OpenAI API Key**: Set in `.env` file (`OPENAI_API_KEY=sk-...`)
1074
+ 3. **Azure quota**: Ddsv5 family VMs (8+ vCPUs per worker)
1075
+
1076
+ ### 14.2 Single VM Workflow
1077
+
1078
+ For quick testing or small runs:
1079
+
1080
+ ```bash
1081
+ # Setup VM with WAA
1082
+ uv run python -m openadapt_ml.benchmarks.cli vm setup-waa
1083
+
1084
+ # Start monitoring dashboard (auto-opens VNC, manages SSH tunnels)
1085
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
1086
+
1087
+ # Run benchmark
1088
+ uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 10
1089
+
1090
+ # Deallocate when done (stops compute billing; the disk is kept and still incurs storage charges)
1091
+ uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y
1092
+ ```
1093
+
1094
+ ### 14.3 Parallel Pool Workflow (Recommended)
1095
+
1096
+ For full 154-task evaluations, use multiple VMs:
1097
+
1098
+ ```bash
1099
+ # 1. Create pool (provisions N Azure VMs with Docker + WAA)
1100
+ uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
1101
+
1102
+ # 2. Wait for all workers to be ready (Windows boot + WAA server startup)
1103
+ uv run python -m openadapt_ml.benchmarks.cli pool-wait
1104
+
1105
+ # 3. Run benchmark across all workers
1106
+ # Tasks are distributed using WAA's native --worker_id/--num_workers
1107
+ uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
1108
+
1109
+ # 4. Monitor progress
1110
+ uv run python -m openadapt_ml.benchmarks.cli pool-status
1111
+ uv run python -m openadapt_ml.benchmarks.cli pool-logs
1112
+
1113
+ # 5. Cleanup (delete all VMs - IMPORTANT to stop billing!)
1114
+ uv run python -m openadapt_ml.benchmarks.cli pool-cleanup
1115
+ ```
1116
+
1117
+ ### 14.4 VNC Access to Workers
1118
+
1119
+ View what each Windows VM is doing:
1120
+
1121
+ ```bash
1122
+ # Get worker IPs
1123
+ uv run python -m openadapt_ml.benchmarks.cli pool-status
1124
+
1125
+ # Set up SSH tunnels (tunnels are created automatically, but you can also do this manually)
1126
+ ssh -f -N -L 8006:localhost:8006 azureuser@<worker-0-ip> # localhost:8006
1127
+ ssh -f -N -L 8007:localhost:8006 azureuser@<worker-1-ip> # localhost:8007
1128
+ # etc.
1129
+
1130
+ # Open in browser
1131
+ open http://localhost:8006 # Worker 0
1132
+ open http://localhost:8007 # Worker 1
1133
+ ```
1134
+
1135
+ ### 14.5 Architecture
1136
+
1137
+ ```
1138
+ Local Machine
1139
+ ├── openadapt-ml CLI (pool-create, pool-wait, pool-run)
1140
+ │ └── SSH tunnels to each worker
1141
+
1142
+ Azure (N VMs, Standard_D8ds_v5)
1143
+ ├── waa-pool-00
1144
+ │ └── Docker
1145
+ │ └── windowsarena/winarena:latest
1146
+ │ └── QEMU (Windows 11)
1147
+ │ ├── WAA Flask server (port 5000)
1148
+ │ └── Navi agent (GPT-4o-mini)
1149
+ ├── waa-pool-01
1150
+ │ └── ...
1151
+ └── waa-pool-N
1152
+ └── ...
1153
+ ```
1154
+
1155
+ ### 14.6 Cost Estimates
1156
+
1157
+ | VM Size | vCPUs | RAM | Cost/hr | 5 VMs for 10hrs |
1158
+ |---------|-------|-----|---------|-----------------|
1159
+ | Standard_D8ds_v5 | 8 | 32GB | ~$0.38 | ~$19 |
1160
+
1161
+ **Tips:**
1162
+ - Always run `pool-cleanup` when done to delete VMs and stop billing
1163
+ - Use `vm deallocate` (not delete) to stop compute charges while keeping the disk (disk storage is still billed)
1164
+ - Set `--auto-shutdown-hours 2` on `vm monitor` for safety
1165
+ - Prices vary by Azure region
1166
+
1167
+ ---
1168
+
1169
+ ## 15. Limitations & Notes
1033
1170
 
1034
1171
  - **Apple Silicon / bitsandbytes**:
1035
1172
  - Example configs are sized for CPU / Apple Silicon development runs; see
@@ -1053,7 +1190,7 @@ For deeper architectural details, see [`docs/design.md`](docs/design.md).
1053
1190
 
1054
1191
  ---
1055
1192
 
1056
- ## 15. Roadmap
1193
+ ## 16. Roadmap
1057
1194
 
1058
1195
  For the up-to-date, prioritized roadmap (including concrete implementation
1059
1196
  targets and agent-executable acceptance criteria), see
@@ -1,6 +1,6 @@
1
1
  # OpenAdapt-ML
2
2
 
3
- [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/publish.yml)
3
+ [![Build Status](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml/badge.svg)](https://github.com/OpenAdaptAI/openadapt-ml/actions/workflows/release.yml)
4
4
  [![PyPI version](https://img.shields.io/pypi/v/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
5
5
  [![Downloads](https://img.shields.io/pypi/dm/openadapt-ml.svg)](https://pypi.org/project/openadapt-ml/)
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -30,6 +30,38 @@ The design is described in detail in [`docs/design.md`](docs/design.md).
30
30
 
31
31
  ---
32
32
 
33
+ ## Parallel WAA Benchmark Evaluation (New in v0.3.0)
34
+
35
+ Run Windows Agent Arena benchmarks across multiple Azure VMs in parallel for faster evaluation:
36
+
37
+ ```bash
38
+ # Create a pool of 5 workers
39
+ uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
40
+
41
+ # Wait for all workers to be ready
42
+ uv run python -m openadapt_ml.benchmarks.cli pool-wait
43
+
44
+ # Run 154 tasks distributed across workers (~5x faster)
45
+ uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
46
+ ```
47
+
48
+ **Key features:**
49
+ - **Parallel execution**: Distribute 154 WAA tasks across N workers
50
+ - **Automatic task distribution**: Uses WAA's native `--worker_id`/`--num_workers` for round-robin assignment
51
+ - **VNC access**: View each Windows VM via SSH tunnels (`localhost:8006`, `localhost:8007`, etc.)
52
+ - **Cost tracking**: Monitor Azure VM costs in real-time
53
+
54
+ **Performance:**
55
+ | Workers | Estimated Time (154 tasks) |
56
+ |---------|---------------------------|
57
+ | 1 | ~50-80 hours |
58
+ | 5 | ~10-16 hours |
59
+ | 10 | ~5-8 hours |
60
+
61
+ See [WAA Benchmark Workflow](#waa-benchmark-workflow) for complete setup instructions.
62
+
63
+ ---
64
+
33
65
  ## 1. Installation
34
66
 
35
67
  ### 1.1 From PyPI (recommended)
@@ -971,7 +1003,112 @@ uv run python -m openadapt_ml.benchmarks.cli screenshot --target terminal --no-t
971
1003
 
972
1004
  ---
973
1005
 
974
- ## 14. Limitations & Notes
1006
+ <a id="waa-benchmark-workflow"></a>
1007
+
1008
+ ## 14. WAA Benchmark Workflow
1009
+
1010
+ Windows Agent Arena (WAA) is a benchmark of 154 tasks across 11 Windows domains. OpenAdapt-ML provides infrastructure to run WAA evaluations on Azure VMs with parallel execution.
1011
+
1012
+ ### 14.1 Prerequisites
1013
+
1014
+ 1. **Azure CLI**: `brew install azure-cli && az login`
1015
+ 2. **OpenAI API Key**: Set in `.env` file (`OPENAI_API_KEY=sk-...`)
1016
+ 3. **Azure quota**: Ddsv5 family VMs (8+ vCPUs per worker)
1017
+
1018
+ ### 14.2 Single VM Workflow
1019
+
1020
+ For quick testing or small runs:
1021
+
1022
+ ```bash
1023
+ # Setup VM with WAA
1024
+ uv run python -m openadapt_ml.benchmarks.cli vm setup-waa
1025
+
1026
+ # Start monitoring dashboard (auto-opens VNC, manages SSH tunnels)
1027
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
1028
+
1029
+ # Run benchmark
1030
+ uv run python -m openadapt_ml.benchmarks.cli waa --num-tasks 10
1031
+
1032
+ # Deallocate when done (stops compute billing; the disk is kept and still incurs storage charges)
1033
+ uv run python -m openadapt_ml.benchmarks.cli vm deallocate -y
1034
+ ```
1035
+
1036
+ ### 14.3 Parallel Pool Workflow (Recommended)
1037
+
1038
+ For full 154-task evaluations, use multiple VMs:
1039
+
1040
+ ```bash
1041
+ # 1. Create pool (provisions N Azure VMs with Docker + WAA)
1042
+ uv run python -m openadapt_ml.benchmarks.cli pool-create --workers 5
1043
+
1044
+ # 2. Wait for all workers to be ready (Windows boot + WAA server startup)
1045
+ uv run python -m openadapt_ml.benchmarks.cli pool-wait
1046
+
1047
+ # 3. Run benchmark across all workers
1048
+ # Tasks are distributed using WAA's native --worker_id/--num_workers
1049
+ uv run python -m openadapt_ml.benchmarks.cli pool-run --tasks 154
1050
+
1051
+ # 4. Monitor progress
1052
+ uv run python -m openadapt_ml.benchmarks.cli pool-status
1053
+ uv run python -m openadapt_ml.benchmarks.cli pool-logs
1054
+
1055
+ # 5. Cleanup (delete all VMs - IMPORTANT to stop billing!)
1056
+ uv run python -m openadapt_ml.benchmarks.cli pool-cleanup
1057
+ ```
1058
+
1059
+ ### 14.4 VNC Access to Workers
1060
+
1061
+ View what each Windows VM is doing:
1062
+
1063
+ ```bash
1064
+ # Get worker IPs
1065
+ uv run python -m openadapt_ml.benchmarks.cli pool-status
1066
+
1067
+ # Set up SSH tunnels (tunnels are created automatically, but you can also do this manually)
1068
+ ssh -f -N -L 8006:localhost:8006 azureuser@<worker-0-ip> # localhost:8006
1069
+ ssh -f -N -L 8007:localhost:8006 azureuser@<worker-1-ip> # localhost:8007
1070
+ # etc.
1071
+
1072
+ # Open in browser
1073
+ open http://localhost:8006 # Worker 0
1074
+ open http://localhost:8007 # Worker 1
1075
+ ```
1076
+
1077
+ ### 14.5 Architecture
1078
+
1079
+ ```
1080
+ Local Machine
1081
+ ├── openadapt-ml CLI (pool-create, pool-wait, pool-run)
1082
+ │ └── SSH tunnels to each worker
1083
+
1084
+ Azure (N VMs, Standard_D8ds_v5)
1085
+ ├── waa-pool-00
1086
+ │ └── Docker
1087
+ │ └── windowsarena/winarena:latest
1088
+ │ └── QEMU (Windows 11)
1089
+ │ ├── WAA Flask server (port 5000)
1090
+ │ └── Navi agent (GPT-4o-mini)
1091
+ ├── waa-pool-01
1092
+ │ └── ...
1093
+ └── waa-pool-N
1094
+ └── ...
1095
+ ```
1096
+
1097
+ ### 14.6 Cost Estimates
1098
+
1099
+ | VM Size | vCPUs | RAM | Cost/hr | 5 VMs for 10hrs |
1100
+ |---------|-------|-----|---------|-----------------|
1101
+ | Standard_D8ds_v5 | 8 | 32GB | ~$0.38 | ~$19 |
1102
+
1103
+ **Tips:**
1104
+ - Always run `pool-cleanup` when done to delete VMs and stop billing
1105
+ - Use `vm deallocate` (not delete) to stop compute charges while keeping the disk (disk storage is still billed)
1106
+ - Set `--auto-shutdown-hours 2` on `vm monitor` for safety
1107
+ - Prices vary by Azure region
1108
+
1109
+ ---
1110
+
1111
+ ## 15. Limitations & Notes
975
1112
 
976
1113
  - **Apple Silicon / bitsandbytes**:
977
1114
  - Example configs are sized for CPU / Apple Silicon development runs; see
@@ -995,7 +1132,7 @@ For deeper architectural details, see [`docs/design.md`](docs/design.md).
995
1132
 
996
1133
  ---
997
1134
 
998
- ## 15. Roadmap
1135
+ ## 16. Roadmap
999
1136
 
1000
1137
  For the up-to-date, prioritized roadmap (including concrete implementation
1001
1138
  targets and agent-executable acceptance criteria), see