openadapt-ml 0.1.0__tar.gz → 0.2.0__tar.gz

This diff shows the changes between publicly available package versions as published to a supported registry. It is provided for informational purposes only.
Files changed (233)
  1. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/.gitignore +1 -0
  2. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/CLAUDE.md +379 -16
  3. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/PKG-INFO +102 -60
  4. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/README.md +96 -59
  5. openadapt_ml-0.2.0/RETRIEVAL_QUICKSTART.md +238 -0
  6. openadapt_ml-0.2.0/docs/GEMINI_GROUNDING_QUICKSTART.md +274 -0
  7. openadapt_ml-0.2.0/docs/IMPLEMENTATION_SUMMARY_GEMINI_GROUNDING.md +322 -0
  8. openadapt_ml-0.2.0/docs/PRIORITY_2_COMPLETION_SUMMARY.md +330 -0
  9. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/azure_waa_setup.md +48 -4
  10. openadapt_ml-0.2.0/docs/background_task_visibility.md +744 -0
  11. openadapt_ml-0.2.0/docs/benchmark_run_ui_design.md +361 -0
  12. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/benchmark_viewer_integration.md +60 -15
  13. openadapt_ml-0.2.0/docs/benchmark_viewer_ux_improvements.md +330 -0
  14. openadapt_ml-0.2.0/docs/capture_format_decision.md +229 -0
  15. openadapt_ml-0.2.0/docs/chrome_extension_design.md +1202 -0
  16. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/cloud_gpu_training.md +60 -1
  17. openadapt_ml-0.2.0/docs/current_state_dec2024.md +223 -0
  18. openadapt_ml-0.2.0/docs/demo_prompt_experiment.md +214 -0
  19. openadapt_ml-0.2.0/docs/demo_retrieval_design.md +666 -0
  20. openadapt_ml-0.2.0/docs/enterprise_integration.md +459 -0
  21. openadapt_ml-0.2.0/docs/experiments/demo_conditioned_prompting_results.md +340 -0
  22. openadapt_ml-0.2.0/docs/experiments/multi_step_experiment_design.md +354 -0
  23. openadapt_ml-0.2.0/docs/experiments/waa_demo_experiment_design.md +430 -0
  24. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/gemini_grounding.md +2 -4
  25. openadapt_ml-0.2.0/docs/images/benchmark_viewer.png +0 -0
  26. openadapt_ml-0.2.0/docs/infra_refactor_design.md +986 -0
  27. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/live_inference_design.md +1 -1
  28. openadapt_ml-0.2.0/docs/mock_adapter_evaluation_fix.md +379 -0
  29. openadapt_ml-0.2.0/docs/openadapt_capture_migration_detailed.md +1434 -0
  30. openadapt_ml-0.2.0/docs/openadapt_capture_migration_plan.md +148 -0
  31. openadapt_ml-0.2.0/docs/parallelization_implementation.md +1141 -0
  32. openadapt_ml-0.2.0/docs/parquet_export_design.md +299 -0
  33. openadapt_ml-0.2.0/docs/research_thesis.md +69 -0
  34. openadapt_ml-0.2.0/docs/schema/README.md +206 -0
  35. openadapt_ml-0.2.0/docs/schema/episode.schema.json +904 -0
  36. openadapt_ml-0.2.0/docs/schema_consolidation_plan.md +282 -0
  37. openadapt_ml-0.2.0/docs/semantic_element_capture.md +618 -0
  38. openadapt_ml-0.2.0/docs/smart_mock_agent_design.md +349 -0
  39. openadapt_ml-0.2.0/docs/sse_architecture.md +432 -0
  40. openadapt_ml-0.2.0/docs/sse_benchmark_endpoint.md +252 -0
  41. openadapt_ml-0.2.0/docs/sse_frontend_integration.md +888 -0
  42. openadapt_ml-0.2.0/docs/sse_quick_reference.md +257 -0
  43. openadapt_ml-0.2.0/docs/sse_usage_examples.md +487 -0
  44. openadapt_ml-0.2.0/docs/waa_demo_recording_guide.md +209 -0
  45. openadapt_ml-0.2.0/docs/waa_live_adapter_design.md +660 -0
  46. openadapt_ml-0.2.0/docs/waa_network_architecture.md +157 -0
  47. openadapt_ml-0.2.0/docs/waa_parallelization_plan.md +281 -0
  48. openadapt_ml-0.2.0/docs/waa_setup.md +315 -0
  49. openadapt_ml-0.2.0/examples/demo_retrieval_example.py +234 -0
  50. openadapt_ml-0.2.0/examples/retrieval_with_capture.py +132 -0
  51. openadapt_ml-0.2.0/examples/train_from_json.py +294 -0
  52. openadapt_ml-0.2.0/negative_control_results/NEGATIVE_CONTROL_REPORT.md +215 -0
  53. openadapt_ml-0.2.0/negative_control_results/RESULTS_SUMMARY.txt +137 -0
  54. openadapt_ml-0.2.0/negative_control_results/negative_control_20251231_005135.json +64 -0
  55. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/__init__.py +8 -0
  56. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/agent.py +90 -11
  57. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/azure.py +35 -6
  58. openadapt_ml-0.2.0/openadapt_ml/benchmarks/cli.py +5132 -0
  59. openadapt_ml-0.2.0/openadapt_ml/benchmarks/live_tracker.py +180 -0
  60. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/runner.py +41 -4
  61. openadapt_ml-0.2.0/openadapt_ml/benchmarks/viewer.py +1219 -0
  62. openadapt_ml-0.2.0/openadapt_ml/benchmarks/vm_monitor.py +610 -0
  63. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/waa.py +61 -4
  64. openadapt_ml-0.2.0/openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  65. openadapt_ml-0.2.0/openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  66. openadapt_ml-0.2.0/openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  67. openadapt_ml-0.2.0/openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  68. openadapt_ml-0.2.0/openadapt_ml/benchmarks/waa_live.py +619 -0
  69. openadapt_ml-0.2.0/openadapt_ml/cloud/local.py +2344 -0
  70. openadapt_ml-0.2.0/openadapt_ml/cloud/ssh_tunnel.py +553 -0
  71. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/datasets/next_action.py +87 -68
  72. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/evals/grounding.py +26 -8
  73. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/evals/trajectory_matching.py +84 -36
  74. openadapt_ml-0.2.0/openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  75. openadapt_ml-0.2.0/openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  76. openadapt_ml-0.2.0/openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  77. openadapt_ml-0.2.0/openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  78. openadapt_ml-0.2.0/openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  79. openadapt_ml-0.2.0/openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  80. openadapt_ml-0.2.0/openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  81. openadapt_ml-0.2.0/openadapt_ml/experiments/waa_demo/demos.py +357 -0
  82. openadapt_ml-0.2.0/openadapt_ml/experiments/waa_demo/runner.py +717 -0
  83. openadapt_ml-0.2.0/openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  84. openadapt_ml-0.2.0/openadapt_ml/export/__init__.py +9 -0
  85. openadapt_ml-0.2.0/openadapt_ml/export/__main__.py +6 -0
  86. openadapt_ml-0.2.0/openadapt_ml/export/cli.py +89 -0
  87. openadapt_ml-0.2.0/openadapt_ml/export/parquet.py +265 -0
  88. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/ingest/__init__.py +3 -4
  89. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/ingest/capture.py +89 -81
  90. openadapt_ml-0.2.0/openadapt_ml/ingest/loader.py +280 -0
  91. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/ingest/synthetic.py +221 -159
  92. openadapt_ml-0.2.0/openadapt_ml/retrieval/README.md +226 -0
  93. openadapt_ml-0.2.0/openadapt_ml/retrieval/USAGE.md +391 -0
  94. openadapt_ml-0.2.0/openadapt_ml/retrieval/__init__.py +91 -0
  95. openadapt_ml-0.2.0/openadapt_ml/retrieval/demo_retriever.py +817 -0
  96. openadapt_ml-0.2.0/openadapt_ml/retrieval/embeddings.py +629 -0
  97. openadapt_ml-0.2.0/openadapt_ml/retrieval/index.py +194 -0
  98. openadapt_ml-0.2.0/openadapt_ml/retrieval/retriever.py +160 -0
  99. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/runtime/policy.py +10 -10
  100. openadapt_ml-0.2.0/openadapt_ml/schema/__init__.py +104 -0
  101. openadapt_ml-0.2.0/openadapt_ml/schema/converters.py +541 -0
  102. openadapt_ml-0.2.0/openadapt_ml/schema/episode.py +457 -0
  103. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/compare.py +26 -16
  104. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/eval_policy.py +4 -5
  105. openadapt_ml-0.2.0/openadapt_ml/scripts/prepare_synthetic.py +40 -0
  106. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/train.py +81 -70
  107. openadapt_ml-0.2.0/openadapt_ml/training/benchmark_viewer.py +4763 -0
  108. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/training/trainer.py +120 -363
  109. openadapt_ml-0.2.0/openadapt_ml/training/trl_trainer.py +354 -0
  110. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/pyproject.toml +11 -1
  111. openadapt_ml-0.2.0/scripts/p0_validate_demo_persistence.py +358 -0
  112. openadapt_ml-0.2.0/scripts/run_demo_experiment.py +240 -0
  113. openadapt_ml-0.2.0/scripts/run_demo_experiment_n30.py +371 -0
  114. openadapt_ml-0.2.0/scripts/run_multistep_experiment.py +402 -0
  115. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/test_action_parsing.py +17 -20
  116. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/test_api_adapter.py +2 -2
  117. openadapt_ml-0.2.0/tests/test_demo_retrieval.py +743 -0
  118. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/test_local_cli.py +35 -74
  119. openadapt_ml-0.2.0/tests/test_parquet_export.py +174 -0
  120. openadapt_ml-0.2.0/tests/test_retrieval.py +243 -0
  121. openadapt_ml-0.2.0/tests/test_training_dummy.py +90 -0
  122. openadapt_ml-0.2.0/tests/test_trl_trainer.py +617 -0
  123. openadapt_ml-0.2.0/tests/test_waa_demo.py +358 -0
  124. openadapt_ml-0.2.0/tests/test_waa_live.py +314 -0
  125. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/uv.lock +747 -3
  126. openadapt_ml-0.1.0/examples/train_from_json.py +0 -153
  127. openadapt_ml-0.1.0/openadapt_ml/benchmarks/cli.py +0 -884
  128. openadapt_ml-0.1.0/openadapt_ml/cloud/local.py +0 -790
  129. openadapt_ml-0.1.0/openadapt_ml/ingest/loader.py +0 -232
  130. openadapt_ml-0.1.0/openadapt_ml/schemas/__init__.py +0 -53
  131. openadapt_ml-0.1.0/openadapt_ml/schemas/sessions.py +0 -122
  132. openadapt_ml-0.1.0/openadapt_ml/schemas/validation.py +0 -252
  133. openadapt_ml-0.1.0/openadapt_ml/scripts/prepare_synthetic.py +0 -43
  134. openadapt_ml-0.1.0/openadapt_ml/training/benchmark_viewer.py +0 -1538
  135. openadapt_ml-0.1.0/tests/test_training_dummy.py +0 -26
  136. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/.env.example +0 -0
  137. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/.github/workflows/publish.yml +0 -0
  138. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/.gitmodules +0 -0
  139. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/.python-version +0 -0
  140. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/LICENSE +0 -0
  141. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen2_5vl_synthetic.yaml +0 -0
  142. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_capture.yaml +0 -0
  143. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_capture_4bit.yaml +0 -0
  144. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_capture_batched.yaml +0 -0
  145. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_synthetic.yaml +0 -0
  146. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_synthetic_coord_v2.yaml +0 -0
  147. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_synthetic_dev.yaml +0 -0
  148. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_synthetic_registration_som.yaml +0 -0
  149. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/configs/qwen3vl_synthetic_som.yaml +0 -0
  150. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/NEXT_STEPS_GROUNDING_ARCHITECTURE.md +0 -0
  151. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/PRIVACY_IMPLEMENTATION_PLAN.md +0 -0
  152. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/RECORD_IMPLEMENTATION_PLAN.md +0 -0
  153. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/auto_shutoff_design.md +0 -0
  154. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/azure_acr_authentication.md +0 -0
  155. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/batching_and_schedulers.md +0 -0
  156. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/benchmark_integration_plan.md +0 -0
  157. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/benchmark_next_steps.md +0 -0
  158. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/benchmark_viewer_phase2.md +0 -0
  159. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/dashboard_architecture.md +0 -0
  160. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/design.md +0 -0
  161. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/early_termination.md +0 -0
  162. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/eval_json_schema.md +0 -0
  163. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/gui_actor_integration.md +0 -0
  164. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/dashboard/training_bottom.png +0 -0
  165. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/dashboard/training_top.png +0 -0
  166. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/dashboard/viewer_bottom.png +0 -0
  167. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/dashboard/viewer_top.png +0 -0
  168. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/grounding_demo.png +0 -0
  169. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/grounding_demo_full.png +0 -0
  170. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/training-dashboard.png +0 -0
  171. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/images/viewer-comparison.png +0 -0
  172. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/opencua_integration.md +0 -0
  173. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/output_artifacts_and_media.md +0 -0
  174. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/prediction_loading_architecture.md +0 -0
  175. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/qwen_login_experiment.md +0 -0
  176. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/roadmap.md +0 -0
  177. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/set_of_marks_implementation.md +0 -0
  178. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/som_implementation_verification.md +0 -0
  179. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/state_and_next_steps_qwen_login.md +0 -0
  180. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/stub_training_adapter.md +0 -0
  181. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/synthetic_login_jitter_and_ablation.md +0 -0
  182. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/training_feedback_ux.md +0 -0
  183. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/unified_compute_architecture.md +0 -0
  184. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/viewer_eval_integration.md +0 -0
  185. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/viewer_layout_redesign.md +0 -0
  186. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/vision.md +0 -0
  187. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/docs/wandb_integration.md +0 -0
  188. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/examples/README.md +0 -0
  189. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/examples/sample_data.json +0 -0
  190. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/examples/test_gemini_grounding.py +0 -0
  191. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/2b_dev/media/qwen3_2b_login_demo.gif +0 -0
  192. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/2b_dev/media/qwen3_2b_login_demo_session_0001.gif +0 -0
  193. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/2b_dev/plots/base_vs_ft.png +0 -0
  194. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/2b_dev/plots/qwen3_2b_base_vs_ft_hardened_v2.png +0 -0
  195. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/2b_dev/plots/qwen_vs_apis.png +0 -0
  196. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/8b_hero/plots/qwen3_8b_base_vs_ft_hardened_v2.png +0 -0
  197. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/SOM_INVESTIGATION_REPORT.md +0 -0
  198. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/comprehensive_comparison.png +0 -0
  199. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/login_demo.gif +0 -0
  200. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/experiments/qwen_login/registration_demo.gif +0 -0
  201. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/__init__.py +0 -0
  202. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/base.py +0 -0
  203. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/benchmarks/data_collection.py +0 -0
  204. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/cloud/__init__.py +0 -0
  205. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/cloud/azure_inference.py +0 -0
  206. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/cloud/lambda_labs.py +0 -0
  207. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/config.py +0 -0
  208. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/datasets/__init__.py +0 -0
  209. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/evals/__init__.py +0 -0
  210. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/evals/plot_eval_metrics.py +0 -0
  211. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/grounding/__init__.py +0 -0
  212. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/grounding/base.py +0 -0
  213. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/grounding/detector.py +0 -0
  214. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/models/__init__.py +0 -0
  215. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/models/api_adapter.py +0 -0
  216. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/models/base_adapter.py +0 -0
  217. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/models/dummy_adapter.py +0 -0
  218. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/models/qwen_vl.py +0 -0
  219. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/runtime/__init__.py +0 -0
  220. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/__init__.py +0 -0
  221. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/demo_policy.py +0 -0
  222. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/make_gif.py +0 -0
  223. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/scripts/run_qwen_login_benchmark.py +0 -0
  224. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/training/__init__.py +0 -0
  225. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/training/shared_ui.py +0 -0
  226. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/training/stub_provider.py +0 -0
  227. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/openadapt_ml/training/viewer.py +0 -0
  228. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/scripts/fix_acr_auth.py +0 -0
  229. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/scripts/setup_azure.py +0 -0
  230. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/__init__.py +0 -0
  231. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/benchmarks/__init__.py +0 -0
  232. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/benchmarks/test_api_agent.py +0 -0
  233. {openadapt_ml-0.1.0 → openadapt_ml-0.2.0}/tests/benchmarks/test_waa.py +0 -0
@@ -39,3 +39,4 @@ debug_*/
 
  # Internal documentation (not for public repo)
  docs/internal/
+ docs/internal/
@@ -2,6 +2,108 @@
 
  This file helps maintain context across sessions.
 
+ ---
+ ## ⚠️⚠️⚠️ MANDATORY: START DASHBOARD FIRST ⚠️⚠️⚠️
+
+ ### STOP. READ THIS BEFORE DOING ANYTHING.
+
+ **If ANY of these are true, you MUST run the dashboard command IMMEDIATELY:**
+ - Session just started or was compacted
+ - User mentions VMs, Azure, WAA, benchmark, or Windows
+ - You're about to run ANY `vm` subcommand (probe, diag, logs, run-waa, etc.)
+ - You want to check benchmark status
+
+ **THE COMMAND (run this FIRST, not after other commands):**
+ ```bash
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
+ ```
+
+ **WHY THIS MATTERS:**
+ - VNC is ONLY accessible via SSH tunnel at `localhost:8006` (NOT the public IP)
+ - The dashboard auto-manages SSH tunnels
+ - Without it, you cannot see what Windows is doing
+ - The user WILL be frustrated if you keep forgetting this
+
+ **WRONG (what you keep doing):**
+ ```bash
+ # DON'T do this - checking probe/diag/logs WITHOUT dashboard running
+ uv run python -m openadapt_ml.benchmarks.cli vm probe
+ uv run python -m openadapt_ml.benchmarks.cli vm diag
+ # Then telling user to "run vm monitor" - NO! YOU run it FIRST!
+ ```
+
+ **RIGHT (what you should do):**
+ ```bash
+ # ALWAYS start dashboard FIRST, then it handles everything
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
+ ```
+
+ **After every /compact or session restart, your LITERAL FIRST ACTION must be starting this dashboard if VMs are involved.**
+
+ ---
+ ## 🚨🚨🚨 STOP! READ THIS BEFORE EVERY COMMAND 🚨🚨🚨
+
+ ### ABSOLUTELY NEVER USE RAW SSH COMMANDS
+
+ **This is the #1 rule. You have been told this MANY times. STOP IGNORING IT.**
+
+ ❌ **BANNED** (never type these):
+ - `ssh azureuser@IP "anything"`
+ - `ssh $SSH_OPTS ...`
+ - Any command starting with `ssh` to the VM
+
+ ✅ **REQUIRED** (always use these instead):
+ - `uv run python -m openadapt_ml.benchmarks.cli vm exec --cmd "your command"`
+ - `uv run python -m openadapt_ml.benchmarks.cli vm diag`
+ - `uv run python -m openadapt_ml.benchmarks.cli vm logs`
+
+ **If a CLI command doesn't exist, ADD IT TO THE CLI FIRST, then use it.**
+
+ **Before running ANY command involving the VM, ask yourself:**
+ 1. Does this start with `ssh`? → STOP, use CLI instead
+ 2. Is this a raw shell command to the VM? → STOP, use CLI instead
+ 3. Can I use `vm exec --cmd`? → YES, use it
+
+ This has been explained to you repeatedly. FOLLOW IT.
+
+ ---
+ ## 🔧 DOCKERFILE/VM CHANGES: TEST INSIDE CONTAINER FIRST
+
+ **Problem**: Each Dockerfile change triggers: rebuild (10 min) → Windows boot (15 min) → test → repeat. Hours wasted on tiny changes.
+
+ **Solution**: Test fixes INSIDE a running container BEFORE rebuilding:
+
+ ```bash
+ # 1. Start a test container with bash entrypoint (seconds)
+ uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \
+   'docker run -d --name test-fix --entrypoint /bin/bash waa-auto:latest -c "sleep 3600"'
+
+ # 2. Apply your fix manually INSIDE the container (seconds)
+ uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \
+   "docker exec test-fix sed -i 's/old/new/' /some/file.sh"
+
+ # 3. Verify the fix works (seconds)
+ uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \
+   "docker exec test-fix cat /some/file.sh"
+
+ # 4. Test the actual behavior (seconds)
+ uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd \
+   "docker exec test-fix /some/script.sh && ls /expected/output"
+
+ # 5. Cleanup
+ uv run python -m openadapt_ml.benchmarks.cli vm host-exec --cmd 'docker rm -f test-fix'
+
+ # 6. ONLY AFTER fix is verified: Update Dockerfile and rebuild ONCE
+ ```
+
+ **Why this matters**:
+ - Testing a fix takes SECONDS instead of 30+ minutes
+ - Iterate 10x on the fix before committing to a rebuild
+ - Don't lose context waiting for long builds
+ - Each rebuild should be the LAST rebuild, not a guess
+
+ ---
+
  ## Project Overview
 
  openadapt-ml is a model-agnostic, domain-agnostic ML engine for GUI automation agents. It provides:
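The container-first testing loop added above is mechanical enough to script. Below is a minimal sketch that drives the same documented `vm host-exec` CLI invocations from Python via `subprocess`; the shell commands are taken verbatim from the hunk, while the wrapper function itself is illustrative and not part of the package.

```python
import shlex
import subprocess

# Documented CLI entry point (from the hunk above).
CLI = "uv run python -m openadapt_ml.benchmarks.cli"

def host_exec(cmd: str) -> str:
    """Run a command on the VM's Docker host via the documented `vm host-exec`."""
    result = subprocess.run(
        f"{CLI} vm host-exec --cmd {shlex.quote(cmd)}",
        shell=True, capture_output=True, text=True, check=True,
    )
    return result.stdout

# 1. Start a throwaway container with a bash entrypoint (seconds, no rebuild).
host_exec('docker run -d --name test-fix --entrypoint /bin/bash waa-auto:latest -c "sleep 3600"')
try:
    # 2. Apply the candidate fix inside the running container.
    host_exec("docker exec test-fix sed -i 's/old/new/' /some/file.sh")
    # 3. Verify it before committing to a 30+ minute rebuild.
    print(host_exec("docker exec test-fix cat /some/file.sh"))
finally:
    # 4. Always remove the test container.
    host_exec("docker rm -f test-fix")
```

Wrapping the loop this way also guarantees the cleanup step runs even when a verification step fails.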
@@ -11,7 +113,18 @@ openadapt-ml is a model-agnostic, domain-agnostic ML engine for GUI automation a
  - Supervised fine-tuning pipeline
  - Runtime policy API
 
- ## Current Focus: Benchmark Integration
+ ## Current Focus: Demo Retrieval
+
+ **Validated**: Demo-conditioned prompting improves action accuracy (Dec 2024)
+ - Zero-shot: 33% correct first actions
+ - With demo: 100% correct first actions
+ - See `docs/experiments/demo_conditioned_prompting_results.md`
+
+ **Next step**: Build demo retrieval to automatically select relevant demos from a library.
+
+ **Key insight**: OpenAdapt's value is **trajectory-conditioned disambiguation of UI affordances**, not "better reasoning".
+
+ ## Benchmark Integration
 
  **Primary benchmark**: Windows Agent Arena (WAA)
  - 154 tasks across 11 Windows domains
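For readers skimming the diff: "demo-conditioned prompting" here means prepending a recorded trajectory to the task prompt. The package's real formatter lives in `openadapt_ml/experiments/demo_prompt/format_demo.py` (see the file list); the sketch below only illustrates the idea, and every name in it is hypothetical.

```python
from dataclasses import dataclass

@dataclass
class DemoStep:
    """One recorded step: what the screen showed and what the user did."""
    observation: str  # e.g. "Login form with Username and Password fields"
    action: str       # e.g. 'click(element="Username field")'

def build_demo_conditioned_prompt(task: str, demo: list[DemoStep]) -> str:
    """Prepend a recorded demonstration so the model disambiguates UI
    affordances by following the trajectory, not by open-ended reasoning."""
    lines = ["Here is a demonstration of a similar task:"]
    for i, step in enumerate(demo, 1):
        lines.append(f"Step {i}: [{step.observation}] -> {step.action}")
    lines.append(f"\nNow perform this task the same way: {task}")
    return "\n".join(lines)

demo = [
    DemoStep("Login page", 'click(element="Username field")'),
    DemoStep("Username field focused", 'type(text="alice")'),
]
print(build_demo_conditioned_prompt("Log in as bob", demo))
```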
@@ -289,10 +402,13 @@ uv run python -m openadapt_ml.cloud.local serve --open
  - `docs/benchmark_integration_plan.md` - Benchmark integration architecture
  - `docs/azure_waa_setup.md` - Azure WAA setup guide (quota increase, costs, troubleshooting)
  - `docs/design.md` - Overall system design
+ - `docs/experiments/demo_conditioned_prompting_results.md` - Demo experiment results (validated Dec 2024)
  - `openadapt_ml/cloud/` - Cloud GPU providers (Lambda Labs, Azure)
  - `openadapt_ml/benchmarks/` - Benchmark integration module (WAA, base classes)
+ - `openadapt_ml/experiments/demo_prompt/` - Demo-conditioned prompting experiment
  - `openadapt_ml/grounding/` - Grounding module (GeminiGrounder, etc.)
  - `openadapt_ml/ingest/capture.py` - Converts openadapt-capture recordings to Episodes
+ - `scripts/run_demo_experiment.py` - Run demo-conditioned experiment
  - `configs/qwen3vl_synthetic_som.yaml` - SoM training config
 
  ## Code Patterns
@@ -341,13 +457,94 @@ The training dashboard and capture viewer share UI components for visual consist
  - Single source of truth for styling (no duplicate CSS to maintain)
  - Easier to add new dashboards that match existing style
 
+ ## CRITICAL: Always Start Dashboard When Running Azure Resources
+
+ See the ⚠️ MANDATORY section at the TOP of this file. Use:
+ ```bash
+ uv run python -m openadapt_ml.benchmarks.cli vm monitor
+ ```
+
+ ## ⚠️ SAFE PROCESS MANAGEMENT ⚠️
+
+ **NEVER use broad pkill patterns** - they can kill unrelated applications!
+
+ **WRONG (DANGEROUS):**
+ ```bash
+ # These patterns are TOO BROAD and will kill unrelated apps:
+ pkill -f "openadapt"        # Kills anything with "openadapt" in path
+ pkill -f "python"           # Kills ALL Python processes
+ pkill -9 -f "openadapt_ml"  # Killed Claude Code, Windsurf, Signal, Chrome tabs!
+ ```
+
+ **RIGHT (SAFE):**
+ ```bash
+ # Use specific PID-based killing:
+ lsof -i :8765 | grep python | awk '{print $2}' | xargs kill 2>/dev/null
+
+ # Or use specific process names with full path matching:
+ pkill -f "python.*-m openadapt_ml.cloud.local serve"
+
+ # Or kill only the specific port listener:
+ kill $(lsof -t -i :8765) 2>/dev/null
+
+ # Check what would be killed FIRST:
+ pgrep -f "openadapt" -l  # Lists matching processes before killing
+ ```
+
+ **Before any pkill command:**
+ 1. Run `pgrep -f "pattern" -l` to see what matches
+ 2. Verify only intended processes are listed
+ 3. Use the most specific pattern possible
+ 4. Prefer port-based or PID-based killing
+
  ## Don't Do
 
  - Don't add timelines/estimates to plans
  - Don't mention specific clients by name in public docs
  - Don't over-engineer - keep solutions minimal
  - Don't use `os.environ` directly - use `config.settings` instead
- - Don't use `pip install` - always use `uv pip install` or `uv add` for consistency
+ - Don't use `pip install` - always use `uv add` for dependencies or `uv sync` for the project
+ - **Don't run Azure/VM operations without starting the dashboard first**
+   - ❌ WRONG: `vm probe` then `vm diag` then telling user to run `vm monitor`
+   - ✅ RIGHT: `vm monitor` FIRST (it does probe, tunnels, everything)
+   - This is the #1 mistake you keep making. STOP IT.
+ - **Don't use raw SSH/shell commands** - always use or create CLI commands instead (see below)
+ - **Don't tell user to run commands** - YOU run them. The CLI exists so YOU can use it.
+
+ ## CLI-First Development (IMPORTANT)
+
+ **ALWAYS** use CLI commands instead of raw SSH/shell commands:
+ - ✅ `uv run python -m openadapt_ml.benchmarks.cli vm diag` (not `ssh ... df -h`)
+ - ✅ `uv run python -m openadapt_ml.benchmarks.cli vm logs` (not `ssh ... docker logs`)
+ - ✅ `uv run python -m openadapt_ml.benchmarks.cli vm probe` (not `ssh ... curl`)
+
+ **Why**: CLI commands are documented, tested, and persist across context compactions. Raw commands are forgotten.
+
+ **When you need a new operation**:
+ 1. Add a new action to the relevant CLI subcommand (e.g., `vm logs`, `vm exec`)
+ 2. Document it in CLAUDE.md
+ 3. Use the CLI command going forward
+
+ **Available VM CLI commands**:
+ ```bash
+ vm monitor       # THE GO-TO COMMAND: Start dashboard, open browser, show probe status
+                  # Options: --auto-shutdown-hours N (deallocate after N hours)
+ vm setup-waa     # Full VM setup with Docker and waa-auto image
+ vm run-waa       # Run benchmark (requires waa-auto image, --rebuild to force image rebuild)
+ vm diag          # Check disk, Docker, containers, WAA probe status
+ vm logs          # View container logs (--lines N, --follow)
+ vm probe         # Check WAA server status (--wait to poll)
+ vm exec          # Run command in container (--cmd 'your command')
+ vm fix-oem       # Copy OEM files to Samba share (for manual install.bat)
+ vm docker-prune  # Clean Docker images, containers, build cache (free disk space)
+ vm docker-move   # Move Docker/containerd to /mnt via symlinks (147GB space)
+ vm stop-build    # Stop running Docker build and clean build cache
+ vm status        # Azure VM status
+ vm ssh           # Interactive SSH
+ vm deallocate    # Stop VM billing (preserves disk), use -y to skip confirmation
+ vm start         # Start a deallocated VM
+ vm delete        # Delete VM (use -y to skip confirmation)
+ ```
 
  ## TODO / Known Issues
 
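The safe-process-management rules above translate naturally into a check-before-kill helper. A stdlib-only sketch mirroring the `lsof` guidance from the hunk (the helper is illustrative, not packaged code):

```python
import os
import signal
import subprocess

def pids_listening_on(port: int) -> list[int]:
    """PIDs bound to a port, via `lsof -t -i :PORT` (same tool the notes use)."""
    out = subprocess.run(
        ["lsof", "-t", "-i", f":{port}"], capture_output=True, text=True
    )
    return [int(pid) for pid in out.stdout.split()]

def kill_port_listener(port: int, dry_run: bool = True) -> None:
    """Port-scoped alternative to broad pkill: always list before killing."""
    for pid in pids_listening_on(port):
        # Show what would be killed before doing anything destructive.
        name = subprocess.run(
            ["ps", "-p", str(pid), "-o", "comm="], capture_output=True, text=True
        ).stdout.strip()
        print(f"{'would kill' if dry_run else 'killing'} PID {pid} ({name})")
        if not dry_run:
            os.kill(pid, signal.SIGTERM)

kill_port_listener(8765)                  # step 1: inspect the matches
# kill_port_listener(8765, dry_run=False) # step 2: kill only what you saw
```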
@@ -406,6 +603,144 @@ az ml workspace sync-keys -n openadapt-ml -g openadapt-agents
  - [Azure ML Managed Identity ACR Authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-identity-based-service-authentication)
  - [ACR Pull Role Assignment](https://learn.microsoft.com/en-us/azure/container-registry/container-registry-authentication-managed-identity)
 
+ ### Azure WAA Evaluation - Dedicated VM Setup
+ **Status**: WORKING - Custom `waa-auto` Docker image REQUIRED (verified Jan 2026)
+
+ **Problem**: WAA requires running a Windows VM inside Docker (via QEMU). Azure ML managed compute doesn't support nested virtualization.
+
+ **CRITICAL**: The official `windowsarena/winarena:latest` image is **BROKEN**. It uses an outdated `dockurr/windows v0.00` that does NOT auto-download Windows 11. You will get "ISO file not found" errors and the VM will never start.
+
+ **Solution**: The CLI builds a custom `waa-auto` Docker image that:
+ 1. Uses modern `dockurr/windows:latest` (v5.14+) which auto-downloads Windows 11
+ 2. Installs Python 3 and all WAA client dependencies
+ 3. Patches IP addresses for dockurr/windows networking
+
+ **Working Quick Start** (via CLI - fully automated):
+ ```bash
+ # 1. Setup VM with Docker and build waa-auto image (~10 min)
+ uv run python -m openadapt_ml.benchmarks.cli vm setup-waa --api-key $OPENAI_API_KEY
+
+ # 2. Run benchmark (Windows downloads on first run, ~15 min, then ~30 min/20 tasks)
+ uv run python -m openadapt_ml.benchmarks.cli vm run-waa --num-tasks 20
+
+ # 3. Delete VM when done (IMPORTANT: stops billing!)
+ uv run python -m openadapt_ml.benchmarks.cli vm delete
+ ```
+
+ **Diagnostic commands**:
+ ```bash
+ # Check VM disk, Docker, containers, WAA probe status
+ uv run python -m openadapt_ml.benchmarks.cli vm diag
+
+ # Check VM Azure status
+ uv run python -m openadapt_ml.benchmarks.cli vm status
+
+ # SSH into VM for debugging
+ uv run python -m openadapt_ml.benchmarks.cli vm ssh
+
+ # Check if WAA server is ready
+ uv run python -m openadapt_ml.benchmarks.cli vm probe --wait
+
+ # Force rebuild waa-auto if needed
+ uv run python -m openadapt_ml.benchmarks.cli vm run-waa --rebuild --num-tasks 5
+ ```
+
+ **What the CLI does** (via custom `waa-auto` Docker image in `openadapt_ml/benchmarks/waa/Dockerfile`):
+ 1. Uses modern `dockurr/windows:latest` base (auto-downloads Windows 11)
+ 2. Copies `/oem` folder from official WAA image (fixes OEM folder issue)
+ 3. Patches IP addresses (20.20.20.21 → 172.30.0.2)
+ 4. Adds automation commands to Windows FirstLogonCommands:
+    - Disable firewall, sleep, lock screen
+    - **Auto-runs install.bat** to install Python, Chrome, LibreOffice, VSCode, WAA server
+ 5. Installs Python dependencies for benchmark client
+
+ **Fully automated** - no manual VNC login or script execution needed!
+
+ **Key requirements**:
+ 1. **VM Size**: `Standard_D4ds_v5` or larger (nested virtualization required)
+ 2. **Docker storage**: Scripts use `/mnt/WindowsAgentArena/src/win-arena-container/vm/storage`
+ 3. **ISO location**: `src/win-arena-container/vm/image/setup.iso`
+ 4. **API key**: `config.json` in repo root with OPENAI_API_KEY
+ 5. **Valid model name**: Must use real OpenAI model (e.g., `gpt-4o`, `gpt-4o-mini`). Invalid names cause benchmark to hang on API retries.
+
+ **Architecture**:
+ ```
+ Azure VM (Standard_D4ds_v5, nested virt enabled)
+ └── Docker (data on /mnt)
+     └── winarena:latest (built by run-local.sh)
+         └── QEMU running Windows 11 VM (IP: 20.20.20.21)
+             └── WAA Flask server on port 5000
+                 └── Navi agent executing tasks
+ ```
+
+ **Monitor progress**:
+ - VNC: `http://localhost:8006` (via SSH tunnel, auto-managed by dashboard)
+ - Logs: `tail -f /tmp/waa_benchmark.log` (if running via nohup)
+
+ **Files**:
+ - `openadapt_ml/benchmarks/cli.py` - `vm` subcommand with setup-waa, probe
+ - `openadapt_ml/cloud/ssh_tunnel.py` - SSH tunnel manager (auto VNC/WAA tunnels)
+ - `docs/waa_setup.md` - Detailed setup guide
+
+ ### SSH Tunnel Management (VNC/WAA Access)
+ **Status**: DONE
+
+ **Problem**: Azure VMs have Network Security Groups (NSGs) that only expose port 22 (SSH) by default. Ports 8006 (VNC) and 5000 (WAA) are not accessible directly.
+
+ **Solution**: Automatic SSH tunnel management via `SSHTunnelManager`:
+
+ ```
+ Browser → localhost:8006 → SSH Tunnel → Azure VM:8006 → Docker → noVNC
+ Browser → localhost:5000 → SSH Tunnel → Azure VM:5000 → WAA Flask
+ ```
+
+ **Architecture**:
+ 1. When VM's WAA probe becomes "ready", tunnels auto-start
+ 2. When VM goes offline, tunnels auto-stop
+ 3. Dashboard shows tunnel status next to VNC button
+ 4. VNC button links to localhost:port (tunnel endpoint)
+
+ **Files**:
+ - `openadapt_ml/cloud/ssh_tunnel.py` - SSHTunnelManager class
+ - `openadapt_ml/cloud/local.py` - Integration with dashboard server
+ - `openadapt_ml/training/benchmark_viewer.py` - UI showing tunnel status
+
+ **API Endpoints**:
+ - `GET /api/tunnels` - Returns tunnel status for VNC and WAA
+ - `GET /api/vms` - Includes `tunnels` field with per-tunnel status
+
+ **Key features**:
+ - Auto-start on VM online (idempotent - safe to call repeatedly)
+ - Auto-stop on VM offline
+ - Port conflict detection
+ - Graceful shutdown on process exit
+ - No manual SSH commands needed
+
+ **Manual usage** (if needed):
+ ```python
+ from openadapt_ml.cloud.ssh_tunnel import get_tunnel_manager
+
+ manager = get_tunnel_manager()
+ manager.start_tunnels_for_vm("172.171.112.41", "azureuser")
+ status = manager.get_tunnel_status()
+ manager.stop_all_tunnels()
+ ```
+
+ **Why not open NSG ports?**
+ 1. VNC has no authentication by default - anyone can connect
+ 2. SSH tunnel encrypts all traffic
+ 3. Requires SSH key auth - no password guessing
+ 4. No Azure NSG changes needed
+
+ **Alternative: Mock evaluation** for testing without Windows:
+ ```bash
+ uv run python -m openadapt_ml.benchmarks.cli test-mock --tasks 20
+ ```
+
+ **References**:
+ - [Windows Agent Arena GitHub](https://github.com/microsoft/WindowsAgentArena)
+ - [Azure nested virtualization](https://learn.microsoft.com/en-us/azure/virtual-machines/acu)
+
  ### Training Dashboard - Terminal Output Streaming
  **Status**: DONE
 
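For orientation, the forwards that `SSHTunnelManager` automates are plain OpenSSH local port forwards. A stdlib-only sketch of the equivalent manual tunnel, useful when debugging the manager itself (the host IP and ports mirror the examples in the hunk; the helper is illustrative, not part of the package):

```python
import subprocess

def open_tunnels(vm_ip: str, user: str = "azureuser",
                 ports: tuple[int, ...] = (8006, 5000)) -> subprocess.Popen:
    """Forward localhost:8006 (noVNC) and localhost:5000 (WAA Flask) over SSH.

    Equivalent to what SSHTunnelManager sets up automatically.
    """
    args = ["ssh", "-N"]  # -N: forward ports only, run no remote command
    for port in ports:
        args += ["-L", f"{port}:localhost:{port}"]
    args.append(f"{user}@{vm_ip}")
    return subprocess.Popen(args)

tunnel = open_tunnels("172.171.112.41")  # IP reused from the example above
# ...browse http://localhost:8006 for VNC, http://localhost:5000 for WAA...
tunnel.terminate()  # graceful shutdown, as the manager does on process exit
```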
@@ -522,7 +857,7 @@ Verified:
  - Backend flag options: `claude`, `openai` in CLI ✓
 
  ### Benchmark Viewer Integration
- **Status**: Phase 1 DONE, Phases 2-4 TODO
+ **Status**: Phases 1-3 DONE, Phase 4 TODO
 
  **Goal**: Integrate benchmark evaluation results (WAA, WebArena, OSWorld) into the unified viewer.
 
@@ -532,7 +867,7 @@
  1. **Benchmarks tab**: Third tab alongside Training and Viewer
  2. **Task-level view**: List of benchmark tasks with pass/fail status
  3. **Step-by-step replay**: Same UI as Viewer tab for benchmark executions
- 4. **Model comparison**: Side-by-side comparison of different models on same task
+ 4. **Model comparison**: Side-by-side comparison of different models on same task (TODO)
  5. **Aggregate metrics**: Success rate by domain, difficulty rankings
 
  **Implementation phases**:
@@ -543,22 +878,30 @@ Verified:
  - Directory structure: `benchmark_results/{run_name}/tasks/{task_id}/`
  - Each task has: `task.json`, `execution.json`, `screenshots/`
  - Test script: `test_data_collection.py` validates all files are created
- 2. **Viewer backend** (TODO): `generate_benchmark_viewer()` function
- 3. **UI components** (TODO): Summary dashboard, task list, replay
+ 2. **Viewer backend** (DONE): `generate_benchmark_viewer()` function
+    - Created `openadapt_ml/benchmarks/viewer.py` with viewer generation
+    - Added CLI command: `uv run python -m openadapt_ml.benchmarks.cli view --run-name {name}`
+    - Generates standalone HTML with same styling as training viewer
+    - Uses shared header components via `shared_ui.py`
+ 3. ✅ **UI components** (DONE - Basic): Summary dashboard, task list, replay
+    - Summary panel with total tasks, passed/failed, success rate
+    - Domain breakdown with per-domain statistics
+    - Filter controls (domain, status)
+    - Task list with status badges
+    - Step-by-step viewer with screenshots, actions, reasoning
+    - Playback controls (prev/next, play/pause, speed)
+    - Keyboard shortcuts (Space, arrows, Home/End)
  4. **Analysis** (TODO): Failure clustering, regression detection
 
- **Phase 1 verification:**
+ **View benchmark results:**
  ```bash
- # Test data collection
- uv run python -m openadapt_ml.benchmarks.cli test-collection --tasks 5
+ # Generate HTML viewer and serve it
+ uv run python -m openadapt_ml.benchmarks.cli view --run-name {name}
 
- # Verify output
- ls -la benchmark_results/{run_name}/tasks/task_001/
- # Should contain: task.json, execution.json, screenshots/
-
- # Check JSON structure
- cat benchmark_results/{run_name}/summary.json
- cat benchmark_results/{run_name}/tasks/task_001/execution.json
+ # Options:
+ #   --embed-screenshots   Embed screenshots as base64 (standalone HTML)
+ #   --no-open             Don't auto-open browser
+ #   --port 9000           Use custom port
  ```
 
  ## Preventing Stale Data Issues
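The on-disk layout documented above (`benchmark_results/{run_name}/tasks/{task_id}/` with `task.json` and `execution.json`) also supports quick ad-hoc analysis without the HTML viewer. A sketch, with the caveat that the `success` field name is an assumption to verify against a real `execution.json`:

```python
import json
from pathlib import Path

def summarize_run(run_dir: Path) -> None:
    """Tally pass/fail from benchmark_results/{run_name}/tasks/*/execution.json."""
    results = []
    for exec_file in sorted(run_dir.glob("tasks/*/execution.json")):
        data = json.loads(exec_file.read_text())
        # NOTE: "success" is a guessed key; confirm against a real execution.json.
        results.append((exec_file.parent.name, bool(data.get("success"))))
    passed = sum(ok for _, ok in results)
    print(f"{passed}/{len(results)} tasks passed")
    for task_id, ok in results:
        print(f"  {'PASS' if ok else 'FAIL'}  {task_id}")

summarize_run(Path("benchmark_results/my_run"))  # run name is a placeholder
```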
@@ -618,3 +961,23 @@ The viewer should automatically load:
  | Predictions not extracted | HTML uses `window.comparisonData` but regex expects `const` | Use regex `(?:const\s+\|window\.)comparisonData` pattern |
  | Stale data after code change | Browser caching HTML | Hard refresh (Cmd+Shift+R) or disable cache |
  | Screenshots 404 | Screenshot symlink broken | Recreate: `ln -sf /path/to/capture/screenshots training_output/current/screenshots` |
+
+ ### UI/Display Guidelines
+
+ **Placeholder data must be clearly marked** when displaying values that may not reflect actual data:
+ - If task counts, worker counts, etc. come from local tracking (not synced with Azure), mark them with an asterisk: "3* tasks • 1* worker(s)"
+ - Add a footnote: "[*: placeholder, actual values may differ]"
+ - This applies to any data that is locally cached but not confirmed from the authoritative source
+
+ ### Azure ML Integration Notes
+
+ **Experiment ID**: The Azure ML experiments page URL requires an experiment ID which is workspace-specific:
+ - Current hardcoded ID: `ad29082c-0607-4fda-8cc7-38944eb5a518`
+ - **TODO**: Retrieve experiment_id dynamically from Azure using `az ml experiment list`
+ - The experiment name is `openadapt-ml` but the URL requires the UUID format
+
+ **Azure ML URL format**:
+ - Jobs list: `https://ml.azure.com/experiments/id/{experiment_id}?wsid={workspace_id}`
+ - Specific job: `https://ml.azure.com/experiments/id/{experiment_id}/runs/{run_id}?wsid={workspace_id}`
+
+ **WAA Docker command**: Use `python run.py` not `python -m client.run` (the client directory is not a Python package)
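The Azure ML URL templates above are easy to get wrong by hand; a tiny helper keeps them in one place. A sketch using only the documented formats (the workspace and run IDs are placeholders; only the experiment UUID comes from the notes above):

```python
BASE = "https://ml.azure.com/experiments/id"

def jobs_list_url(experiment_id: str, workspace_id: str) -> str:
    """Jobs-list URL, per the documented format."""
    return f"{BASE}/{experiment_id}?wsid={workspace_id}"

def job_url(experiment_id: str, run_id: str, workspace_id: str) -> str:
    """Deep link to a specific run, per the documented format."""
    return f"{BASE}/{experiment_id}/runs/{run_id}?wsid={workspace_id}"

# Experiment UUID is the hardcoded one from the notes; the rest are placeholders.
print(jobs_list_url("ad29082c-0607-4fda-8cc7-38944eb5a518", "<workspace_id>"))
print(job_url("ad29082c-0607-4fda-8cc7-38944eb5a518", "<run_id>", "<workspace_id>"))
```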