openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,6 +3,8 @@
3
3
  This module provides reusable classes for monitoring Windows VMs running WAA.
4
4
  Can be used by the viewer, CLI, or as a standalone tool.
5
5
 
6
+ Enhanced with Azure ML job tracking, cost estimation, and activity detection.
7
+
6
8
  Usage:
7
9
  # Monitor a single VM
8
10
  from openadapt_ml.benchmarks.vm_monitor import VMMonitor, VMConfig
@@ -21,6 +23,14 @@ Usage:
21
23
 
22
24
  # Or run continuous monitoring
23
25
  monitor.run_monitor(callback=lambda s: print(s))
26
+
27
+ # Fetch Azure ML jobs
28
+ jobs = fetch_azure_ml_jobs(days=7)
29
+ print(f"Found {len(jobs)} jobs in last 7 days")
30
+
31
+ # Calculate VM costs
32
+ costs = calculate_vm_costs(vm_size="Standard_D4ds_v5", hours=2.5)
33
+ print(f"Estimated cost: ${costs.cost_usd:.2f}")
24
34
  """
25
35
 
26
36
  from __future__ import annotations
@@ -29,12 +39,15 @@ import json
29
39
  import subprocess
30
40
  import time
31
41
  from dataclasses import dataclass, field, asdict
32
- from datetime import datetime
42
+ from datetime import datetime, timedelta
33
43
  from pathlib import Path
34
44
  from typing import Callable
35
45
  import urllib.request
36
46
  import urllib.error
37
47
  import socket
48
+ import logging
49
+
50
+ logger = logging.getLogger(__name__)
38
51
 
39
52
 
40
53
  @dataclass
@@ -105,9 +118,10 @@ class VMMonitor:
105
118
  self.timeout = timeout
106
119
 
107
120
  def check_vnc(self) -> bool:
108
- """Check if VNC port is reachable."""
121
+ """Check if VNC port is reachable via SSH tunnel (localhost)."""
109
122
  try:
110
- url = f"http://{self.config.ssh_host}:{self.config.vnc_port}/"
123
+ # VNC is only accessible via SSH tunnel at localhost, not the public IP
124
+ url = f"http://localhost:{self.config.vnc_port}/"
111
125
  req = urllib.request.Request(url, method="HEAD")
112
126
  with urllib.request.urlopen(req, timeout=self.timeout):
113
127
  return True
@@ -120,9 +134,12 @@ class VMMonitor:
120
134
  result = subprocess.run(
121
135
  [
122
136
  "ssh",
123
- "-o", "StrictHostKeyChecking=no",
124
- "-o", f"ConnectTimeout={self.timeout}",
125
- "-o", "BatchMode=yes",
137
+ "-o",
138
+ "StrictHostKeyChecking=no",
139
+ "-o",
140
+ f"ConnectTimeout={self.timeout}",
141
+ "-o",
142
+ "BatchMode=yes",
126
143
  f"{self.config.ssh_user}@{self.config.ssh_host}",
127
144
  "echo ok",
128
145
  ],
@@ -145,9 +162,12 @@ class VMMonitor:
145
162
  result = subprocess.run(
146
163
  [
147
164
  "ssh",
148
- "-o", "StrictHostKeyChecking=no",
149
- "-o", f"ConnectTimeout={self.timeout}",
150
- "-o", "BatchMode=yes",
165
+ "-o",
166
+ "StrictHostKeyChecking=no",
167
+ "-o",
168
+ f"ConnectTimeout={self.timeout}",
169
+ "-o",
170
+ "BatchMode=yes",
151
171
  f"{self.config.ssh_user}@{self.config.ssh_host}",
152
172
  cmd,
153
173
  ],
@@ -173,9 +193,12 @@ class VMMonitor:
173
193
  result = subprocess.run(
174
194
  [
175
195
  "ssh",
176
- "-o", "StrictHostKeyChecking=no",
177
- "-o", f"ConnectTimeout={self.timeout}",
178
- "-o", "BatchMode=yes",
196
+ "-o",
197
+ "StrictHostKeyChecking=no",
198
+ "-o",
199
+ f"ConnectTimeout={self.timeout}",
200
+ "-o",
201
+ "BatchMode=yes",
179
202
  f"{self.config.ssh_user}@{self.config.ssh_host}",
180
203
  cmd,
181
204
  ],
@@ -191,9 +214,12 @@ class VMMonitor:
191
214
  log_result = subprocess.run(
192
215
  [
193
216
  "ssh",
194
- "-o", "StrictHostKeyChecking=no",
195
- "-o", f"ConnectTimeout={self.timeout}",
196
- "-o", "BatchMode=yes",
217
+ "-o",
218
+ "StrictHostKeyChecking=no",
219
+ "-o",
220
+ f"ConnectTimeout={self.timeout}",
221
+ "-o",
222
+ "BatchMode=yes",
197
223
  f"{self.config.ssh_user}@{self.config.ssh_host}",
198
224
  log_cmd,
199
225
  ],
@@ -220,9 +246,12 @@ class VMMonitor:
220
246
  result = subprocess.run(
221
247
  [
222
248
  "ssh",
223
- "-o", "StrictHostKeyChecking=no",
224
- "-o", f"ConnectTimeout={self.timeout}",
225
- "-o", "BatchMode=yes",
249
+ "-o",
250
+ "StrictHostKeyChecking=no",
251
+ "-o",
252
+ f"ConnectTimeout={self.timeout}",
253
+ "-o",
254
+ "BatchMode=yes",
226
255
  f"{self.config.ssh_user}@{self.config.ssh_host}",
227
256
  cmd,
228
257
  ],
@@ -233,7 +262,7 @@ class VMMonitor:
233
262
  if result.returncode == 0 and result.stdout.strip():
234
263
  try:
235
264
  bytes_size = int(result.stdout.strip())
236
- return round(bytes_size / (1024 ** 3), 2)
265
+ return round(bytes_size / (1024**3), 2)
237
266
  except ValueError:
238
267
  continue
239
268
  return None
@@ -257,7 +286,9 @@ class VMMonitor:
257
286
 
258
287
  if status.ssh_reachable:
259
288
  # Check container
260
- status.container_running, status.container_logs = self.get_container_status()
289
+ status.container_running, status.container_logs = (
290
+ self.get_container_status()
291
+ )
261
292
 
262
293
  # Check WAA probe
263
294
  status.waa_ready, status.waa_probe_response = self.check_waa_probe()
@@ -412,7 +443,9 @@ class VMPoolRegistry:
412
443
  resource_group=resource_group,
413
444
  location=location,
414
445
  vm_size=vm_size,
415
- workers=[PoolWorker(name=name, ip=ip, status="ready") for name, ip in workers],
446
+ workers=[
447
+ PoolWorker(name=name, ip=ip, status="ready") for name, ip in workers
448
+ ],
416
449
  )
417
450
  self.save()
418
451
  return self._pool
@@ -468,7 +501,9 @@ class VMPoolRegistry:
468
501
  class VMRegistry:
469
502
  """Manage a registry of VMs and their status."""
470
503
 
471
- def __init__(self, registry_file: str | Path = "benchmark_results/vm_registry.json"):
504
+ def __init__(
505
+ self, registry_file: str | Path = "benchmark_results/vm_registry.json"
506
+ ):
472
507
  """Initialize registry.
473
508
 
474
509
  Args:
@@ -546,17 +581,23 @@ def main():
546
581
  parser.add_argument("--host", help="SSH host")
547
582
  parser.add_argument("--user", default="azureuser", help="SSH user")
548
583
  parser.add_argument("--container", default="winarena", help="Docker container name")
549
- parser.add_argument("--interval", type=int, default=30, help="Check interval in seconds")
584
+ parser.add_argument(
585
+ "--interval", type=int, default=30, help="Check interval in seconds"
586
+ )
550
587
  parser.add_argument("--output", help="Output file for status updates (JSON lines)")
551
588
  parser.add_argument("--list", action="store_true", help="List all registered VMs")
552
- parser.add_argument("--check-all", action="store_true", help="Check all registered VMs")
589
+ parser.add_argument(
590
+ "--check-all", action="store_true", help="Check all registered VMs"
591
+ )
553
592
 
554
593
  args = parser.parse_args()
555
594
 
556
595
  if args.list:
557
596
  registry = VMRegistry()
558
597
  for vm in registry.list():
559
- print(f" {vm.name}: {vm.ssh_user}@{vm.ssh_host} (container: {vm.docker_container})")
598
+ print(
599
+ f" {vm.name}: {vm.ssh_user}@{vm.ssh_host} (container: {vm.docker_container})"
600
+ )
560
601
  return
561
602
 
562
603
  if args.check_all:
@@ -586,12 +627,14 @@ def main():
586
627
  ts = datetime.now().strftime("%H:%M:%S")
587
628
  waa_str = "READY!" if status.waa_ready else "not ready"
588
629
  disk_str = f"{status.disk_usage_gb}GB" if status.disk_usage_gb else "?"
589
- print(f"[{ts}] SSH: {'✓' if status.ssh_reachable else '✗'} | "
590
- f"VNC: {'✓' if status.vnc_reachable else '✗'} | "
591
- f"WAA: {waa_str} | Disk: {disk_str}")
630
+ print(
631
+ f"[{ts}] SSH: {'✓' if status.ssh_reachable else '✗'} | "
632
+ f"VNC: {'✓' if status.vnc_reachable else '✗'} | "
633
+ f"WAA: {waa_str} | Disk: {disk_str}"
634
+ )
592
635
  if status.container_logs:
593
636
  # Show last log line
594
- last_line = status.container_logs.split('\n')[-1][:80]
637
+ last_line = status.container_logs.split("\n")[-1][:80]
595
638
  print(f" Log: {last_line}")
596
639
 
597
640
  print(f"Monitoring {args.host}... (Ctrl+C to stop)")
@@ -606,5 +649,463 @@ def main():
606
649
  print("\nMonitoring stopped.")
607
650
 
608
651
 
652
+ # ============================================================================
653
+ # Azure ML Job Tracking
654
+ # ============================================================================
655
+
656
+
657
+ @dataclass
658
+ class AzureMLJob:
659
+ """Represents an Azure ML job."""
660
+
661
+ job_id: str
662
+ display_name: str
663
+ status: str # running, completed, failed, canceled
664
+ created_at: str
665
+ compute_target: str | None = None
666
+ duration_minutes: float | None = None
667
+ cost_usd: float | None = None
668
+ azure_dashboard_url: str | None = None
669
+
670
+
671
def fetch_azure_ml_jobs(
    resource_group: str = "openadapt-agents",
    workspace_name: str = "openadapt-ml",
    days: int = 7,
    max_results: int = 20,
) -> list[AzureMLJob]:
    """Fetch recent Azure ML jobs via ``az ml job list``.

    Jobs whose creation timestamp is older than ``days`` days are dropped.
    Jobs with a missing or unparsable timestamp are kept (best effort) and
    placed after all jobs with a valid timestamp.

    Args:
        resource_group: Azure resource group name.
        workspace_name: Azure ML workspace name.
        days: Number of days to look back.
        max_results: Maximum number of jobs to return.

    Returns:
        List of AzureMLJob objects, sorted by creation time (newest first).
        An empty list is returned if the CLI call fails for any reason; the
        failure is logged rather than raised.
    """
    try:
        result = subprocess.run(
            [
                "az",
                "ml",
                "job",
                "list",
                "--resource-group",
                resource_group,
                "--workspace-name",
                workspace_name,
                "--query",
                "[].{name:name,display_name:display_name,status:status,created_at:creation_context.created_at,compute:compute}",
                "-o",
                "json",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )

        if result.returncode != 0:
            logger.error("Azure CLI error: %s", result.stderr)
            return []

        jobs_raw = json.loads(result.stdout)
        if not isinstance(jobs_raw, list):
            logger.error(
                "Unexpected Azure ML job list payload: %s", type(jobs_raw).__name__
            )
            return []

        # Timezone-aware cutoff so it compares correctly against the
        # offset-carrying timestamps returned by Azure.
        cutoff_date = datetime.now().astimezone() - timedelta(days=days)

        # The subscription lookup spawns a subprocess, so do it once rather
        # than once per job.
        subscription_id = get_azure_subscription_id()
        wsid = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}"

        # (sort key, job) pairs; jobs without a usable timestamp sort last.
        dated_jobs: list[tuple[float, AzureMLJob]] = []

        for job in jobs_raw:
            if not isinstance(job, dict):
                continue

            created_at = job.get("created_at") or ""
            sort_key = float("-inf")
            if created_at:
                try:
                    # Parse ISO format: 2026-01-17T10:30:00Z
                    job_date = datetime.fromisoformat(
                        created_at.replace("Z", "+00:00")
                    ).astimezone()
                except (ValueError, AttributeError, OverflowError, OSError):
                    # If date parsing fails, include the job
                    pass
                else:
                    if job_date < cutoff_date:
                        continue
                    sort_key = job_date.timestamp()

            # A JSON null status must not abort the whole listing.
            status = (job.get("status") or "unknown").lower()

            dashboard_url = (
                f"https://ml.azure.com/runs/{job.get('name', '')}?wsid={wsid}"
            )

            dated_jobs.append(
                (
                    sort_key,
                    AzureMLJob(
                        job_id=job.get("name") or "unknown",
                        display_name=job.get("display_name") or "",
                        status=status,
                        created_at=created_at,
                        compute_target=job.get("compute", None),
                        # The query above does not return timing data, so
                        # the duration is left unset.
                        duration_minutes=None,
                        azure_dashboard_url=dashboard_url,
                    ),
                )
            )

        # Newest first, as documented; the sort is stable so jobs without a
        # timestamp keep their CLI order at the end.
        dated_jobs.sort(key=lambda pair: pair[0], reverse=True)
        return [job for _, job in dated_jobs[: max(max_results, 0)]]

    except Exception as e:
        # Best-effort boundary: missing CLI, timeout, or malformed JSON all
        # degrade to "no jobs" instead of crashing the caller.
        logger.error("Error fetching Azure ML jobs: %s", e)
        return []
762
+
763
+
764
def get_azure_subscription_id() -> str:
    """Get the current Azure subscription ID.

    Runs ``az account show --query id -o tsv`` with a 10 second timeout.

    Returns:
        The subscription ID printed by the CLI, or the placeholder string
        ``"unknown"`` when the CLI fails, cannot be run, or prints nothing.
    """
    try:
        result = subprocess.run(
            ["az", "account", "show", "--query", "id", "-o", "tsv"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        # Deliberately best-effort: a missing CLI or a timeout yields the
        # placeholder instead of an exception.
        return "unknown"

    subscription_id = result.stdout.strip() if result.returncode == 0 else ""
    # A successful call with empty output is still "unknown", never "".
    return subscription_id or "unknown"
778
+
779
+
780
+ # ============================================================================
781
+ # Cost Tracking
782
+ # ============================================================================
783
+
784
+
785
+ @dataclass
786
+ class VMCostEstimate:
787
+ """Estimated costs for VM usage."""
788
+
789
+ vm_size: str
790
+ hourly_rate_usd: float
791
+ hours_elapsed: float
792
+ cost_usd: float
793
+ cost_per_hour_usd: float
794
+ cost_per_day_usd: float
795
+ cost_per_week_usd: float
796
+
797
+
798
+ # Azure VM pricing (US East, as of Jan 2025)
799
+ VM_PRICING = {
800
+ "Standard_D2_v3": 0.096,
801
+ "Standard_D4_v3": 0.192,
802
+ "Standard_D8_v3": 0.384,
803
+ "Standard_D4s_v3": 0.192,
804
+ "Standard_D8s_v3": 0.384,
805
+ "Standard_D4ds_v5": 0.192,
806
+ "Standard_D8ds_v5": 0.384,
807
+ "Standard_D16ds_v5": 0.768,
808
+ "Standard_D32ds_v5": 1.536,
809
+ }
810
+
811
+
812
+ def calculate_vm_costs(
813
+ vm_size: str, hours: float, hourly_rate_override: float | None = None
814
+ ) -> VMCostEstimate:
815
+ """Calculate VM cost estimates.
816
+
817
+ Args:
818
+ vm_size: Azure VM size (e.g., "Standard_D4ds_v5").
819
+ hours: Number of hours the VM has been running.
820
+ hourly_rate_override: Override default hourly rate (for custom pricing).
821
+
822
+ Returns:
823
+ VMCostEstimate with cost breakdown.
824
+ """
825
+ hourly_rate = hourly_rate_override or VM_PRICING.get(vm_size, 0.20)
826
+ cost_usd = hourly_rate * hours
827
+
828
+ return VMCostEstimate(
829
+ vm_size=vm_size,
830
+ hourly_rate_usd=hourly_rate,
831
+ hours_elapsed=hours,
832
+ cost_usd=cost_usd,
833
+ cost_per_hour_usd=hourly_rate,
834
+ cost_per_day_usd=hourly_rate * 24,
835
+ cost_per_week_usd=hourly_rate * 24 * 7,
836
+ )
837
+
838
+
839
def get_vm_uptime_hours(
    resource_group: str, vm_name: str, check_actual_state: bool = True
) -> float:
    """Get VM uptime in hours.

    The power state is read with ``az vm show -d``; the start time is taken
    from the first matching "Start Virtual Machine" / "Create or Update
    Virtual Machine" entry returned by ``az monitor activity-log list``.

    Args:
        resource_group: Azure resource group.
        vm_name: VM name.
        check_actual_state: If True, check if VM is actually running.

    Returns:
        Hours since VM started, or 0 if VM is not running or the Azure CLI
        call fails. If the VM is running but no usable start timestamp can
        be obtained, a fallback of 1.0 is returned.
    """

    def parse_timestamp(value: str) -> datetime:
        """Parse an ISO-8601 timestamp, tolerating 'Z' and long fractions."""
        text = value.strip().replace("Z", "+00:00")
        head, dot, tail = text.partition(".")
        if dot:
            # Older datetime.fromisoformat versions only accept 3 or 6
            # fractional digits, so normalize the fraction to exactly 6.
            digit_count = len(tail) - len(tail.lstrip("0123456789"))
            fraction, suffix = tail[:digit_count], tail[digit_count:]
            text = f"{head}.{fraction[:6].ljust(6, '0')}{suffix}"
        return datetime.fromisoformat(text)

    try:
        # Get the VM power state
        result = subprocess.run(
            [
                "az",
                "vm",
                "show",
                "-d",
                "-g",
                resource_group,
                "-n",
                vm_name,
                "--query",
                "{powerState:powerState}",
                "-o",
                "json",
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )

        if result.returncode != 0:
            return 0.0

        info = json.loads(result.stdout) or {}
        # A JSON null power state is treated like a missing one.
        power_state = info.get("powerState") or ""

        # Check if VM is running
        if check_actual_state and "running" not in power_state.lower():
            return 0.0

        # Try to get activity logs for last start time
        result = subprocess.run(
            [
                "az",
                "monitor",
                "activity-log",
                "list",
                "--resource-group",
                resource_group,
                "--resource-id",
                f"/subscriptions/{get_azure_subscription_id()}/resourceGroups/{resource_group}/providers/Microsoft.Compute/virtualMachines/{vm_name}",
                "--query",
                "[?operationName.localizedValue=='Start Virtual Machine' || operationName.localizedValue=='Create or Update Virtual Machine'].eventTimestamp | [0]",
                "-o",
                "tsv",
            ],
            capture_output=True,
            text=True,
            timeout=15,
        )

        start_time_str = result.stdout.strip()
        if result.returncode == 0 and start_time_str:
            try:
                start_time = parse_timestamp(start_time_str)
            except ValueError:
                # The VM is running but the timestamp is unusable: use the
                # same fallback as when no timestamp is returned at all,
                # rather than reporting the VM as not running.
                logger.debug("Unparsable VM start time: %r", start_time_str)
            else:
                elapsed = datetime.now(start_time.tzinfo) - start_time
                # Clamp so clock skew can never yield negative uptime.
                return max(elapsed.total_seconds() / 3600, 0.0)

        # Fallback: assume started 1 hour ago if we can't determine
        return 1.0

    except Exception as e:
        # Best-effort boundary: CLI missing, timeout, or malformed output.
        logger.debug("Error getting VM uptime: %s", e)
        return 0.0
917
+
918
+
919
+ # ============================================================================
920
+ # VM Activity Detection
921
+ # ============================================================================
922
+
923
+
924
+ @dataclass
925
+ class VMActivity:
926
+ """Current VM activity information."""
927
+
928
+ is_active: bool
929
+ activity_type: str # idle, benchmark_running, training, setup, unknown
930
+ description: str
931
+ benchmark_progress: dict | None = None # If benchmark is running
932
+ last_action_time: str | None = None
933
+
934
+
935
def detect_vm_activity(
    ip: str,
    ssh_user: str = "azureuser",
    docker_container: str = "winarena",
    internal_ip: str = "localhost",  # WAA server bound to localhost via Docker port forward
) -> VMActivity:
    """Detect what the VM is currently doing.

    Runs up to three commands on the VM over SSH: ``docker ps`` to see
    whether the container exists, ``curl`` against the WAA ``/probe``
    endpoint on port 5000, and ``pgrep`` inside the container to look for a
    running Python process.

    Args:
        ip: VM IP address.
        ssh_user: SSH username.
        docker_container: Docker container name.
        internal_ip: Internal IP for WAA server.

    Returns:
        VMActivity with current activity information. Failures (including
        an unreachable SSH endpoint) are reported as ``activity_type``
        ``"unknown"`` rather than raised.
    """

    def run_remote(command: str) -> subprocess.CompletedProcess:
        """Run ``command`` on the VM over non-interactive SSH."""
        return subprocess.run(
            [
                "ssh",
                "-o",
                "StrictHostKeyChecking=no",
                "-o",
                "ConnectTimeout=5",
                # Never wait on a password/passphrase prompt; matches the
                # SSH options used by VMMonitor in this module.
                "-o",
                "BatchMode=yes",
                f"{ssh_user}@{ip}",
                command,
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )

    try:
        # Check if container is running
        container = run_remote(f"docker ps -q -f name={docker_container}")

        # ssh itself exits with 255 when the connection fails; that says
        # nothing about the container, so do not report the VM as idle.
        if container.returncode == 255:
            return VMActivity(
                is_active=False,
                activity_type="unknown",
                description="SSH connection failed",
            )

        if container.returncode != 0 or not container.stdout.strip():
            return VMActivity(
                is_active=False,
                activity_type="idle",
                description="Container not running",
            )

        # Check WAA probe for benchmark status
        probe = run_remote(
            f"curl -s --connect-timeout 3 http://{internal_ip}:5000/probe"
        )
        probe_response = probe.stdout.strip()

        if probe.returncode != 0 or not probe_response:
            # Container running but WAA not ready
            return VMActivity(
                is_active=True,
                activity_type="setup",
                description="Windows VM booting or WAA initializing",
            )

        try:
            probe_data = json.loads(probe_response)
        except json.JSONDecodeError:
            # Got response but not JSON - maybe setup phase
            return VMActivity(
                is_active=True,
                activity_type="setup",
                description="WAA starting up",
            )

        # WAA is ready and responsive - check if benchmark is actually running
        # by looking for python processes (Navi agent or our client)
        python_check = run_remote(
            f"docker exec {docker_container} pgrep -f 'python.*run' 2>/dev/null | head -1"
        )
        is_running = bool(python_check.stdout.strip())

        return VMActivity(
            is_active=is_running,
            activity_type="benchmark_running" if is_running else "idle",
            description="WAA benchmark running" if is_running else "WAA ready - idle",
            benchmark_progress=probe_data,
        )

    except Exception as e:
        # Best-effort boundary: timeouts or a missing ssh binary end up here.
        logger.debug("Error detecting VM activity: %s", e)
        return VMActivity(
            is_active=False,
            activity_type="unknown",
            description=f"Error checking activity: {str(e)[:100]}",
        )
1044
+
1045
+
1046
+ # ============================================================================
1047
+ # Evaluation History
1048
+ # ============================================================================
1049
+
1050
+
1051
+ @dataclass
1052
+ class EvaluationRun:
1053
+ """Historical evaluation run."""
1054
+
1055
+ run_id: str
1056
+ started_at: str
1057
+ completed_at: str | None
1058
+ num_tasks: int
1059
+ success_rate: float | None
1060
+ agent_type: str
1061
+ status: str # running, completed, failed
1062
+
1063
+
1064
+ def get_evaluation_history(
1065
+ results_dir: Path | str = "benchmark_results", max_runs: int = 10
1066
+ ) -> list[EvaluationRun]:
1067
+ """Get history of evaluation runs from results directory.
1068
+
1069
+ Args:
1070
+ results_dir: Path to benchmark results directory.
1071
+ max_runs: Maximum number of runs to return.
1072
+
1073
+ Returns:
1074
+ List of EvaluationRun objects, sorted by start time (newest first).
1075
+ """
1076
+ results_path = Path(results_dir)
1077
+ if not results_path.exists():
1078
+ return []
1079
+
1080
+ runs = []
1081
+
1082
+ # Look for run directories or result files
1083
+ for item in sorted(results_path.iterdir(), reverse=True):
1084
+ if item.is_dir():
1085
+ # Check for summary.json or similar
1086
+ summary_file = item / "summary.json"
1087
+ if summary_file.exists():
1088
+ try:
1089
+ summary = json.loads(summary_file.read_text())
1090
+ runs.append(
1091
+ EvaluationRun(
1092
+ run_id=item.name,
1093
+ started_at=summary.get("started_at", "unknown"),
1094
+ completed_at=summary.get("completed_at", None),
1095
+ num_tasks=summary.get("num_tasks", 0),
1096
+ success_rate=summary.get("success_rate", None),
1097
+ agent_type=summary.get("agent_type", "unknown"),
1098
+ status=summary.get("status", "completed"),
1099
+ )
1100
+ )
1101
+ except (json.JSONDecodeError, KeyError):
1102
+ continue
1103
+
1104
+ if len(runs) >= max_runs:
1105
+ break
1106
+
1107
+ return runs
1108
+
1109
+
609
1110
  if __name__ == "__main__":
610
1111
  main()