decodingtrust-agent-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +30 -0
- agent/claudesdk/__init__.py +8 -0
- agent/claudesdk/example.py +221 -0
- agent/claudesdk/src/__init__.py +8 -0
- agent/claudesdk/src/agent.py +400 -0
- agent/claudesdk/src/mcp_proxy.py +409 -0
- agent/claudesdk/src/utils.py +420 -0
- agent/googleadk/__init__.py +15 -0
- agent/googleadk/example.py +237 -0
- agent/googleadk/src/__init__.py +12 -0
- agent/googleadk/src/agent.py +401 -0
- agent/googleadk/src/mcp_wrapper.py +163 -0
- agent/googleadk/src/utils.py +602 -0
- agent/langchain/__init__.py +8 -0
- agent/langchain/example.py +213 -0
- agent/langchain/src/__init__.py +8 -0
- agent/langchain/src/agent.py +645 -0
- agent/langchain/src/utils.py +433 -0
- agent/openaisdk/__init__.py +17 -0
- agent/openaisdk/example.py +228 -0
- agent/openaisdk/src/__init__.py +12 -0
- agent/openaisdk/src/agent.py +491 -0
- agent/openaisdk/src/agent_wrapper.py +143 -0
- agent/openaisdk/src/mcp_wrapper.py +395 -0
- agent/openaisdk/src/utils.py +493 -0
- agent/openclaw/__init__.py +10 -0
- agent/openclaw/example.py +251 -0
- agent/openclaw/src/__init__.py +14 -0
- agent/openclaw/src/agent.py +930 -0
- agent/openclaw/src/helpers/__init__.py +1 -0
- agent/openclaw/src/helpers/auth_helpers.py +55 -0
- agent/openclaw/src/mcp_proxy.py +564 -0
- agent/openclaw/src/plugin_generator.py +231 -0
- agent/openclaw/src/utils.py +341 -0
- agent/pocketflow/__init__.py +18 -0
- agent/pocketflow/example.py +221 -0
- agent/pocketflow/prompts/react_agent.py +46 -0
- agent/pocketflow/src/__init__.py +6 -0
- agent/pocketflow/src/agent.py +507 -0
- agent/pocketflow/src/agent_wrapper.py +159 -0
- agent/pocketflow/src/async_helper.py +92 -0
- agent/pocketflow/src/mcp_react_agent.py +279 -0
- agent/pocketflow/src/native_agent.py +74 -0
- agent/pocketflow/src/nodes.py +467 -0
- benchmark/__init__.py +0 -0
- benchmark/browser/benign.jsonl +34 -0
- benchmark/browser/direct.jsonl +85 -0
- benchmark/browser/indirect.jsonl +82 -0
- benchmark/code/benign.jsonl +0 -0
- benchmark/code/direct.jsonl +121 -0
- benchmark/code/indirect.jsonl +165 -0
- benchmark/crm/benign.jsonl +165 -0
- benchmark/crm/direct.jsonl +90 -0
- benchmark/crm/indirect.jsonl +150 -0
- benchmark/customer-service/benign.jsonl +160 -0
- benchmark/customer-service/direct.jsonl +100 -0
- benchmark/customer-service/indirect.jsonl +101 -0
- benchmark/finance/benign.jsonl +0 -0
- benchmark/finance/direct.jsonl +200 -0
- benchmark/finance/indirect.jsonl +200 -0
- benchmark/legal/benign.jsonl +0 -0
- benchmark/legal/direct.jsonl +200 -0
- benchmark/legal/indirect.jsonl +200 -0
- benchmark/macos/benign.jsonl +30 -0
- benchmark/macos/direct.jsonl +50 -0
- benchmark/macos/indirect.jsonl +50 -0
- benchmark/medical/benign.jsonl +642 -0
- benchmark/medical/direct.jsonl +229 -0
- benchmark/medical/indirect.jsonl +222 -0
- benchmark/os-filesystem/benign.jsonl +200 -0
- benchmark/os-filesystem/direct.jsonl +200 -0
- benchmark/os-filesystem/indirect.jsonl +200 -0
- benchmark/research/benign.jsonl +0 -0
- benchmark/research/direct.jsonl +119 -0
- benchmark/research/indirect.jsonl +125 -0
- benchmark/telecom/benign.jsonl +120 -0
- benchmark/telecom/direct.jsonl +161 -0
- benchmark/telecom/indirect.jsonl +166 -0
- benchmark/travel/benign.jsonl +130 -0
- benchmark/travel/direct.jsonl +105 -0
- benchmark/travel/indirect.jsonl +120 -0
- benchmark/windows/benign.jsonl +100 -0
- benchmark/windows/direct.jsonl +140 -0
- benchmark/windows/indirect.jsonl +107 -0
- benchmark/workflow/benign.jsonl +335 -0
- benchmark/workflow/direct.jsonl +78 -0
- benchmark/workflow/indirect.jsonl +107 -0
- cli/__init__.py +5 -0
- cli/main.py +182 -0
- cli/scaffold.py +334 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
- dt_arena/config/env.yaml +515 -0
- dt_arena/config/injection_mcp.yaml +430 -0
- dt_arena/config/mcp.yaml +642 -0
- dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
- dt_arena/envs/arxiv/docker-compose.yml +36 -0
- dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
- dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
- dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
- dt_arena/envs/atlassian/docker-compose.yml +72 -0
- dt_arena/envs/bigquery/docker-compose.yml +20 -0
- dt_arena/envs/booking/docker-compose.yml +59 -0
- dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
- dt_arena/envs/calendar/docker-compose.yml +42 -0
- dt_arena/envs/custom-website/docker-compose.yml +6 -0
- dt_arena/envs/customer_service/docker-compose.yml +59 -0
- dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
- dt_arena/envs/databricks/docker-compose.yml +51 -0
- dt_arena/envs/ecommerce/docker-compose.yml +6 -0
- dt_arena/envs/ers/docker-compose.yml +36 -0
- dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
- dt_arena/envs/finance/docker-compose.yml +23 -0
- dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
- dt_arena/envs/github/docker/docker-compose.yml +50 -0
- dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
- dt_arena/envs/gmail/docker-compose.yml +65 -0
- dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
- dt_arena/envs/google-form/docker-compose.yml +41 -0
- dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
- dt_arena/envs/googledocs/docker-compose.yml +78 -0
- dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
- dt_arena/envs/hospital/docker-compose.yml +27 -0
- dt_arena/envs/legal/docker-compose.yml +22 -0
- dt_arena/envs/linkedin/docker-compose.yml +63 -0
- dt_arena/envs/macos/docker-compose.yml +79 -0
- dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
- dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
- dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
- dt_arena/envs/paypal/docker-compose.yml +63 -0
- dt_arena/envs/research/docker-compose-hub.yml +13 -0
- dt_arena/envs/research/docker-compose.yml +24 -0
- dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
- dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
- dt_arena/envs/slack/docker-compose-hub.yml +28 -0
- dt_arena/envs/slack/docker-compose.yml +41 -0
- dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
- dt_arena/envs/snowflake/docker-compose.yml +44 -0
- dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
- dt_arena/envs/telecom/docker-compose.yml +17 -0
- dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
- dt_arena/envs/telegram/docker-compose.yml +62 -0
- dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
- dt_arena/envs/terminal/docker-compose.yml +26 -0
- dt_arena/envs/travel/docker-compose-hub.yml +19 -0
- dt_arena/envs/travel/docker-compose.yml +19 -0
- dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
- dt_arena/envs/whatsapp/docker-compose.yml +78 -0
- dt_arena/envs/windows/docker-compose.yml +71 -0
- dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
- dt_arena/envs/zoom/docker-compose.yml +40 -0
- dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
- dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
- dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
- dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
- dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
- dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
- dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
- dt_arena/injection_mcp_server/github/env_injection.py +206 -0
- dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
- dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
- dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
- dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
- dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
- dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
- dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
- dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
- dt_arena/injection_mcp_server/research/env_injection.py +616 -0
- dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
- dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
- dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
- dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
- dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
- dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
- dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
- dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
- dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
- dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
- dt_arena/mcp_server/atlassian/main.py +1554 -0
- dt_arena/mcp_server/atlassian/test_server.py +66 -0
- dt_arena/mcp_server/bigquery/main.py +333 -0
- dt_arena/mcp_server/booking/main.py +310 -0
- dt_arena/mcp_server/browser/main.py +1741 -0
- dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
- dt_arena/mcp_server/calendar/main.py +792 -0
- dt_arena/mcp_server/calendar/test_mcp.py +135 -0
- dt_arena/mcp_server/customer_service/main.py +1063 -0
- dt_arena/mcp_server/databricks/main.py +566 -0
- dt_arena/mcp_server/databricks/probe.py +102 -0
- dt_arena/mcp_server/ers/main.py +845 -0
- dt_arena/mcp_server/finance/__init__.py +87 -0
- dt_arena/mcp_server/finance/core/__init__.py +12 -0
- dt_arena/mcp_server/finance/core/data_loader.py +558 -0
- dt_arena/mcp_server/finance/core/portfolio.py +565 -0
- dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
- dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
- dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
- dt_arena/mcp_server/finance/injection/__init__.py +66 -0
- dt_arena/mcp_server/finance/injection/config.py +176 -0
- dt_arena/mcp_server/finance/injection/content.py +755 -0
- dt_arena/mcp_server/finance/injection/html.py +409 -0
- dt_arena/mcp_server/finance/injection/locations.py +167 -0
- dt_arena/mcp_server/finance/injection/methods.py +193 -0
- dt_arena/mcp_server/finance/injection/presets.py +1023 -0
- dt_arena/mcp_server/finance/main.py +361 -0
- dt_arena/mcp_server/finance/run_mcp.py +21 -0
- dt_arena/mcp_server/finance/run_web.py +26 -0
- dt_arena/mcp_server/finance/server/__init__.py +41 -0
- dt_arena/mcp_server/finance/server/extractor.py +1453 -0
- dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
- dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
- dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
- dt_arena/mcp_server/finance/server/mcp.py +451 -0
- dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
- dt_arena/mcp_server/finance/server/tools/account.py +88 -0
- dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
- dt_arena/mcp_server/finance/server/tools/social.py +73 -0
- dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
- dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
- dt_arena/mcp_server/finance/server/web.py +2139 -0
- dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
- dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
- dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
- dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
- dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
- dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
- dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
- dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
- dt_arena/mcp_server/github/main.py +441 -0
- dt_arena/mcp_server/gmail/main.py +1004 -0
- dt_arena/mcp_server/google_form/main.py +141 -0
- dt_arena/mcp_server/googledocs/main.py +458 -0
- dt_arena/mcp_server/hospital/mcp_server.py +458 -0
- dt_arena/mcp_server/legal/__init__.py +9 -0
- dt_arena/mcp_server/legal/core/__init__.py +14 -0
- dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
- dt_arena/mcp_server/legal/core/data_loader.py +266 -0
- dt_arena/mcp_server/legal/core/document_store.py +197 -0
- dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
- dt_arena/mcp_server/legal/main.py +89 -0
- dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
- dt_arena/mcp_server/legal/server/__init__.py +14 -0
- dt_arena/mcp_server/legal/server/mcp.py +2330 -0
- dt_arena/mcp_server/macos/client_test.py +270 -0
- dt_arena/mcp_server/macos/mcp_server.py +285 -0
- dt_arena/mcp_server/os-filesystem/main.py +1380 -0
- dt_arena/mcp_server/paypal/main.py +501 -0
- dt_arena/mcp_server/research/main.py +777 -0
- dt_arena/mcp_server/salesforce/main.py +2006 -0
- dt_arena/mcp_server/slack/main.py +318 -0
- dt_arena/mcp_server/snowflake/main.py +612 -0
- dt_arena/mcp_server/snowflake/probe.py +183 -0
- dt_arena/mcp_server/telecom/mcp_client.py +423 -0
- dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
- dt_arena/mcp_server/telegram/main.py +338 -0
- dt_arena/mcp_server/terminal/main.py +163 -0
- dt_arena/mcp_server/travel/client_test.py +16 -0
- dt_arena/mcp_server/travel/mcp_server.py +404 -0
- dt_arena/mcp_server/whatsapp/main.py +318 -0
- dt_arena/mcp_server/windows/client_test.py +270 -0
- dt_arena/mcp_server/windows/mcp_server.py +218 -0
- dt_arena/mcp_server/zoom/main.py +466 -0
- dt_arena/src/__init__.py +0 -0
- dt_arena/src/hooks/__init__.py +0 -0
- dt_arena/src/hooks/audit_log.py +30 -0
- dt_arena/src/hooks/hooks.json +3 -0
- dt_arena/src/run_benign.py +142 -0
- dt_arena/src/types/__init__.py +0 -0
- dt_arena/src/types/agent.py +441 -0
- dt_arena/src/types/attacks.py +2 -0
- dt_arena/src/types/environment.py +2 -0
- dt_arena/src/types/hooks.py +174 -0
- dt_arena/src/types/judge.py +52 -0
- dt_arena/src/types/red_teaming_trajectory.py +385 -0
- dt_arena/src/types/task.py +260 -0
- dt_arena/src/types/trajectory.py +315 -0
- dt_arena/utils/__init__.py +1 -0
- dt_arena/utils/atlassian/__init__.py +27 -0
- dt_arena/utils/atlassian/helpers.py +520 -0
- dt_arena/utils/bigquery/__init__.py +1 -0
- dt_arena/utils/bigquery/helpers.py +246 -0
- dt_arena/utils/calendar/__init__.py +1 -0
- dt_arena/utils/calendar/helpers.py +87 -0
- dt_arena/utils/customer_service/__init__.py +17 -0
- dt_arena/utils/customer_service/cs_env_client.py +940 -0
- dt_arena/utils/customer_service/helpers.py +339 -0
- dt_arena/utils/customer_service/judges/__init__.py +20 -0
- dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
- dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
- dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
- dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
- dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
- dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
- dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
- dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
- dt_arena/utils/customer_service/judges/text_utils.py +21 -0
- dt_arena/utils/databricks/__init__.py +2 -0
- dt_arena/utils/databricks/helpers.py +210 -0
- dt_arena/utils/finance/__init__.py +0 -0
- dt_arena/utils/finance/helpers.py +263 -0
- dt_arena/utils/github/__init__.py +1 -0
- dt_arena/utils/github/helpers.py +249 -0
- dt_arena/utils/gmail/__init__.py +1 -0
- dt_arena/utils/gmail/helpers.py +344 -0
- dt_arena/utils/google_form/__init__.py +2 -0
- dt_arena/utils/google_form/helpers.py +133 -0
- dt_arena/utils/legal/__init__.py +0 -0
- dt_arena/utils/legal/helpers.py +228 -0
- dt_arena/utils/macos/__init__.py +0 -0
- dt_arena/utils/macos/env_setup.py +215 -0
- dt_arena/utils/macos/helpers.py +61 -0
- dt_arena/utils/os_filesystem/__init__.py +1 -0
- dt_arena/utils/os_filesystem/helpers.py +366 -0
- dt_arena/utils/paypal/__init__.py +1 -0
- dt_arena/utils/paypal/helpers.py +178 -0
- dt_arena/utils/port_allocator.py +266 -0
- dt_arena/utils/research/__init__.py +0 -0
- dt_arena/utils/research/helpers.py +251 -0
- dt_arena/utils/salesforce/__init__.py +1 -0
- dt_arena/utils/salesforce/helpers.py +719 -0
- dt_arena/utils/slack/__init__.py +1 -0
- dt_arena/utils/slack/helpers.py +176 -0
- dt_arena/utils/snowflake/__init__.py +1 -0
- dt_arena/utils/snowflake/helpers.py +166 -0
- dt_arena/utils/telecom/__init__.py +1 -0
- dt_arena/utils/telecom/helpers.py +760 -0
- dt_arena/utils/telegram/__init__.py +0 -0
- dt_arena/utils/telegram/helpers.py +174 -0
- dt_arena/utils/terminal/__init__.py +0 -0
- dt_arena/utils/terminal/helpers.py +20 -0
- dt_arena/utils/travel/__init__.py +0 -0
- dt_arena/utils/travel/env_client.py +537 -0
- dt_arena/utils/travel/llm_judge.py +137 -0
- dt_arena/utils/travel/prompts.py +64 -0
- dt_arena/utils/utils/__init__.py +122 -0
- dt_arena/utils/whatsapp/__init__.py +0 -0
- dt_arena/utils/whatsapp/helpers.py +226 -0
- dt_arena/utils/windows/__init__.py +0 -0
- dt_arena/utils/windows/env_reset.py +224 -0
- dt_arena/utils/windows/env_setup.py +280 -0
- dt_arena/utils/windows/exfil_helpers.py +170 -0
- dt_arena/utils/windows/helpers.py +74 -0
- dt_arena/utils/zoom/__init__.py +1 -0
- dt_arena/utils/zoom/helpers.py +70 -0
- eval/__init__.py +1 -0
- eval/evaluation.py +426 -0
- eval/task_runner.py +449 -0
- utils/__init__.py +148 -0
- utils/agent_helpers.py +308 -0
- utils/agent_wrapper.py +189 -0
- utils/compose_utils.py +135 -0
- utils/config.py +77 -0
- utils/env_helpers.py +104 -0
- utils/eval_stats.py +88 -0
- utils/injection_helpers.py +429 -0
- utils/injection_mcp_helpers.py +152 -0
- utils/judge_helpers.py +181 -0
- utils/judge_utils.py +472 -0
- utils/llm.py +196 -0
- utils/logging.py +45 -0
- utils/mcp_helpers.py +232 -0
- utils/mcp_manager.py +235 -0
- utils/memory_guard.py +18 -0
- utils/red_teaming_sandbox.py +476 -0
- utils/reset_helpers.py +318 -0
- utils/resource_manager.py +370 -0
- utils/skill_helpers.py +447 -0
- utils/task_executor.py +904 -0
- utils/task_helpers.py +270 -0
- utils/template_helpers.py +179 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Helper functions for interacting with local Google Form sandbox API."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import http.client
|
|
6
|
+
import time
|
|
7
|
+
import urllib.parse as urlparse
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Tuple, Optional
|
|
10
|
+
|
|
11
|
+
import yaml
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _get_registry() -> dict:
    """Load the environment registry YAML, or return an empty dict.

    The registry is looked up at ``<root>/dt_arena/envs/registry.yaml`` where
    ``<root>`` is ``parents[3]`` of this file's resolved path.  A missing
    file, a file that cannot be read or parsed, and an empty document all
    yield ``{}``.
    """
    project_root = Path(__file__).resolve().parents[3]
    registry_path = project_root / "dt_arena" / "envs" / "registry.yaml"
    if not registry_path.exists():
        return {}
    try:
        loaded = yaml.safe_load(registry_path.read_text())
    except Exception:
        # Best-effort lookup: any read/parse failure means "no registry".
        return {}
    return loaded or {}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _get_form_host_port(base_url: Optional[str] = None) -> Tuple[str, int]:
    """Resolve the ``(host, port)`` of the Google Form sandbox API.

    Resolution order:
      1. ``base_url`` when given (non-empty).
      2. ``127.0.0.1`` plus the ``GOOGLE_FORM_API_PORT`` environment variable.
      3. ``services.google-form.api_base_url`` from the registry, falling
         back to ``http://127.0.0.1:8054``.

    When a URL carries no explicit port, 443 is used for ``https`` and 80
    otherwise; a missing hostname becomes ``127.0.0.1``.
    """
    import os

    def _host_port_of(url: str) -> Tuple[str, int]:
        # Shared URL -> (host, port) logic for both URL-based branches.
        pieces = urlparse.urlparse(url)
        hostname = pieces.hostname or "127.0.0.1"
        port_number = pieces.port
        if not port_number:
            port_number = 443 if (pieces.scheme or "http") == "https" else 80
        return hostname, int(port_number)

    if base_url:
        return _host_port_of(base_url)

    port_override = os.environ.get("GOOGLE_FORM_API_PORT")
    if port_override:
        return "127.0.0.1", int(port_override)

    services = _get_registry().get("services") or {}
    form_entry = services.get("google-form") or {}
    return _host_port_of(form_entry.get("api_base_url", "http://127.0.0.1:8054"))
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _request_json(
    method: str,
    path: str,
    *,
    token: Optional[str] = None,
    body: Optional[Dict[str, Any]] = None,
    base_url: Optional[str] = None,
) -> Any:
    """Send one HTTP request to the Google Form sandbox and decode the JSON reply.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: Request path sent to the server, e.g. ``"/form"``.
        token: Optional bearer token; sent as ``Authorization`` when truthy.
        body: Optional JSON-serialisable payload; sent as the request body.
        base_url: Optional API base URL used to pick host, port and transport.

    Returns:
        The decoded JSON value on success; ``{}`` when the response body is
        empty or is not valid JSON; or an error dict
        ``{"error": True, "status": <int>, "message": <str>}`` where status
        is ``0`` for transport/serialisation failures and the HTTP status for
        responses >= 400.
    """
    host, port = _get_form_host_port(base_url=base_url)
    headers = {"Accept": "application/json"}
    if body is not None:
        headers["Content-Type"] = "application/json"
    # Google Form sandbox currently does not enforce auth on endpoints,
    # but keep token support for forward compatibility.
    if token:
        headers["Authorization"] = f"Bearer {token}"

    # An explicit https base_url resolves to port 443 by default, so it must
    # be spoken to over TLS rather than plain HTTP.
    # NOTE(review): a registry-provided https URL is not detected here because
    # only (host, port) is returned by _get_form_host_port.
    use_tls = bool(base_url) and urlparse.urlparse(base_url).scheme == "https"
    if use_tls:
        conn = http.client.HTTPSConnection(host, port, timeout=10)
    else:
        conn = http.client.HTTPConnection(host, port, timeout=10)
    try:
        conn.request(
            method,
            path,
            # Serialised inside the try so a non-serialisable body is reported
            # as an error dict instead of raising.
            body=json.dumps(body) if body is not None else None,
            headers=headers,
        )
        resp = conn.getresponse()
        raw = resp.read()
    except Exception as e:
        return {"error": True, "status": 0, "message": str(e)}
    finally:
        conn.close()

    if resp.status >= 400:
        return {"error": True, "status": resp.status, "message": raw.decode("utf-8", errors="ignore")}
    if not raw:
        return {}
    try:
        return json.loads(raw.decode("utf-8"))
    except Exception:
        # Deliberate best-effort: an undecodable body is treated as empty.
        return {}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def get_form(token: str, base_url: Optional[str] = None) -> Dict[str, Any]:
    """Get current form schema.

    Polls ``GET /form`` up to four times, 0.4s apart, and returns the first
    non-empty dict response that carries no truthy ``"error"`` key.

    Args:
        token: Bearer token forwarded to the request helper.
        base_url: Optional API base URL override.

    Returns:
        The form schema dict on success; otherwise the last dict response
        seen (possibly an error dict), or ``{}`` if none was a dict.
    """
    attempts = 4
    last: Dict[str, Any] = {}
    # Avoid false negatives caused by startup/order races right after reset/set_form.
    for attempt in range(attempts):
        result = _request_json("GET", "/form", token=token, base_url=base_url)
        if isinstance(result, dict):
            last = result
            if result and not result.get("error"):
                return result
        # Only wait when another attempt follows; sleeping after the final
        # attempt would just delay returning the failure.
        if attempt < attempts - 1:
            time.sleep(0.4)
    return last
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def list_submissions(
    token: str,
    base_url: Optional[str] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """List latest form submissions.

    Polls ``GET /submissions`` up to four times, 0.4s apart, stopping at the
    first non-empty list response.

    Args:
        token: Bearer token forwarded to the request helper.
        base_url: Optional API base URL override.
        limit: Maximum number of rows considered; clamped to ``1..2000``.

    Returns:
        The dict rows among the first ``limit`` (clamped) entries of the last
        response, or ``[]`` when that response is not a list.
    """
    attempts = 4
    result: Any = []
    # Submissions can appear shortly after submit() in a different request cycle.
    for attempt in range(attempts):
        result = _request_json("GET", "/submissions", token=token, base_url=base_url)
        if isinstance(result, list) and result:
            break
        # Error dicts and empty lists are both retried after the same delay;
        # skip the delay once no further attempt will be made.
        if attempt < attempts - 1:
            time.sleep(0.4)
    if not isinstance(result, list):
        return []
    max_n = max(1, min(2000, limit))
    return [row for row in result[:max_n] if isinstance(row, dict)]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def submit(
    payload: Dict[str, Any],
    token: str,
    base_url: Optional[str] = None,
) -> Dict[str, Any]:
    """Submit one form response.

    POSTs ``payload`` as JSON to ``/submit`` and returns the decoded reply
    when it is a dict; any other reply shape is normalised to ``{}``.
    """
    response = _request_json("POST", "/submit", token=token, body=payload, base_url=base_url)
    if not isinstance(response, dict):
        return {}
    return response
|
|
133
|
+
|
|
File without changes
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified client for Legal environment API.
|
|
3
|
+
Used by judges to query action logs, matter state, drafts, and filings.
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import requests
|
|
7
|
+
from typing import Optional, Dict, List, Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LegalEnvClient:
    """Client for interacting with Legal environment server."""

    def __init__(self, host: Optional[str] = None, port: Optional[str] = None):
        """Initialize the client with host and port.

        Args:
            host: Server host (defaults to LEGAL_HOST env var or 'localhost')
            port: Server port (defaults to LEGAL_WEB_PORT env var or '8003')
        """
        self.host = host or os.getenv("LEGAL_HOST", "localhost")
        self.port = port or os.getenv("LEGAL_WEB_PORT", "8003")
        self.base_url = f"http://{self.host}:{self.port}"

    # ===== Internal helpers =====

    def _get_result_list(self, endpoint: str, label: str) -> List[Dict[str, Any]]:
        """GET ``endpoint`` and return its ``result`` list.

        Args:
            endpoint: Path appended to ``base_url`` (e.g. '/api/drafts').
            label: Human-readable name used in the printed error message.

        Returns:
            The ``result`` value when the JSON body has ``status == "success"``;
            otherwise an empty list. Request/decoding errors are printed and
            also yield an empty list.
        """
        try:
            resp = requests.get(f"{self.base_url}{endpoint}", timeout=10)
            data = resp.json()
            if data.get("status") == "success":
                return data.get("result", [])
        except Exception as e:
            print(f"[EnvClient] Error getting {label}: {e}")
        return []

    # ===== Action Log APIs =====

    def get_action_log(self) -> List[Dict[str, Any]]:
        """Get all action log steps (agent tool calls + tool results).

        Returns:
            List of step dictionaries with role, action, metadata, etc.
        """
        return self._get_result_list("/api/action_log", "action log")

    def get_tool_calls(self) -> List[Dict[str, Any]]:
        """Get only agent tool call steps (role=agent).

        Returns:
            List of agent action step dictionaries.
        """
        return [s for s in self.get_action_log() if s.get("role") == "agent"]

    def get_tool_results(self) -> List[Dict[str, Any]]:
        """Get only tool result steps (role=tool).

        Returns:
            List of tool result step dictionaries.
        """
        return [s for s in self.get_action_log() if s.get("role") == "tool"]

    def has_called_tool(self, tool_name: str, **params) -> bool:
        """Check if agent called a specific tool with optional parameter constraints.

        Parameter values are compared as stripped, lower-cased strings.

        Args:
            tool_name: Name of the tool (e.g., 'create_draft', 'submit_filing')
            **params: Optional key=value constraints on tool_params

        Returns:
            True if a matching tool call was found.
        """
        for call in self.get_tool_calls():
            # `or {}` also covers steps whose metadata is explicitly null.
            meta = call.get("metadata") or {}
            if meta.get("tool_name") != tool_name:
                continue
            if not params:
                return True
            tool_params = meta.get("tool_params") or {}
            if all(
                str(tool_params.get(k, "")).strip().lower() == str(v).strip().lower()
                for k, v in params.items()
            ):
                return True
        return False

    def get_tool_call_params(self, tool_name: str) -> List[Dict[str, Any]]:
        """Get all parameter dicts for calls to a specific tool.

        Args:
            tool_name: Name of the tool

        Returns:
            List of tool_params dictionaries for matching calls; a call with
            missing or null tool_params contributes an empty dict.
        """
        results = []
        for call in self.get_tool_calls():
            meta = call.get("metadata") or {}
            if meta.get("tool_name") == tool_name:
                results.append(meta.get("tool_params") or {})
        return results

    def get_tool_call_with_results(self, tool_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get tool calls paired with their results.

        For each agent step, only the next two log steps are searched for a
        step with role 'tool'; its 'state' (or, failing that, 'content') is
        used as the result.

        Args:
            tool_name: Optional filter by tool name. None returns all.

        Returns:
            List of dicts with 'tool_name', 'params', 'result' keys.
        """
        steps = self.get_action_log()
        paired = []
        for i, step in enumerate(steps):
            if step.get("role") != "agent":
                continue
            meta = step.get("metadata") or {}
            name = meta.get("tool_name", "")
            if not name:
                continue
            if tool_name and name != tool_name:
                continue
            result = ""
            for follower in steps[i + 1:i + 3]:
                if follower.get("role") == "tool":
                    result = follower.get("state", follower.get("content", ""))
                    break
            paired.append({
                "tool_name": name,
                "params": meta.get("tool_params") or {},
                "result": str(result) if result else "",
            })
        return paired

    # ===== Matter APIs =====

    def get_matter(self, matter_id: str) -> Dict[str, Any]:
        """Get matter details by ID.

        Args:
            matter_id: Matter identifier

        Returns:
            Matter dictionary with details, or empty dict on error.
        """
        try:
            resp = requests.get(f"{self.base_url}/api/matters/{matter_id}", timeout=10)
            resp.raise_for_status()
            return resp.json()
        except Exception as e:
            print(f"[EnvClient] Error getting matter: {e}")
            return {}

    # ===== Draft APIs =====

    def get_drafts(self) -> List[Dict[str, Any]]:
        """Get all drafts created during this session.

        Returns:
            List of draft dictionaries.
        """
        return self._get_result_list("/api/drafts", "drafts")

    # ===== Filing APIs =====

    def get_filings(self) -> List[Dict[str, Any]]:
        """Get all filings submitted during this session.

        Returns:
            List of filing dictionaries.
        """
        return self._get_result_list("/api/filings", "filings")

    def get_prepared_filings(self) -> List[Dict[str, Any]]:
        """Get all prepared filings (with content) during this session.

        Returns:
            List of prepared filing dictionaries with content field.
        """
        return self._get_result_list("/api/prepared_filings", "prepared filings")

    # ===== Communication APIs =====

    def get_communications(self) -> List[Dict[str, Any]]:
        """Get all communications sent during this session.

        Returns:
            List of communication dictionaries.
        """
        return self._get_result_list("/api/communications", "communications")

    # ===== Environment Control APIs =====

    def reset(self) -> bool:
        """Reset environment to initial state (matters, drafts, filings, action log).

        Returns:
            True if reset was successful.
        """
        try:
            resp = requests.post(f"{self.base_url}/reset", timeout=10)
            data = resp.json()
            return data.get("status") == "success"
        except Exception as e:
            print(f"[EnvClient] Error resetting environment: {e}")
            return False
|
|
File without changes
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
macOS Environment Setup Utility
|
|
4
|
+
|
|
5
|
+
Reads config.yaml from the task directory and executes Environment.steps
|
|
6
|
+
to initialize the environment before running the task.
|
|
7
|
+
|
|
8
|
+
Commands are executed INSIDE the macOS VM via the FastAPI /shell endpoint
|
|
9
|
+
(which uses SSH internally to connect to the QEMU VM).
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python env_setup.py # Run in current task directory
|
|
13
|
+
python env_setup.py /path/to/task # Run for specific task directory
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
import time
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
import requests
|
|
26
|
+
import yaml
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Default FastAPI backend URL.
# NOTE(review): nothing in this file reads this constant; get_api_url()
# builds the same default from the literal port "8005" — confirm whether
# external importers rely on it before removing.
DEFAULT_API_URL = "http://localhost:8005"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_api_url(config: dict) -> str:
    """Get the FastAPI backend URL from the environment or the task config.

    Resolution order:
      1. ``MCP_SERVICE_PORT`` environment variable (when non-empty).
      2. ``Environment.docker_compose_environment.MCP_SERVICE_PORT`` in *config*.
      3. Port ``8005``.

    Args:
        config: Parsed task configuration. ``None`` (e.g. an empty YAML
            document) and null/missing sections are treated as empty.

    Returns:
        A ``http://localhost:<port>`` URL.
    """
    env_port = os.environ.get("MCP_SERVICE_PORT")
    if env_port:
        return f"http://localhost:{env_port}"

    # YAML keys written without a value load as None, so `.get(key, {})`
    # is not enough: the key exists and maps to None. Normalize with `or`.
    env_config = (config or {}).get("Environment") or {}
    docker_env = env_config.get("docker_compose_environment") or {}
    port = docker_env.get("MCP_SERVICE_PORT") or "8005"
    return f"http://localhost:{port}"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def make_api_call(api_url: str, endpoint: str, data: dict, max_retries: int = 10, retry_delay: int = 10) -> dict:
    """POST *data* as JSON to the FastAPI backend, retrying transient failures.

    A failure is considered transient when the lower-cased exception text
    contains one of a fixed set of connection/timeout phrases. Any other
    request failure, or a transient one on the final attempt, is returned
    immediately as an error dictionary instead of being raised.

    Args:
        api_url: Base URL of the backend.
        endpoint: Path appended verbatim to *api_url*.
        data: JSON-serializable request body.
        max_retries: Total number of attempts.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The decoded JSON response on success, otherwise
        ``{"status": "error", "result": <message>}``.
    """
    transient_markers = (
        "timeout",
        "timed out",
        "connection reset",
        "connection refused",
        "connection error",
        "connect failed",
        "broken pipe",
        "connection aborted",
        "temporarily unavailable",
    )
    target = f"{api_url}{endpoint}"
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            response = requests.post(target, json=data, timeout=60)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            lowered = str(e).lower()
            transient = False
            for marker in transient_markers:
                if marker in lowered:
                    transient = True
                    break

            # Give up on non-transient errors and on the last attempt.
            if not transient or attempt >= final_attempt:
                return {"status": "error", "result": f"API call failed: {str(e)}"}

            print(f" [RETRY] Attempt {attempt + 1}/{max_retries} failed: {e}")
            print(f" [INFO] Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    # Only reachable when the loop body never ran (max_retries <= 0).
    return {"status": "error", "result": "Max retries exceeded"}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def run_shell(api_url: str, command: str, description: str = "") -> bool:
    """Execute a shell command on the macOS VM via the /shell endpoint.

    Args:
        api_url: Base URL of the FastAPI backend.
        command: Shell command line sent in the request body.
        description: Optional label printed before the command runs.

    Returns:
        True only when the API call succeeded AND the reported exit code is
        zero (or absent). Previously a nonzero exit was printed as "OK" and
        returned True, so e.g. a failed ``curl -f`` download went unnoticed.
    """
    if description:
        print(f" {description}")
    print(f" Running shell command via API...")

    result = make_api_call(api_url, "/shell", {"command": command, "timeout": 30})

    if result.get("status") == "error":
        print(f" -> FAILED: {result.get('result', 'Unknown error')}")
        return False

    stdout = result.get("stdout", "")
    exit_code = result.get("exit_code", 0)
    # A missing or null exit code is treated as success, matching the
    # original default of 0 when the key is absent.
    succeeded = exit_code in (0, None)

    if succeeded:
        print(f" -> OK (exit={exit_code})")
    else:
        print(f" -> FAILED (exit={exit_code})")

    if stdout:
        # Show at most the first five lines of output to keep logs short.
        for line in stdout.strip().split("\n")[:5]:
            print(f" {line}")
    return succeeded
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def execute_step(api_url: str, step: dict[str, Any]) -> bool:
    """Execute a single environment setup step.

    The step's ``function`` selects the action: ``Shell-Tool`` runs
    ``parameters.command``; ``Download-File`` runs curl for
    ``parameters.url`` -> ``parameters.path``; ``Open-File`` runs ``open``
    on ``parameters.path``. Steps with missing parameters or an unknown
    function are skipped.

    Args:
        api_url: Base URL of the FastAPI backend.
        step: One entry of ``Environment.steps`` from the task config.

    Returns:
        The result of ``run_shell`` for a dispatched command; True when the
        step is skipped.
    """
    func = step.get("function", "")
    # A YAML `parameters:` key with no value loads as None, which would make
    # every `params.get(...)` below raise; treat it as "no parameters".
    params = step.get("parameters") or {}

    if func == "Shell-Tool":
        command = params.get("command", "")
        description = step.get("description", "")
        if not command:
            print(f" -> SKIPPED: Missing command")
            return True
        return run_shell(api_url, command, description)

    elif func == "Download-File":
        url = params.get("url", "")
        path = params.get("path", "")
        if not url or not path:
            print(f" -> SKIPPED: Missing url or path")
            return True
        # NOTE(review): url/path are interpolated inside double quotes without
        # escaping; values containing `"`, `$` or backticks will be
        # interpreted by the remote shell.
        command = f'curl -fsSL "{url}" -o "{path}"'
        print(f" Downloading: {url} -> {path}")
        return run_shell(api_url, command)

    elif func == "Open-File":
        path = params.get("path", "")
        if not path:
            print(f" -> SKIPPED: Missing path")
            return True
        # NOTE(review): same unescaped interpolation caveat as Download-File.
        command = f'open "{path}"'
        print(f" Opening: {path}")
        return run_shell(api_url, command)

    else:
        print(f" -> SKIPPED: Unknown function '{func}'")
        return True
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def run_setup(task_dir: Path) -> bool:
    """Run environment setup for a task directory.

    Loads ``config.yaml`` from *task_dir*, cleans the VM Desktop, then runs
    every entry of ``Environment.steps`` in order. All steps are attempted
    even if an earlier one fails.

    Args:
        task_dir: Directory expected to contain ``config.yaml``.

    Returns:
        False when the config is missing, unreadable, not a mapping, or any
        step reports failure; True otherwise (including "no steps").
    """
    config_path = task_dir / "config.yaml"

    if not config_path.exists():
        print(f"Error: config.yaml not found in {task_dir}")
        return False

    print(f"Reading config from: {config_path}")

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading config.yaml: {e}")
        return False

    # An empty YAML document loads as None; treat it as an empty config
    # instead of crashing on `.get` below.
    if config is None:
        config = {}
    if not isinstance(config, dict):
        print(f"Error reading config.yaml: expected a mapping, got {type(config).__name__}")
        return False

    # Keys written without a value (`Environment:`) also load as None.
    env_config = config.get("Environment") or {}
    steps = env_config.get("steps") or []
    # Store the normalized section back so get_api_url never sees a null
    # "Environment" entry.
    config["Environment"] = env_config

    task_id = (config.get("Task") or {}).get("task_id", "unknown")
    print(f"\n=== Setting up environment for {task_id} ===")

    api_url = get_api_url(config)
    print(f"API URL: {api_url}")

    # Clean Desktop before each task (safety net); its result is
    # intentionally not part of the overall success status.
    print("Resetting Desktop...")
    run_shell(
        api_url,
        "rm -rf /Users/docker/Desktop/* 2>/dev/null; mkdir -p /Users/docker/Desktop",
        "Cleaning Desktop",
    )

    if not steps:
        print("No setup steps required.")
        return True

    print(f"Executing {len(steps)} setup step(s)...\n")

    all_ok = True
    for i, step in enumerate(steps, 1):
        func = step.get("function", "unknown")
        print(f"Step {i}/{len(steps)}: {func}")
        if not execute_step(api_url, step):
            all_ok = False
        print()

    if all_ok:
        print("=== Environment setup completed successfully ===")
    else:
        print("=== Environment setup completed with errors ===")

    return all_ok
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def main():
    """Command-line entry point: run setup for the given task directory.

    Exits with status 0 when setup succeeds and 1 when the directory is
    missing or setup reports failure.
    """
    cli = argparse.ArgumentParser(description="macOS environment setup utility (executes commands via SSH/FastAPI)")
    cli.add_argument("task_dir", nargs="?", default=".", help="Path to task directory (default: current directory)")
    options = cli.parse_args()

    target_dir = Path(options.task_dir).resolve()
    if not target_dir.exists():
        print(f"Error: Directory not found: {target_dir}")
        sys.exit(1)

    exit_status = 0 if run_setup(target_dir) else 1
    sys.exit(exit_status)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Shared utilities for macOS judge files.
|
|
2
|
+
|
|
3
|
+
Usage in judges (benign):
|
|
4
|
+
_root = Path(__file__).resolve().parents[4]
|
|
5
|
+
sys.path.insert(0, str(_root))
|
|
6
|
+
from dt_arena.utils.macos.helpers import run_shell, check_file_exists
|
|
7
|
+
|
|
8
|
+
Usage in judges (malicious indirect):
|
|
9
|
+
_root = Path(__file__).resolve().parents[6]
|
|
10
|
+
sys.path.insert(0, str(_root))
|
|
11
|
+
from dt_arena.utils.macos.helpers import run_shell, check_file_exists
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
|
|
18
|
+
import requests
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_api_url() -> str:
    """Return the FastAPI backend URL, resolved from the environment at call time.

    ``MACOS_API_URL`` wins whenever it is set (even to an empty string);
    otherwise the URL is ``http://127.0.0.1:<MCP_SERVICE_PORT>`` with the
    port defaulting to ``8005``.
    """
    explicit_url = os.environ.get("MACOS_API_URL")
    if explicit_url is not None:
        return explicit_url
    service_port = os.environ.get("MCP_SERVICE_PORT", "8005")
    return f"http://127.0.0.1:{service_port}"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def run_shell(command: str, timeout: int = 15) -> str:
    """Run *command* on the macOS VM through ``/shell`` and return its stdout.

    Args:
        command: Shell command line to execute.
        timeout: Value sent to the backend as the command timeout.

    Returns:
        The stripped ``stdout`` field of the response, or an empty string on
        any failure (request error, HTTP error status, bad payload).
    """
    try:
        endpoint = f"{get_api_url()}/shell"
        body = {"command": command, "timeout": timeout}
        # The HTTP timeout is 5 seconds longer than the value sent to the backend.
        response = requests.post(endpoint, json=body, timeout=timeout + 5)
        response.raise_for_status()
        output = response.json().get("stdout", "")
        return output.strip()
    except Exception:
        # Deliberately best-effort: callers only see "no output".
        return ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def check_file_exists(path: str) -> bool:
    """Report whether *path* (file or directory) exists in the macOS VM.

    Runs ``test -e`` remotely and returns True only when the command
    prints ``yes``.
    """
    probe = f'test -e "{path}" && echo yes || echo no'
    answer = run_shell(probe).strip()
    return answer == "yes"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def check_dir_exists(path: str) -> bool:
    """Report whether *path* is an existing directory in the macOS VM.

    Runs ``test -d`` remotely and returns True only when the command
    prints ``yes``.
    """
    probe = f'test -d "{path}" && echo yes || echo no'
    answer = run_shell(probe).strip()
    return answer == "yes"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def baseline_planted(sentinel_path: str) -> bool:
    """Confirm that the sentinel file written at setup time exists in the VM.

    Rationale is documented at
    dt_arena/utils/windows/helpers.py:baseline_planted. Judges (direct or
    indirect) that look for malicious VM state need evidence that setup
    really established the secure baseline; without it, Docker macOS
    defaults can match the malicious signature and yield false positives.
    """
    sentinel_present = check_file_exists(sentinel_path)
    return sentinel_present
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""OS Filesystem helpers package."""
|