decodingtrust-agent-sdk 0.2.9__tar.gz → 0.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {decodingtrust_agent_sdk-0.2.9/decodingtrust_agent_sdk.egg-info → decodingtrust_agent_sdk-0.2.10}/PKG-INFO +69 -15
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/README.md +61 -14
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/example.py +1 -1
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/src/hermes_runner.py +1 -1
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/langchain/example.py +2 -2
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/example.py +2 -2
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/example.py +2 -2
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/strands/example.py +1 -1
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/cli/scaffold.py +2 -2
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10/decodingtrust_agent_sdk.egg-info}/PKG-INFO +69 -15
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/decodingtrust_agent_sdk.egg-info/requires.txt +8 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/config/mcp.yaml +7 -1
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/browser/main.py +16 -3
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/agent.py +2 -2
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/eval/evaluation.py +31 -11
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/eval/task_runner.py +2 -2
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/pyproject.toml +19 -1
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/__init__.py +6 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/agent_helpers.py +47 -63
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/config.py +57 -7
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/judge_helpers.py +28 -11
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/LICENSE +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/MANIFEST.in +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/claudesdk/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/claudesdk/example.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/claudesdk/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/claudesdk/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/claudesdk/src/mcp_proxy.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/claudesdk/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/googleadk/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/googleadk/example.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/googleadk/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/googleadk/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/googleadk/src/mcp_wrapper.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/googleadk/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/src/mcp_proxy.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/langchain/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/langchain/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/langchain/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/langchain/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/src/agent_wrapper.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/src/mcp_wrapper.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openaisdk/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/example.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/helpers/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/helpers/auth_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/mcp_proxy.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/plugin_generator.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/openclaw/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/prompts/react_agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/agent_wrapper.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/async_helper.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/mcp_react_agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/native_agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/pocketflow/src/nodes.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/strands/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/strands/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/strands/src/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/strands/src/utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/browser/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/browser/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/browser/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/code/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/code/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/code/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/crm/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/crm/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/crm/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/customer-service/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/customer-service/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/customer-service/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/finance/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/finance/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/finance/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/legal/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/legal/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/legal/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/macos/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/macos/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/macos/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/medical/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/medical/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/medical/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/os-filesystem/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/os-filesystem/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/os-filesystem/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/research/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/research/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/research/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/telecom/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/telecom/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/telecom/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/travel/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/travel/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/travel/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/windows/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/windows/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/windows/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/workflow/benign.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/workflow/direct.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/benchmark/workflow/indirect.jsonl +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/cli/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/cli/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/decodingtrust_agent_sdk.egg-info/SOURCES.txt +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/decodingtrust_agent_sdk.egg-info/dependency_links.txt +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/decodingtrust_agent_sdk.egg-info/entry_points.txt +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/decodingtrust_agent_sdk.egg-info/top_level.txt +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/a2a_agents/generic/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/a2a_agents/generic/agent.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/config/env.yaml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/config/injection_mcp.yaml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/arxiv/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/arxiv/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/atlassian/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/atlassian/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/bigquery/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/booking/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/calendar/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/calendar/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/custom-website/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/customer_service/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/databricks/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/databricks/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/ecommerce/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/ers/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/finance/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/gmail/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/gmail/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/google-form/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/google-form/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/googledocs/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/googledocs/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/hospital/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/hospital/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/legal/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/linkedin/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/macos/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/os-filesystem/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/os-filesystem/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/paypal/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/paypal/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/research/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/research/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/salesforce_crm/docker-compose.yaml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/slack/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/slack/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/snowflake/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/snowflake/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/telecom/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/telecom/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/telegram/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/telegram/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/terminal/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/terminal/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/travel/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/travel/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/whatsapp/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/whatsapp/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/windows/dns_listener.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/windows/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/windows/exfil_listener.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/windows/ftp_listener.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/windows/reset_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/windows/validate.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/zoom/docker-compose-hub.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/envs/zoom/docker-compose.yml +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/atlassian/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/calendar/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/custom_website/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/customer_service/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/databricks/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/ecommerce/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/finance/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/github/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/gmail/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/google_form/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/googledocs/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/hospital/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/legal/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/macos/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/os-filesystem/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/paypal/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/research/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/salesforce/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/slack/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/snowflake/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/telecom/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/telegram/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/terminal/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/travel/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/whatsapp/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/windows/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/injection_mcp_server/zoom/env_injection.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/atlassian/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/atlassian/test_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/bigquery/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/booking/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/calendar/example_multi_user.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/calendar/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/calendar/test_mcp.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/customer_service/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/databricks/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/databricks/probe.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/ers/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/core/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/core/data_loader.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/core/portfolio.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/evaluation/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/evaluation/evaluator.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/evaluation/logger.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/config.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/content.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/html.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/locations.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/methods.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/injection/presets.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/run_mcp.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/run_web.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/extractor.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/extractor_minimal.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/extractor_simple.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/injection_mcp.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/mcp.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/tools/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/tools/account.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/tools/browsing.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/tools/social.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/tools/trading.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/tools/utility.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/server/web.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/finance/test_mcp_tools.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/github/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/gmail/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/google_form/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/googledocs/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/hospital/mcp_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/core/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/core/courtlistener_store.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/core/data_loader.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/core/document_store.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/core/matter_manager.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/scripts/collect_data.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/server/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/legal/server/mcp.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/macos/client_test.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/macos/mcp_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/os-filesystem/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/paypal/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/research/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/salesforce/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/slack/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/snowflake/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/snowflake/probe.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/telecom/mcp_client.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/telecom/mcp_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/telegram/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/terminal/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/travel/client_test.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/travel/mcp_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/whatsapp/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/windows/client_test.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/windows/mcp_server.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/mcp_server/zoom/main.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/hooks/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/hooks/audit_log.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/hooks/hooks.json +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/run_benign.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/attacks.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/environment.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/hooks.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/judge.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/red_teaming_trajectory.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/task.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/src/types/trajectory.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/atlassian/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/atlassian/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/bigquery/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/bigquery/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/calendar/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/calendar/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/chase/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/chase/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/cs_env_client.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/abusive_comms.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/bulk_destructive.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/cross_customer.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/email_exfiltration.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/financial_fraud.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/llm_utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/pii_disclosure.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/policy_misrep.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/customer_service/judges/text_utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/databricks/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/databricks/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/finance/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/finance/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/github/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/github/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/gmail/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/gmail/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/google_form/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/google_form/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/legal/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/legal/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/macos/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/macos/env_setup.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/macos/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/os_filesystem/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/os_filesystem/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/paypal/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/paypal/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/port_allocator.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/research/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/research/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/robinhood/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/robinhood/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/salesforce/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/salesforce/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/slack/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/slack/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/snowflake/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/snowflake/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/telecom/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/telecom/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/telegram/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/telegram/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/terminal/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/terminal/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/travel/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/travel/env_client.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/travel/llm_judge.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/travel/prompts.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/utils/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/whatsapp/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/whatsapp/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/windows/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/windows/env_reset.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/windows/env_setup.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/windows/exfil_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/windows/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/zoom/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/dt_arena/utils/zoom/helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/eval/__init__.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/eval/_ui.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/setup.cfg +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/a2a_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/agent_wrapper.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/compose_utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/env_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/eval_stats.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/injection_a2a_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/injection_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/injection_mcp_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/judge_utils.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/llm.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/logging.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/mcp_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/mcp_manager.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/memory_guard.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/red_teaming_sandbox.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/reset_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/resource_manager.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/skill_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/task_executor.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/task_helpers.py +0 -0
- {decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/utils/template_helpers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: decodingtrust-agent-sdk
|
|
3
|
-
Version: 0.2.9
|
|
3
|
+
Version: 0.2.10
|
|
4
4
|
Summary: DecodingTrust Agent Platform (DTap) — A controllable and interactive red-teaming platform for AI agents
|
|
5
5
|
Author-email: DTap Team <zhaorun@uchicago.edu>
|
|
6
6
|
License: Apache License
|
|
@@ -245,6 +245,8 @@ Requires-Dist: rich>=13.0.0
|
|
|
245
245
|
Requires-Dist: pandas>=2.0.0
|
|
246
246
|
Requires-Dist: psutil>=5.9.0
|
|
247
247
|
Requires-Dist: huggingface_hub>=0.20.0
|
|
248
|
+
Requires-Dist: playwright>=1.53
|
|
249
|
+
Requires-Dist: pillow>=10.0.0
|
|
248
250
|
Provides-Extra: openai
|
|
249
251
|
Requires-Dist: openai>=2.6.1; extra == "openai"
|
|
250
252
|
Requires-Dist: openai-agents>=0.8.4; extra == "openai"
|
|
@@ -260,6 +262,10 @@ Requires-Dist: google-generativeai>=0.3.0; extra == "google"
|
|
|
260
262
|
Requires-Dist: google-genai>=1.0.0; extra == "google"
|
|
261
263
|
Requires-Dist: google-api-core>=2.28.0; extra == "google"
|
|
262
264
|
Requires-Dist: google-api-python-client>=2.100.0; extra == "google"
|
|
265
|
+
Requires-Dist: google-cloud-monitoring>=2.20.0; extra == "google"
|
|
266
|
+
Requires-Dist: google-cloud-trace>=1.13.0; extra == "google"
|
|
267
|
+
Requires-Dist: opentelemetry-exporter-gcp-trace>=1.7.0; extra == "google"
|
|
268
|
+
Requires-Dist: opentelemetry-exporter-gcp-monitoring>=1.7.0a0; extra == "google"
|
|
263
269
|
Provides-Extra: strands
|
|
264
270
|
Requires-Dist: strands-agents>=1.40.0; extra == "strands"
|
|
265
271
|
Provides-Extra: langchain
|
|
@@ -269,6 +275,7 @@ Requires-Dist: langchain-openai>=0.2.0; extra == "langchain"
|
|
|
269
275
|
Requires-Dist: langchain-anthropic>=0.2.0; extra == "langchain"
|
|
270
276
|
Provides-Extra: pocketflow
|
|
271
277
|
Requires-Dist: pocketflow==0.0.3; extra == "pocketflow"
|
|
278
|
+
Provides-Extra: browser
|
|
272
279
|
Provides-Extra: all
|
|
273
280
|
Requires-Dist: decodingtrust-agent-sdk[claude,google,langchain,openai,pocketflow,strands]; extra == "all"
|
|
274
281
|
Provides-Extra: dev
|
|
@@ -340,14 +347,57 @@ We have publicly released the full evaluation results, including the complete re
|
|
|
340
347
|
|
|
341
348
|
## Installation
|
|
342
349
|
|
|
350
|
+
### Option A — from PyPI (recommended for users)
|
|
351
|
+
|
|
352
|
+
```bash
|
|
353
|
+
pip install decodingtrust-agent-sdk # core (includes the browser domain deps)
|
|
354
|
+
# …plus the backend(s) you actually use (see "Agent backends" below):
|
|
355
|
+
pip install "decodingtrust-agent-sdk[openai]" # OpenAI Agents SDK
|
|
356
|
+
pip install "decodingtrust-agent-sdk[google]" # Google ADK / Gemini
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
This installs the `dtap` CLI. Use it instead of `python eval/evaluation.py`, and select
|
|
360
|
+
benchmark tasks with `--domain`:
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
dtap eval --domain crm --task-type benign --agent-type openaisdk --model gpt-5.4 --max-parallel 4
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
On first run, the per-task dataset is auto-downloaded from HuggingFace — **only for the
|
|
367
|
+
domain(s) you request**. Set `HF_TOKEN` to avoid unauthenticated rate-limiting (HTTP 429):
|
|
368
|
+
|
|
369
|
+
```bash
|
|
370
|
+
export HF_TOKEN=hf_...
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Option B — from source (for development)
|
|
374
|
+
|
|
343
375
|
```bash
|
|
344
376
|
git clone https://github.com/AI-secure/DecodingTrust-Agent.git
|
|
345
377
|
cd DecodingTrust-Agent
|
|
346
|
-
pip install -
|
|
347
|
-
|
|
378
|
+
pip install -e ".[openai]" # or [all] for every backend
|
|
379
|
+
# (here `python eval/evaluation.py --task-list benchmark/...` also works)
|
|
348
380
|
```
|
|
349
381
|
|
|
350
|
-
|
|
382
|
+
### Agent backends (optional extras)
|
|
383
|
+
|
|
384
|
+
Install only the framework you evaluate with:
|
|
385
|
+
|
|
386
|
+
| Extra | Backend (`--agent-type`) |
|
|
387
|
+
|---|---|
|
|
388
|
+
| `openai` | `openaisdk` |
|
|
389
|
+
| `claude` | `claudesdk` |
|
|
390
|
+
| `google` | `googleadk` |
|
|
391
|
+
| `langchain` | `langchain` |
|
|
392
|
+
| `strands` | `strands` |
|
|
393
|
+
| `pocketflow` | `pocketflow` |
|
|
394
|
+
| `all` | every backend above |
|
|
395
|
+
|
|
396
|
+
(The `browser` domain needs no extra — its Playwright deps are part of the core install.)
|
|
397
|
+
|
|
398
|
+
### Model keys & Docker
|
|
399
|
+
|
|
400
|
+
Set the API key for your backbone model (only the providers you use):
|
|
351
401
|
|
|
352
402
|
```bash
|
|
353
403
|
export OPENAI_API_KEY=sk-...
|
|
@@ -357,6 +407,10 @@ export GOOGLE_API_KEY=...
|
|
|
357
407
|
|
|
358
408
|
Docker is required: each task spins up isolated MCP servers and Docker-based environments through `TaskExecutor`.
|
|
359
409
|
|
|
410
|
+
> **Browser domain note:** browser tasks send full-page screenshots (large image-token
|
|
411
|
+
> input). With vision models on a metered tier, start at `--max-parallel 2` to avoid
|
|
412
|
+
> provider token-rate limits (HTTP 429), then raise it if your quota allows.
|
|
413
|
+
|
|
360
414
|
---
|
|
361
415
|
|
|
362
416
|
## Quick Start
|
|
@@ -367,7 +421,7 @@ A single benign CRM task with the OpenAI Agents SDK backbone:
|
|
|
367
421
|
python eval/evaluation.py \
|
|
368
422
|
--task-list benchmark/crm/benign.jsonl \
|
|
369
423
|
--agent-type openaisdk \
|
|
370
|
-
--model gpt-
|
|
424
|
+
--model gpt-5.4 \
|
|
371
425
|
--max-parallel 4
|
|
372
426
|
```
|
|
373
427
|
|
|
@@ -405,7 +459,7 @@ Run every benign + direct + indirect task in a domain by pointing `--task-list`
|
|
|
405
459
|
python eval/evaluation.py \
|
|
406
460
|
--task-list benchmark/finance \
|
|
407
461
|
--agent-type openaisdk \
|
|
408
|
-
--model gpt-
|
|
462
|
+
--model gpt-5.4 \
|
|
409
463
|
--max-parallel 8
|
|
410
464
|
```
|
|
411
465
|
|
|
@@ -415,7 +469,7 @@ python eval/evaluation.py \
|
|
|
415
469
|
|
|
416
470
|
```bash
|
|
417
471
|
# Benign utility only
|
|
418
|
-
python eval/evaluation.py --task-list benchmark/crm/benign.jsonl --agent-type openaisdk --model gpt-
|
|
472
|
+
python eval/evaluation.py --task-list benchmark/crm/benign.jsonl --agent-type openaisdk --model gpt-5.4
|
|
419
473
|
|
|
420
474
|
# Direct prompt injection only
|
|
421
475
|
python eval/evaluation.py --task-list benchmark/crm/direct.jsonl --agent-type claudesdk --model claude-sonnet-4-20250514
|
|
@@ -432,7 +486,7 @@ python eval/evaluation.py \
|
|
|
432
486
|
--task-type malicious \
|
|
433
487
|
--threat-model indirect \
|
|
434
488
|
--risk-category data-exfiltration \
|
|
435
|
-
--agent-type openaisdk --model gpt-
|
|
489
|
+
--agent-type openaisdk --model gpt-5.4
|
|
436
490
|
```
|
|
437
491
|
|
|
438
492
|
### The entire benchmark
|
|
@@ -443,7 +497,7 @@ Point `--task-list` at the top-level [`benchmark/`](benchmark/) directory to run
|
|
|
443
497
|
python eval/evaluation.py \
|
|
444
498
|
--task-list benchmark \
|
|
445
499
|
--agent-type openaisdk \
|
|
446
|
-
--model gpt-
|
|
500
|
+
--model gpt-5.4 \
|
|
447
501
|
--max-parallel 16 \
|
|
448
502
|
--skip-existing
|
|
449
503
|
```
|
|
@@ -478,7 +532,7 @@ Any JSONL file with the schema below is a valid `--task-list`. Pick a subset of
|
|
|
478
532
|
Run it like any built-in task list:
|
|
479
533
|
|
|
480
534
|
```bash
|
|
481
|
-
python eval/evaluation.py --task-list my_subset.jsonl --agent-type openaisdk --model gpt-
|
|
535
|
+
python eval/evaluation.py --task-list my_subset.jsonl --agent-type openaisdk --model gpt-5.4 --max-parallel 4
|
|
482
536
|
```
|
|
483
537
|
|
|
484
538
|
A few practical patterns:
|
|
@@ -486,11 +540,11 @@ A few practical patterns:
|
|
|
486
540
|
```bash
|
|
487
541
|
# Curate from an existing file
|
|
488
542
|
grep '"risk_category": "data-exfiltration"' benchmark/crm/indirect.jsonl > my_crm_exfil.jsonl
|
|
489
|
-
python eval/evaluation.py --task-list my_crm_exfil.jsonl --agent-type openaisdk --model gpt-
|
|
543
|
+
python eval/evaluation.py --task-list my_crm_exfil.jsonl --agent-type openaisdk --model gpt-5.4
|
|
490
544
|
|
|
491
545
|
# Try just one task end-to-end
|
|
492
546
|
echo '{"domain": "crm", "type": "benign", "task_id": "1"}' > one_task.jsonl
|
|
493
|
-
python eval/evaluation.py --task-list one_task.jsonl --agent-type openaisdk --model gpt-
|
|
547
|
+
python eval/evaluation.py --task-list one_task.jsonl --agent-type openaisdk --model gpt-5.4
|
|
494
548
|
```
|
|
495
549
|
|
|
496
550
|
For per-task internals (`config.yaml`, `judge.py`, `setup.sh`), see [docs/quickstart.md](docs/quickstart.md).
|
|
@@ -508,7 +562,7 @@ For per-task internals (`config.yaml`, `judge.py`, `setup.sh`), see [docs/quicks
|
|
|
508
562
|
| `--risk-category` | `None` | e.g. `data-exfiltration` |
|
|
509
563
|
| `--max-parallel` | `5` | Concurrent tasks (environments are reused across tasks) |
|
|
510
564
|
| `--agent-type` | `openaisdk` | `openaisdk`, `claudesdk`, `googleadk`, `langchain`, `pocketflow`, `openclaw` |
|
|
511
|
-
| `--model` | `gpt-
|
|
565
|
+
| `--model` | `gpt-5.4` | Backbone model identifier |
|
|
512
566
|
| `--temperature` | `None` | Sampling temperature (model default if unset) |
|
|
513
567
|
| `--port-range` | `None` | Dynamic MCP port range, e.g. `"10000-12000"` |
|
|
514
568
|
| `--direct-prompt` | off | For direct threat model, use the malicious goal as-is instead of replaying attack turns |
|
|
@@ -558,13 +612,13 @@ async def main():
|
|
|
558
612
|
native = OpenAIAgent(
|
|
559
613
|
name="MyAgent",
|
|
560
614
|
instructions="You are a helpful CRM assistant.",
|
|
561
|
-
model="gpt-
|
|
615
|
+
model="gpt-5.4",
|
|
562
616
|
mcp_servers=[my_custom_server],
|
|
563
617
|
)
|
|
564
618
|
|
|
565
619
|
# 2. Load the benchmark task config (adds salesforce, gmail, etc.)
|
|
566
620
|
agent_cfg = AgentConfig.from_yaml("dataset/crm/benign/1/config.yaml")
|
|
567
|
-
runtime_cfg = RuntimeConfig(model="gpt-
|
|
621
|
+
runtime_cfg = RuntimeConfig(model="gpt-5.4", temperature=0.1, max_turns=200,
|
|
568
622
|
output_dir="./results")
|
|
569
623
|
|
|
570
624
|
# 3. Wrap — auto-detects OpenAI SDK / LangChain / Claude SDK / Google ADK
|
|
@@ -59,14 +59,57 @@ We have publicly released the full evaluation results, including the complete re
|
|
|
59
59
|
|
|
60
60
|
## Installation
|
|
61
61
|
|
|
62
|
+
### Option A — from PyPI (recommended for users)
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install decodingtrust-agent-sdk # core (includes the browser domain deps)
|
|
66
|
+
# …plus the backend(s) you actually use (see "Agent backends" below):
|
|
67
|
+
pip install "decodingtrust-agent-sdk[openai]" # OpenAI Agents SDK
|
|
68
|
+
pip install "decodingtrust-agent-sdk[google]" # Google ADK / Gemini
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
This installs the `dtap` CLI. Use it instead of `python eval/evaluation.py`, and select
|
|
72
|
+
benchmark tasks with `--domain`:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
dtap eval --domain crm --task-type benign --agent-type openaisdk --model gpt-5.4 --max-parallel 4
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
On first run, the per-task dataset is auto-downloaded from HuggingFace — **only for the
|
|
79
|
+
domain(s) you request**. Set `HF_TOKEN` to avoid unauthenticated rate-limiting (HTTP 429):
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
export HF_TOKEN=hf_...
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Option B — from source (for development)
|
|
86
|
+
|
|
62
87
|
```bash
|
|
63
88
|
git clone https://github.com/AI-secure/DecodingTrust-Agent.git
|
|
64
89
|
cd DecodingTrust-Agent
|
|
65
|
-
pip install -
|
|
66
|
-
|
|
90
|
+
pip install -e ".[openai]" # or [all] for every backend
|
|
91
|
+
# (here `python eval/evaluation.py --task-list benchmark/...` also works)
|
|
67
92
|
```
|
|
68
93
|
|
|
69
|
-
|
|
94
|
+
### Agent backends (optional extras)
|
|
95
|
+
|
|
96
|
+
Install only the framework you evaluate with:
|
|
97
|
+
|
|
98
|
+
| Extra | Backend (`--agent-type`) |
|
|
99
|
+
|---|---|
|
|
100
|
+
| `openai` | `openaisdk` |
|
|
101
|
+
| `claude` | `claudesdk` |
|
|
102
|
+
| `google` | `googleadk` |
|
|
103
|
+
| `langchain` | `langchain` |
|
|
104
|
+
| `strands` | `strands` |
|
|
105
|
+
| `pocketflow` | `pocketflow` |
|
|
106
|
+
| `all` | every backend above |
|
|
107
|
+
|
|
108
|
+
(The `browser` domain needs no extra — its Playwright deps are part of the core install.)
|
|
109
|
+
|
|
110
|
+
### Model keys & Docker
|
|
111
|
+
|
|
112
|
+
Set the API key for your backbone model (only the providers you use):
|
|
70
113
|
|
|
71
114
|
```bash
|
|
72
115
|
export OPENAI_API_KEY=sk-...
|
|
@@ -76,6 +119,10 @@ export GOOGLE_API_KEY=...
|
|
|
76
119
|
|
|
77
120
|
Docker is required: each task spins up isolated MCP servers and Docker-based environments through `TaskExecutor`.
|
|
78
121
|
|
|
122
|
+
> **Browser domain note:** browser tasks send full-page screenshots (large image-token
|
|
123
|
+
> input). With vision models on a metered tier, start at `--max-parallel 2` to avoid
|
|
124
|
+
> provider token-rate limits (HTTP 429), then raise it if your quota allows.
|
|
125
|
+
|
|
79
126
|
---
|
|
80
127
|
|
|
81
128
|
## Quick Start
|
|
@@ -86,7 +133,7 @@ A single benign CRM task with the OpenAI Agents SDK backbone:
|
|
|
86
133
|
python eval/evaluation.py \
|
|
87
134
|
--task-list benchmark/crm/benign.jsonl \
|
|
88
135
|
--agent-type openaisdk \
|
|
89
|
-
--model gpt-
|
|
136
|
+
--model gpt-5.4 \
|
|
90
137
|
--max-parallel 4
|
|
91
138
|
```
|
|
92
139
|
|
|
@@ -124,7 +171,7 @@ Run every benign + direct + indirect task in a domain by pointing `--task-list`
|
|
|
124
171
|
python eval/evaluation.py \
|
|
125
172
|
--task-list benchmark/finance \
|
|
126
173
|
--agent-type openaisdk \
|
|
127
|
-
--model gpt-
|
|
174
|
+
--model gpt-5.4 \
|
|
128
175
|
--max-parallel 8
|
|
129
176
|
```
|
|
130
177
|
|
|
@@ -134,7 +181,7 @@ python eval/evaluation.py \
|
|
|
134
181
|
|
|
135
182
|
```bash
|
|
136
183
|
# Benign utility only
|
|
137
|
-
python eval/evaluation.py --task-list benchmark/crm/benign.jsonl --agent-type openaisdk --model gpt-
|
|
184
|
+
python eval/evaluation.py --task-list benchmark/crm/benign.jsonl --agent-type openaisdk --model gpt-5.4
|
|
138
185
|
|
|
139
186
|
# Direct prompt injection only
|
|
140
187
|
python eval/evaluation.py --task-list benchmark/crm/direct.jsonl --agent-type claudesdk --model claude-sonnet-4-20250514
|
|
@@ -151,7 +198,7 @@ python eval/evaluation.py \
|
|
|
151
198
|
--task-type malicious \
|
|
152
199
|
--threat-model indirect \
|
|
153
200
|
--risk-category data-exfiltration \
|
|
154
|
-
--agent-type openaisdk --model gpt-
|
|
201
|
+
--agent-type openaisdk --model gpt-5.4
|
|
155
202
|
```
|
|
156
203
|
|
|
157
204
|
### The entire benchmark
|
|
@@ -162,7 +209,7 @@ Point `--task-list` at the top-level [`benchmark/`](benchmark/) directory to run
|
|
|
162
209
|
python eval/evaluation.py \
|
|
163
210
|
--task-list benchmark \
|
|
164
211
|
--agent-type openaisdk \
|
|
165
|
-
--model gpt-
|
|
212
|
+
--model gpt-5.4 \
|
|
166
213
|
--max-parallel 16 \
|
|
167
214
|
--skip-existing
|
|
168
215
|
```
|
|
@@ -197,7 +244,7 @@ Any JSONL file with the schema below is a valid `--task-list`. Pick a subset of
|
|
|
197
244
|
Run it like any built-in task list:
|
|
198
245
|
|
|
199
246
|
```bash
|
|
200
|
-
python eval/evaluation.py --task-list my_subset.jsonl --agent-type openaisdk --model gpt-
|
|
247
|
+
python eval/evaluation.py --task-list my_subset.jsonl --agent-type openaisdk --model gpt-5.4 --max-parallel 4
|
|
201
248
|
```
|
|
202
249
|
|
|
203
250
|
A few practical patterns:
|
|
@@ -205,11 +252,11 @@ A few practical patterns:
|
|
|
205
252
|
```bash
|
|
206
253
|
# Curate from an existing file
|
|
207
254
|
grep '"risk_category": "data-exfiltration"' benchmark/crm/indirect.jsonl > my_crm_exfil.jsonl
|
|
208
|
-
python eval/evaluation.py --task-list my_crm_exfil.jsonl --agent-type openaisdk --model gpt-
|
|
255
|
+
python eval/evaluation.py --task-list my_crm_exfil.jsonl --agent-type openaisdk --model gpt-5.4
|
|
209
256
|
|
|
210
257
|
# Try just one task end-to-end
|
|
211
258
|
echo '{"domain": "crm", "type": "benign", "task_id": "1"}' > one_task.jsonl
|
|
212
|
-
python eval/evaluation.py --task-list one_task.jsonl --agent-type openaisdk --model gpt-
|
|
259
|
+
python eval/evaluation.py --task-list one_task.jsonl --agent-type openaisdk --model gpt-5.4
|
|
213
260
|
```
|
|
214
261
|
|
|
215
262
|
For per-task internals (`config.yaml`, `judge.py`, `setup.sh`), see [docs/quickstart.md](docs/quickstart.md).
|
|
@@ -227,7 +274,7 @@ For per-task internals (`config.yaml`, `judge.py`, `setup.sh`), see [docs/quicks
|
|
|
227
274
|
| `--risk-category` | `None` | e.g. `data-exfiltration` |
|
|
228
275
|
| `--max-parallel` | `5` | Concurrent tasks (environments are reused across tasks) |
|
|
229
276
|
| `--agent-type` | `openaisdk` | `openaisdk`, `claudesdk`, `googleadk`, `langchain`, `pocketflow`, `openclaw` |
|
|
230
|
-
| `--model` | `gpt-
|
|
277
|
+
| `--model` | `gpt-5.4` | Backbone model identifier |
|
|
231
278
|
| `--temperature` | `None` | Sampling temperature (model default if unset) |
|
|
232
279
|
| `--port-range` | `None` | Dynamic MCP port range, e.g. `"10000-12000"` |
|
|
233
280
|
| `--direct-prompt` | off | For direct threat model, use the malicious goal as-is instead of replaying attack turns |
|
|
@@ -277,13 +324,13 @@ async def main():
|
|
|
277
324
|
native = OpenAIAgent(
|
|
278
325
|
name="MyAgent",
|
|
279
326
|
instructions="You are a helpful CRM assistant.",
|
|
280
|
-
model="gpt-
|
|
327
|
+
model="gpt-5.4",
|
|
281
328
|
mcp_servers=[my_custom_server],
|
|
282
329
|
)
|
|
283
330
|
|
|
284
331
|
# 2. Load the benchmark task config (adds salesforce, gmail, etc.)
|
|
285
332
|
agent_cfg = AgentConfig.from_yaml("dataset/crm/benign/1/config.yaml")
|
|
286
|
-
runtime_cfg = RuntimeConfig(model="gpt-
|
|
333
|
+
runtime_cfg = RuntimeConfig(model="gpt-5.4", temperature=0.1, max_turns=200,
|
|
287
334
|
output_dir="./results")
|
|
288
335
|
|
|
289
336
|
# 3. Wrap — auto-detects OpenAI SDK / LangChain / Claude SDK / Google ADK
|
|
@@ -8,7 +8,7 @@ injection.
|
|
|
8
8
|
|
|
9
9
|
Usage:
|
|
10
10
|
python agent/hermes/example.py --config path/to/config.yaml
|
|
11
|
-
python agent/hermes/example.py --config path/to/config.yaml --model openai/gpt-
|
|
11
|
+
python agent/hermes/example.py --config path/to/config.yaml --model openai/gpt-5.4
|
|
12
12
|
python agent/hermes/example.py --config path/to/config.yaml --debug
|
|
13
13
|
|
|
14
14
|
Prerequisites:
|
{decodingtrust_agent_sdk-0.2.9 → decodingtrust_agent_sdk-0.2.10}/agent/hermes/src/hermes_runner.py
RENAMED
|
@@ -25,7 +25,7 @@ Request JSON schema::
|
|
|
25
25
|
"base_url": "https://api.openai.com/v1", # may be ""
|
|
26
26
|
"api_key": "sk-...", # may be null
|
|
27
27
|
"provider": "openai", # may be null
|
|
28
|
-
"model": "gpt-
|
|
28
|
+
"model": "gpt-5.4",
|
|
29
29
|
"max_turns": 30,
|
|
30
30
|
"system_prompt": "You are ...", # may be null
|
|
31
31
|
"enabled_toolsets": ["mcp-salesforce"], # MCP-only restriction
|
|
@@ -184,7 +184,7 @@ async def main() -> None:
|
|
|
184
184
|
agent = build_agent(
|
|
185
185
|
native_agent=native,
|
|
186
186
|
agent_cfg=AgentConfig(system_prompt=""),
|
|
187
|
-
runtime_cfg=RuntimeConfig(model="gpt-
|
|
187
|
+
runtime_cfg=RuntimeConfig(model="gpt-5.4", max_turns=10),
|
|
188
188
|
)
|
|
189
189
|
async with agent:
|
|
190
190
|
result = await agent.run("Say hello.", metadata={{"task_id": "smoke"}})
|
|
@@ -207,7 +207,7 @@ from .agent import {class_name}
|
|
|
207
207
|
async def main() -> None:
|
|
208
208
|
agent = {class_name}(
|
|
209
209
|
agent_config=AgentConfig(system_prompt=""),
|
|
210
|
-
runtime_config=RuntimeConfig(model="gpt-
|
|
210
|
+
runtime_config=RuntimeConfig(model="gpt-5.4", max_turns=10),
|
|
211
211
|
)
|
|
212
212
|
async with agent:
|
|
213
213
|
result = await agent.run("Say hello.", metadata={{"task_id": "smoke"}})
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: decodingtrust-agent-sdk
|
|
3
|
-
Version: 0.2.9
|
|
3
|
+
Version: 0.2.10
|
|
4
4
|
Summary: DecodingTrust Agent Platform (DTap) — A controllable and interactive red-teaming platform for AI agents
|
|
5
5
|
Author-email: DTap Team <zhaorun@uchicago.edu>
|
|
6
6
|
License: Apache License
|
|
@@ -245,6 +245,8 @@ Requires-Dist: rich>=13.0.0
|
|
|
245
245
|
Requires-Dist: pandas>=2.0.0
|
|
246
246
|
Requires-Dist: psutil>=5.9.0
|
|
247
247
|
Requires-Dist: huggingface_hub>=0.20.0
|
|
248
|
+
Requires-Dist: playwright>=1.53
|
|
249
|
+
Requires-Dist: pillow>=10.0.0
|
|
248
250
|
Provides-Extra: openai
|
|
249
251
|
Requires-Dist: openai>=2.6.1; extra == "openai"
|
|
250
252
|
Requires-Dist: openai-agents>=0.8.4; extra == "openai"
|
|
@@ -260,6 +262,10 @@ Requires-Dist: google-generativeai>=0.3.0; extra == "google"
|
|
|
260
262
|
Requires-Dist: google-genai>=1.0.0; extra == "google"
|
|
261
263
|
Requires-Dist: google-api-core>=2.28.0; extra == "google"
|
|
262
264
|
Requires-Dist: google-api-python-client>=2.100.0; extra == "google"
|
|
265
|
+
Requires-Dist: google-cloud-monitoring>=2.20.0; extra == "google"
|
|
266
|
+
Requires-Dist: google-cloud-trace>=1.13.0; extra == "google"
|
|
267
|
+
Requires-Dist: opentelemetry-exporter-gcp-trace>=1.7.0; extra == "google"
|
|
268
|
+
Requires-Dist: opentelemetry-exporter-gcp-monitoring>=1.7.0a0; extra == "google"
|
|
263
269
|
Provides-Extra: strands
|
|
264
270
|
Requires-Dist: strands-agents>=1.40.0; extra == "strands"
|
|
265
271
|
Provides-Extra: langchain
|
|
@@ -269,6 +275,7 @@ Requires-Dist: langchain-openai>=0.2.0; extra == "langchain"
|
|
|
269
275
|
Requires-Dist: langchain-anthropic>=0.2.0; extra == "langchain"
|
|
270
276
|
Provides-Extra: pocketflow
|
|
271
277
|
Requires-Dist: pocketflow==0.0.3; extra == "pocketflow"
|
|
278
|
+
Provides-Extra: browser
|
|
272
279
|
Provides-Extra: all
|
|
273
280
|
Requires-Dist: decodingtrust-agent-sdk[claude,google,langchain,openai,pocketflow,strands]; extra == "all"
|
|
274
281
|
Provides-Extra: dev
|
|
@@ -340,14 +347,57 @@ We have publicly released the full evaluation results, including the complete re
|
|
|
340
347
|
|
|
341
348
|
## Installation
|
|
342
349
|
|
|
350
|
+
### Option A — from PyPI (recommended for users)
|
|
351
|
+
|
|
352
|
+
```bash
|
|
353
|
+
pip install decodingtrust-agent-sdk # core (includes the browser domain deps)
|
|
354
|
+
# …plus the backend(s) you actually use (see "Agent backends" below):
|
|
355
|
+
pip install "decodingtrust-agent-sdk[openai]" # OpenAI Agents SDK
|
|
356
|
+
pip install "decodingtrust-agent-sdk[google]" # Google ADK / Gemini
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
This installs the `dtap` CLI. Use it instead of `python eval/evaluation.py`, and select
|
|
360
|
+
benchmark tasks with `--domain`:
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
dtap eval --domain crm --task-type benign --agent-type openaisdk --model gpt-5.4 --max-parallel 4
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
On first run, the per-task dataset is auto-downloaded from HuggingFace — **only for the
|
|
367
|
+
domain(s) you request**. Set `HF_TOKEN` to avoid unauthenticated rate-limiting (HTTP 429):
|
|
368
|
+
|
|
369
|
+
```bash
|
|
370
|
+
export HF_TOKEN=hf_...
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Option B — from source (for development)
|
|
374
|
+
|
|
343
375
|
```bash
|
|
344
376
|
git clone https://github.com/AI-secure/DecodingTrust-Agent.git
|
|
345
377
|
cd DecodingTrust-Agent
|
|
346
|
-
pip install -
|
|
347
|
-
|
|
378
|
+
pip install -e ".[openai]" # or [all] for every backend
|
|
379
|
+
# (here `python eval/evaluation.py --task-list benchmark/...` also works)
|
|
348
380
|
```
|
|
349
381
|
|
|
350
|
-
|
|
382
|
+
### Agent backends (optional extras)
|
|
383
|
+
|
|
384
|
+
Install only the framework you evaluate with:
|
|
385
|
+
|
|
386
|
+
| Extra | Backend (`--agent-type`) |
|
|
387
|
+
|---|---|
|
|
388
|
+
| `openai` | `openaisdk` |
|
|
389
|
+
| `claude` | `claudesdk` |
|
|
390
|
+
| `google` | `googleadk` |
|
|
391
|
+
| `langchain` | `langchain` |
|
|
392
|
+
| `strands` | `strands` |
|
|
393
|
+
| `pocketflow` | `pocketflow` |
|
|
394
|
+
| `all` | every backend above |
|
|
395
|
+
|
|
396
|
+
(The `browser` domain needs no extra — its Playwright deps are part of the core install.)
|
|
397
|
+
|
|
398
|
+
### Model keys & Docker
|
|
399
|
+
|
|
400
|
+
Set the API key for your backbone model (only the providers you use):
|
|
351
401
|
|
|
352
402
|
```bash
|
|
353
403
|
export OPENAI_API_KEY=sk-...
|
|
@@ -357,6 +407,10 @@ export GOOGLE_API_KEY=...
|
|
|
357
407
|
|
|
358
408
|
Docker is required: each task spins up isolated MCP servers and Docker-based environments through `TaskExecutor`.
|
|
359
409
|
|
|
410
|
+
> **Browser domain note:** browser tasks send full-page screenshots (large image-token
|
|
411
|
+
> input). With vision models on a metered tier, start at `--max-parallel 2` to avoid
|
|
412
|
+
> provider token-rate limits (HTTP 429), then raise it if your quota allows.
|
|
413
|
+
|
|
360
414
|
---
|
|
361
415
|
|
|
362
416
|
## Quick Start
|
|
@@ -367,7 +421,7 @@ A single benign CRM task with the OpenAI Agents SDK backbone:
|
|
|
367
421
|
python eval/evaluation.py \
|
|
368
422
|
--task-list benchmark/crm/benign.jsonl \
|
|
369
423
|
--agent-type openaisdk \
|
|
370
|
-
--model gpt-
|
|
424
|
+
--model gpt-5.4 \
|
|
371
425
|
--max-parallel 4
|
|
372
426
|
```
|
|
373
427
|
|
|
@@ -405,7 +459,7 @@ Run every benign + direct + indirect task in a domain by pointing `--task-list`
|
|
|
405
459
|
python eval/evaluation.py \
|
|
406
460
|
--task-list benchmark/finance \
|
|
407
461
|
--agent-type openaisdk \
|
|
408
|
-
--model gpt-
|
|
462
|
+
--model gpt-5.4 \
|
|
409
463
|
--max-parallel 8
|
|
410
464
|
```
|
|
411
465
|
|
|
@@ -415,7 +469,7 @@ python eval/evaluation.py \
|
|
|
415
469
|
|
|
416
470
|
```bash
|
|
417
471
|
# Benign utility only
|
|
418
|
-
python eval/evaluation.py --task-list benchmark/crm/benign.jsonl --agent-type openaisdk --model gpt-
|
|
472
|
+
python eval/evaluation.py --task-list benchmark/crm/benign.jsonl --agent-type openaisdk --model gpt-5.4
|
|
419
473
|
|
|
420
474
|
# Direct prompt injection only
|
|
421
475
|
python eval/evaluation.py --task-list benchmark/crm/direct.jsonl --agent-type claudesdk --model claude-sonnet-4-20250514
|
|
@@ -432,7 +486,7 @@ python eval/evaluation.py \
|
|
|
432
486
|
--task-type malicious \
|
|
433
487
|
--threat-model indirect \
|
|
434
488
|
--risk-category data-exfiltration \
|
|
435
|
-
--agent-type openaisdk --model gpt-
|
|
489
|
+
--agent-type openaisdk --model gpt-5.4
|
|
436
490
|
```
|
|
437
491
|
|
|
438
492
|
### The entire benchmark
|
|
@@ -443,7 +497,7 @@ Point `--task-list` at the top-level [`benchmark/`](benchmark/) directory to run
|
|
|
443
497
|
python eval/evaluation.py \
|
|
444
498
|
--task-list benchmark \
|
|
445
499
|
--agent-type openaisdk \
|
|
446
|
-
--model gpt-
|
|
500
|
+
--model gpt-5.4 \
|
|
447
501
|
--max-parallel 16 \
|
|
448
502
|
--skip-existing
|
|
449
503
|
```
|
|
@@ -478,7 +532,7 @@ Any JSONL file with the schema below is a valid `--task-list`. Pick a subset of
|
|
|
478
532
|
Run it like any built-in task list:
|
|
479
533
|
|
|
480
534
|
```bash
|
|
481
|
-
python eval/evaluation.py --task-list my_subset.jsonl --agent-type openaisdk --model gpt-
|
|
535
|
+
python eval/evaluation.py --task-list my_subset.jsonl --agent-type openaisdk --model gpt-5.4 --max-parallel 4
|
|
482
536
|
```
|
|
483
537
|
|
|
484
538
|
A few practical patterns:
|
|
@@ -486,11 +540,11 @@ A few practical patterns:
|
|
|
486
540
|
```bash
|
|
487
541
|
# Curate from an existing file
|
|
488
542
|
grep '"risk_category": "data-exfiltration"' benchmark/crm/indirect.jsonl > my_crm_exfil.jsonl
|
|
489
|
-
python eval/evaluation.py --task-list my_crm_exfil.jsonl --agent-type openaisdk --model gpt-
|
|
543
|
+
python eval/evaluation.py --task-list my_crm_exfil.jsonl --agent-type openaisdk --model gpt-5.4
|
|
490
544
|
|
|
491
545
|
# Try just one task end-to-end
|
|
492
546
|
echo '{"domain": "crm", "type": "benign", "task_id": "1"}' > one_task.jsonl
|
|
493
|
-
python eval/evaluation.py --task-list one_task.jsonl --agent-type openaisdk --model gpt-
|
|
547
|
+
python eval/evaluation.py --task-list one_task.jsonl --agent-type openaisdk --model gpt-5.4
|
|
494
548
|
```
|
|
495
549
|
|
|
496
550
|
For per-task internals (`config.yaml`, `judge.py`, `setup.sh`), see [docs/quickstart.md](docs/quickstart.md).
|
|
@@ -508,7 +562,7 @@ For per-task internals (`config.yaml`, `judge.py`, `setup.sh`), see [docs/quicks
|
|
|
508
562
|
| `--risk-category` | `None` | e.g. `data-exfiltration` |
|
|
509
563
|
| `--max-parallel` | `5` | Concurrent tasks (environments are reused across tasks) |
|
|
510
564
|
| `--agent-type` | `openaisdk` | `openaisdk`, `claudesdk`, `googleadk`, `langchain`, `pocketflow`, `openclaw` |
|
|
511
|
-
| `--model` | `gpt-
|
|
565
|
+
| `--model` | `gpt-5.4` | Backbone model identifier |
|
|
512
566
|
| `--temperature` | `None` | Sampling temperature (model default if unset) |
|
|
513
567
|
| `--port-range` | `None` | Dynamic MCP port range, e.g. `"10000-12000"` |
|
|
514
568
|
| `--direct-prompt` | off | For direct threat model, use the malicious goal as-is instead of replaying attack turns |
|
|
@@ -558,13 +612,13 @@ async def main():
|
|
|
558
612
|
native = OpenAIAgent(
|
|
559
613
|
name="MyAgent",
|
|
560
614
|
instructions="You are a helpful CRM assistant.",
|
|
561
|
-
model="gpt-
|
|
615
|
+
model="gpt-5.4",
|
|
562
616
|
mcp_servers=[my_custom_server],
|
|
563
617
|
)
|
|
564
618
|
|
|
565
619
|
# 2. Load the benchmark task config (adds salesforce, gmail, etc.)
|
|
566
620
|
agent_cfg = AgentConfig.from_yaml("dataset/crm/benign/1/config.yaml")
|
|
567
|
-
runtime_cfg = RuntimeConfig(model="gpt-
|
|
621
|
+
runtime_cfg = RuntimeConfig(model="gpt-5.4", temperature=0.1, max_turns=200,
|
|
568
622
|
output_dir="./results")
|
|
569
623
|
|
|
570
624
|
# 3. Wrap — auto-detects OpenAI SDK / LangChain / Claude SDK / Google ADK
|
|
@@ -21,10 +21,14 @@ rich>=13.0.0
|
|
|
21
21
|
pandas>=2.0.0
|
|
22
22
|
psutil>=5.9.0
|
|
23
23
|
huggingface_hub>=0.20.0
|
|
24
|
+
playwright>=1.53
|
|
25
|
+
pillow>=10.0.0
|
|
24
26
|
|
|
25
27
|
[all]
|
|
26
28
|
decodingtrust-agent-sdk[claude,google,langchain,openai,pocketflow,strands]
|
|
27
29
|
|
|
30
|
+
[browser]
|
|
31
|
+
|
|
28
32
|
[claude]
|
|
29
33
|
anthropic>=0.18.0
|
|
30
34
|
claude-agent-sdk>=0.1.18
|
|
@@ -44,6 +48,10 @@ google-generativeai>=0.3.0
|
|
|
44
48
|
google-genai>=1.0.0
|
|
45
49
|
google-api-core>=2.28.0
|
|
46
50
|
google-api-python-client>=2.100.0
|
|
51
|
+
google-cloud-monitoring>=2.20.0
|
|
52
|
+
google-cloud-trace>=1.13.0
|
|
53
|
+
opentelemetry-exporter-gcp-trace>=1.7.0
|
|
54
|
+
opentelemetry-exporter-gcp-monitoring>=1.7.0a0
|
|
47
55
|
|
|
48
56
|
[langchain]
|
|
49
57
|
langchain>=0.3.0
|