decodingtrust-agent-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +30 -0
- agent/claudesdk/__init__.py +8 -0
- agent/claudesdk/example.py +221 -0
- agent/claudesdk/src/__init__.py +8 -0
- agent/claudesdk/src/agent.py +400 -0
- agent/claudesdk/src/mcp_proxy.py +409 -0
- agent/claudesdk/src/utils.py +420 -0
- agent/googleadk/__init__.py +15 -0
- agent/googleadk/example.py +237 -0
- agent/googleadk/src/__init__.py +12 -0
- agent/googleadk/src/agent.py +401 -0
- agent/googleadk/src/mcp_wrapper.py +163 -0
- agent/googleadk/src/utils.py +602 -0
- agent/langchain/__init__.py +8 -0
- agent/langchain/example.py +213 -0
- agent/langchain/src/__init__.py +8 -0
- agent/langchain/src/agent.py +645 -0
- agent/langchain/src/utils.py +433 -0
- agent/openaisdk/__init__.py +17 -0
- agent/openaisdk/example.py +228 -0
- agent/openaisdk/src/__init__.py +12 -0
- agent/openaisdk/src/agent.py +491 -0
- agent/openaisdk/src/agent_wrapper.py +143 -0
- agent/openaisdk/src/mcp_wrapper.py +395 -0
- agent/openaisdk/src/utils.py +493 -0
- agent/openclaw/__init__.py +10 -0
- agent/openclaw/example.py +251 -0
- agent/openclaw/src/__init__.py +14 -0
- agent/openclaw/src/agent.py +930 -0
- agent/openclaw/src/helpers/__init__.py +1 -0
- agent/openclaw/src/helpers/auth_helpers.py +55 -0
- agent/openclaw/src/mcp_proxy.py +564 -0
- agent/openclaw/src/plugin_generator.py +231 -0
- agent/openclaw/src/utils.py +341 -0
- agent/pocketflow/__init__.py +18 -0
- agent/pocketflow/example.py +221 -0
- agent/pocketflow/prompts/react_agent.py +46 -0
- agent/pocketflow/src/__init__.py +6 -0
- agent/pocketflow/src/agent.py +507 -0
- agent/pocketflow/src/agent_wrapper.py +159 -0
- agent/pocketflow/src/async_helper.py +92 -0
- agent/pocketflow/src/mcp_react_agent.py +279 -0
- agent/pocketflow/src/native_agent.py +74 -0
- agent/pocketflow/src/nodes.py +467 -0
- benchmark/__init__.py +0 -0
- benchmark/browser/benign.jsonl +34 -0
- benchmark/browser/direct.jsonl +85 -0
- benchmark/browser/indirect.jsonl +82 -0
- benchmark/code/benign.jsonl +0 -0
- benchmark/code/direct.jsonl +121 -0
- benchmark/code/indirect.jsonl +165 -0
- benchmark/crm/benign.jsonl +165 -0
- benchmark/crm/direct.jsonl +90 -0
- benchmark/crm/indirect.jsonl +150 -0
- benchmark/customer-service/benign.jsonl +160 -0
- benchmark/customer-service/direct.jsonl +100 -0
- benchmark/customer-service/indirect.jsonl +101 -0
- benchmark/finance/benign.jsonl +0 -0
- benchmark/finance/direct.jsonl +200 -0
- benchmark/finance/indirect.jsonl +200 -0
- benchmark/legal/benign.jsonl +0 -0
- benchmark/legal/direct.jsonl +200 -0
- benchmark/legal/indirect.jsonl +200 -0
- benchmark/macos/benign.jsonl +30 -0
- benchmark/macos/direct.jsonl +50 -0
- benchmark/macos/indirect.jsonl +50 -0
- benchmark/medical/benign.jsonl +642 -0
- benchmark/medical/direct.jsonl +229 -0
- benchmark/medical/indirect.jsonl +222 -0
- benchmark/os-filesystem/benign.jsonl +200 -0
- benchmark/os-filesystem/direct.jsonl +200 -0
- benchmark/os-filesystem/indirect.jsonl +200 -0
- benchmark/research/benign.jsonl +0 -0
- benchmark/research/direct.jsonl +119 -0
- benchmark/research/indirect.jsonl +125 -0
- benchmark/telecom/benign.jsonl +120 -0
- benchmark/telecom/direct.jsonl +161 -0
- benchmark/telecom/indirect.jsonl +166 -0
- benchmark/travel/benign.jsonl +130 -0
- benchmark/travel/direct.jsonl +105 -0
- benchmark/travel/indirect.jsonl +120 -0
- benchmark/windows/benign.jsonl +100 -0
- benchmark/windows/direct.jsonl +140 -0
- benchmark/windows/indirect.jsonl +107 -0
- benchmark/workflow/benign.jsonl +335 -0
- benchmark/workflow/direct.jsonl +78 -0
- benchmark/workflow/indirect.jsonl +107 -0
- cli/__init__.py +5 -0
- cli/main.py +182 -0
- cli/scaffold.py +334 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
- decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
- dt_arena/config/env.yaml +515 -0
- dt_arena/config/injection_mcp.yaml +430 -0
- dt_arena/config/mcp.yaml +642 -0
- dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
- dt_arena/envs/arxiv/docker-compose.yml +36 -0
- dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
- dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
- dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
- dt_arena/envs/atlassian/docker-compose.yml +72 -0
- dt_arena/envs/bigquery/docker-compose.yml +20 -0
- dt_arena/envs/booking/docker-compose.yml +59 -0
- dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
- dt_arena/envs/calendar/docker-compose.yml +42 -0
- dt_arena/envs/custom-website/docker-compose.yml +6 -0
- dt_arena/envs/customer_service/docker-compose.yml +59 -0
- dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
- dt_arena/envs/databricks/docker-compose.yml +51 -0
- dt_arena/envs/ecommerce/docker-compose.yml +6 -0
- dt_arena/envs/ers/docker-compose.yml +36 -0
- dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
- dt_arena/envs/finance/docker-compose.yml +23 -0
- dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
- dt_arena/envs/github/docker/docker-compose.yml +50 -0
- dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
- dt_arena/envs/gmail/docker-compose.yml +65 -0
- dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
- dt_arena/envs/google-form/docker-compose.yml +41 -0
- dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
- dt_arena/envs/googledocs/docker-compose.yml +78 -0
- dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
- dt_arena/envs/hospital/docker-compose.yml +27 -0
- dt_arena/envs/legal/docker-compose.yml +22 -0
- dt_arena/envs/linkedin/docker-compose.yml +63 -0
- dt_arena/envs/macos/docker-compose.yml +79 -0
- dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
- dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
- dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
- dt_arena/envs/paypal/docker-compose.yml +63 -0
- dt_arena/envs/research/docker-compose-hub.yml +13 -0
- dt_arena/envs/research/docker-compose.yml +24 -0
- dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
- dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
- dt_arena/envs/slack/docker-compose-hub.yml +28 -0
- dt_arena/envs/slack/docker-compose.yml +41 -0
- dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
- dt_arena/envs/snowflake/docker-compose.yml +44 -0
- dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
- dt_arena/envs/telecom/docker-compose.yml +17 -0
- dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
- dt_arena/envs/telegram/docker-compose.yml +62 -0
- dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
- dt_arena/envs/terminal/docker-compose.yml +26 -0
- dt_arena/envs/travel/docker-compose-hub.yml +19 -0
- dt_arena/envs/travel/docker-compose.yml +19 -0
- dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
- dt_arena/envs/whatsapp/docker-compose.yml +78 -0
- dt_arena/envs/windows/docker-compose.yml +71 -0
- dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
- dt_arena/envs/zoom/docker-compose.yml +40 -0
- dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
- dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
- dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
- dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
- dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
- dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
- dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
- dt_arena/injection_mcp_server/github/env_injection.py +206 -0
- dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
- dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
- dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
- dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
- dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
- dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
- dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
- dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
- dt_arena/injection_mcp_server/research/env_injection.py +616 -0
- dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
- dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
- dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
- dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
- dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
- dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
- dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
- dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
- dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
- dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
- dt_arena/mcp_server/atlassian/main.py +1554 -0
- dt_arena/mcp_server/atlassian/test_server.py +66 -0
- dt_arena/mcp_server/bigquery/main.py +333 -0
- dt_arena/mcp_server/booking/main.py +310 -0
- dt_arena/mcp_server/browser/main.py +1741 -0
- dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
- dt_arena/mcp_server/calendar/main.py +792 -0
- dt_arena/mcp_server/calendar/test_mcp.py +135 -0
- dt_arena/mcp_server/customer_service/main.py +1063 -0
- dt_arena/mcp_server/databricks/main.py +566 -0
- dt_arena/mcp_server/databricks/probe.py +102 -0
- dt_arena/mcp_server/ers/main.py +845 -0
- dt_arena/mcp_server/finance/__init__.py +87 -0
- dt_arena/mcp_server/finance/core/__init__.py +12 -0
- dt_arena/mcp_server/finance/core/data_loader.py +558 -0
- dt_arena/mcp_server/finance/core/portfolio.py +565 -0
- dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
- dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
- dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
- dt_arena/mcp_server/finance/injection/__init__.py +66 -0
- dt_arena/mcp_server/finance/injection/config.py +176 -0
- dt_arena/mcp_server/finance/injection/content.py +755 -0
- dt_arena/mcp_server/finance/injection/html.py +409 -0
- dt_arena/mcp_server/finance/injection/locations.py +167 -0
- dt_arena/mcp_server/finance/injection/methods.py +193 -0
- dt_arena/mcp_server/finance/injection/presets.py +1023 -0
- dt_arena/mcp_server/finance/main.py +361 -0
- dt_arena/mcp_server/finance/run_mcp.py +21 -0
- dt_arena/mcp_server/finance/run_web.py +26 -0
- dt_arena/mcp_server/finance/server/__init__.py +41 -0
- dt_arena/mcp_server/finance/server/extractor.py +1453 -0
- dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
- dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
- dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
- dt_arena/mcp_server/finance/server/mcp.py +451 -0
- dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
- dt_arena/mcp_server/finance/server/tools/account.py +88 -0
- dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
- dt_arena/mcp_server/finance/server/tools/social.py +73 -0
- dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
- dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
- dt_arena/mcp_server/finance/server/web.py +2139 -0
- dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
- dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
- dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
- dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
- dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
- dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
- dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
- dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
- dt_arena/mcp_server/github/main.py +441 -0
- dt_arena/mcp_server/gmail/main.py +1004 -0
- dt_arena/mcp_server/google_form/main.py +141 -0
- dt_arena/mcp_server/googledocs/main.py +458 -0
- dt_arena/mcp_server/hospital/mcp_server.py +458 -0
- dt_arena/mcp_server/legal/__init__.py +9 -0
- dt_arena/mcp_server/legal/core/__init__.py +14 -0
- dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
- dt_arena/mcp_server/legal/core/data_loader.py +266 -0
- dt_arena/mcp_server/legal/core/document_store.py +197 -0
- dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
- dt_arena/mcp_server/legal/main.py +89 -0
- dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
- dt_arena/mcp_server/legal/server/__init__.py +14 -0
- dt_arena/mcp_server/legal/server/mcp.py +2330 -0
- dt_arena/mcp_server/macos/client_test.py +270 -0
- dt_arena/mcp_server/macos/mcp_server.py +285 -0
- dt_arena/mcp_server/os-filesystem/main.py +1380 -0
- dt_arena/mcp_server/paypal/main.py +501 -0
- dt_arena/mcp_server/research/main.py +777 -0
- dt_arena/mcp_server/salesforce/main.py +2006 -0
- dt_arena/mcp_server/slack/main.py +318 -0
- dt_arena/mcp_server/snowflake/main.py +612 -0
- dt_arena/mcp_server/snowflake/probe.py +183 -0
- dt_arena/mcp_server/telecom/mcp_client.py +423 -0
- dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
- dt_arena/mcp_server/telegram/main.py +338 -0
- dt_arena/mcp_server/terminal/main.py +163 -0
- dt_arena/mcp_server/travel/client_test.py +16 -0
- dt_arena/mcp_server/travel/mcp_server.py +404 -0
- dt_arena/mcp_server/whatsapp/main.py +318 -0
- dt_arena/mcp_server/windows/client_test.py +270 -0
- dt_arena/mcp_server/windows/mcp_server.py +218 -0
- dt_arena/mcp_server/zoom/main.py +466 -0
- dt_arena/src/__init__.py +0 -0
- dt_arena/src/hooks/__init__.py +0 -0
- dt_arena/src/hooks/audit_log.py +30 -0
- dt_arena/src/hooks/hooks.json +3 -0
- dt_arena/src/run_benign.py +142 -0
- dt_arena/src/types/__init__.py +0 -0
- dt_arena/src/types/agent.py +441 -0
- dt_arena/src/types/attacks.py +2 -0
- dt_arena/src/types/environment.py +2 -0
- dt_arena/src/types/hooks.py +174 -0
- dt_arena/src/types/judge.py +52 -0
- dt_arena/src/types/red_teaming_trajectory.py +385 -0
- dt_arena/src/types/task.py +260 -0
- dt_arena/src/types/trajectory.py +315 -0
- dt_arena/utils/__init__.py +1 -0
- dt_arena/utils/atlassian/__init__.py +27 -0
- dt_arena/utils/atlassian/helpers.py +520 -0
- dt_arena/utils/bigquery/__init__.py +1 -0
- dt_arena/utils/bigquery/helpers.py +246 -0
- dt_arena/utils/calendar/__init__.py +1 -0
- dt_arena/utils/calendar/helpers.py +87 -0
- dt_arena/utils/customer_service/__init__.py +17 -0
- dt_arena/utils/customer_service/cs_env_client.py +940 -0
- dt_arena/utils/customer_service/helpers.py +339 -0
- dt_arena/utils/customer_service/judges/__init__.py +20 -0
- dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
- dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
- dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
- dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
- dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
- dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
- dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
- dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
- dt_arena/utils/customer_service/judges/text_utils.py +21 -0
- dt_arena/utils/databricks/__init__.py +2 -0
- dt_arena/utils/databricks/helpers.py +210 -0
- dt_arena/utils/finance/__init__.py +0 -0
- dt_arena/utils/finance/helpers.py +263 -0
- dt_arena/utils/github/__init__.py +1 -0
- dt_arena/utils/github/helpers.py +249 -0
- dt_arena/utils/gmail/__init__.py +1 -0
- dt_arena/utils/gmail/helpers.py +344 -0
- dt_arena/utils/google_form/__init__.py +2 -0
- dt_arena/utils/google_form/helpers.py +133 -0
- dt_arena/utils/legal/__init__.py +0 -0
- dt_arena/utils/legal/helpers.py +228 -0
- dt_arena/utils/macos/__init__.py +0 -0
- dt_arena/utils/macos/env_setup.py +215 -0
- dt_arena/utils/macos/helpers.py +61 -0
- dt_arena/utils/os_filesystem/__init__.py +1 -0
- dt_arena/utils/os_filesystem/helpers.py +366 -0
- dt_arena/utils/paypal/__init__.py +1 -0
- dt_arena/utils/paypal/helpers.py +178 -0
- dt_arena/utils/port_allocator.py +266 -0
- dt_arena/utils/research/__init__.py +0 -0
- dt_arena/utils/research/helpers.py +251 -0
- dt_arena/utils/salesforce/__init__.py +1 -0
- dt_arena/utils/salesforce/helpers.py +719 -0
- dt_arena/utils/slack/__init__.py +1 -0
- dt_arena/utils/slack/helpers.py +176 -0
- dt_arena/utils/snowflake/__init__.py +1 -0
- dt_arena/utils/snowflake/helpers.py +166 -0
- dt_arena/utils/telecom/__init__.py +1 -0
- dt_arena/utils/telecom/helpers.py +760 -0
- dt_arena/utils/telegram/__init__.py +0 -0
- dt_arena/utils/telegram/helpers.py +174 -0
- dt_arena/utils/terminal/__init__.py +0 -0
- dt_arena/utils/terminal/helpers.py +20 -0
- dt_arena/utils/travel/__init__.py +0 -0
- dt_arena/utils/travel/env_client.py +537 -0
- dt_arena/utils/travel/llm_judge.py +137 -0
- dt_arena/utils/travel/prompts.py +64 -0
- dt_arena/utils/utils/__init__.py +122 -0
- dt_arena/utils/whatsapp/__init__.py +0 -0
- dt_arena/utils/whatsapp/helpers.py +226 -0
- dt_arena/utils/windows/__init__.py +0 -0
- dt_arena/utils/windows/env_reset.py +224 -0
- dt_arena/utils/windows/env_setup.py +280 -0
- dt_arena/utils/windows/exfil_helpers.py +170 -0
- dt_arena/utils/windows/helpers.py +74 -0
- dt_arena/utils/zoom/__init__.py +1 -0
- dt_arena/utils/zoom/helpers.py +70 -0
- eval/__init__.py +1 -0
- eval/evaluation.py +426 -0
- eval/task_runner.py +449 -0
- utils/__init__.py +148 -0
- utils/agent_helpers.py +308 -0
- utils/agent_wrapper.py +189 -0
- utils/compose_utils.py +135 -0
- utils/config.py +77 -0
- utils/env_helpers.py +104 -0
- utils/eval_stats.py +88 -0
- utils/injection_helpers.py +429 -0
- utils/injection_mcp_helpers.py +152 -0
- utils/judge_helpers.py +181 -0
- utils/judge_utils.py +472 -0
- utils/llm.py +196 -0
- utils/logging.py +45 -0
- utils/mcp_helpers.py +232 -0
- utils/mcp_manager.py +235 -0
- utils/memory_guard.py +18 -0
- utils/red_teaming_sandbox.py +476 -0
- utils/reset_helpers.py +318 -0
- utils/resource_manager.py +370 -0
- utils/skill_helpers.py +447 -0
- utils/task_executor.py +904 -0
- utils/task_helpers.py +270 -0
- utils/template_helpers.py +179 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import json
|
|
5
|
+
import time
|
|
6
|
+
import traceback
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Awaitable, Callable, Dict, List, Optional, Protocol, runtime_checkable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ToolCallContext:
    """Snapshot of one MCP tool call that is about to be dispatched.

    Pre-hooks receive this object and may hand back a replacement
    ToolCallContext, e.g. with rewritten ``arguments`` or extra
    ``metadata``, before the call goes out.
    """

    # Required identification of the call, in positional order.
    framework: str
    server: str
    tool_name: str
    # Keyword arguments that will be passed to the tool.
    arguments: Dict[str, Any]
    # Optional extras; every instance gets its own fresh metadata dict.
    trace_id: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class ToolCallResult:
    """What came back from a dispatched MCP tool call.

    Post-hooks receive this object and may return a replacement
    ToolCallResult to rewrite the output seen by the caller.
    """

    # Whatever the underlying SDK returned; the shape is framework specific.
    raw: Any = None
    # True when the call failed.
    is_error: bool = False
    # Elapsed time of the call, in seconds.
    duration: float = 0.0
    # The exception raised by the call, if any.
    exception: Optional[BaseException] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@runtime_checkable
class ToolCallHook(Protocol):
    """Structural interface for tool-call hooks.

    A hook exposes up to two async callbacks. Returning ``None`` from a
    callback leaves the in-flight value untouched; returning a new context
    (pre) or result (post) replaces it.
    """

    async def on_pre_tool_call(self, ctx: ToolCallContext) -> Optional[ToolCallContext]:
        """Called before dispatch; may return a replacement context."""

    async def on_post_tool_call(self, ctx: ToolCallContext, result: ToolCallResult) -> Optional[ToolCallResult]:
        """Called after dispatch; may return a replacement result."""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# hooks.json lives in the "hooks" directory next to this module's package
# directory (two levels up from this file, then "hooks/hooks.json").
_HOOKS_CONFIG_PATH = Path(__file__).resolve().parents[1] / "hooks" / "hooks.json"


def _load_hooks_from_config() -> List["ToolCallHook"]:
    """Instantiate hooks listed in `dt_arena/src/hooks/hooks.json`.

    Expected format::

        {"hooks": ["dt_arena.src.hooks.audit_log:AuditHook", ...]}

    Each spec is ``module.path:ClassName``; the class is imported and
    instantiated with no arguments.

    Every failure mode degrades gracefully: a missing file, unparseable
    JSON, a root value that is not an object, a ``hooks`` value that is
    not a list, a non-string or badly shaped entry, or a class that fails
    to import or construct is reported on stdout and skipped.

    Returns:
        The successfully constructed hook instances, in config order.
    """
    if not _HOOKS_CONFIG_PATH.is_file():
        return []
    try:
        # JSON is UTF-8 by specification; do not depend on the locale.
        config = json.loads(_HOOKS_CONFIG_PATH.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[HookManager] failed to parse {_HOOKS_CONFIG_PATH}: {e}")
        return []

    # Valid JSON is not necessarily an object; guard before calling .get().
    if not isinstance(config, dict):
        print(
            f"[HookManager] failed to parse {_HOOKS_CONFIG_PATH}: "
            f"expected a JSON object, got {type(config).__name__}"
        )
        return []

    specs = config.get("hooks") or []
    if not isinstance(specs, list):
        print(
            f"[HookManager] failed to parse {_HOOKS_CONFIG_PATH}: "
            f"'hooks' must be a list, got {type(specs).__name__}"
        )
        return []

    hooks: List["ToolCallHook"] = []
    for spec in specs:
        # A non-string entry (number, null, object) has no rpartition();
        # treat it like any other malformed spec instead of crashing.
        if not isinstance(spec, str):
            print(f"[HookManager] invalid hook spec '{spec}' (expected 'module:ClassName')")
            continue
        module_name, sep, class_name = spec.rpartition(":")
        if not sep or not module_name or not class_name:
            print(f"[HookManager] invalid hook spec '{spec}' (expected 'module:ClassName')")
            continue
        try:
            cls = getattr(importlib.import_module(module_name), class_name)
            hooks.append(cls())
        except Exception as e:
            print(f"[HookManager] failed to load hook '{spec}': {e}")
    return hooks
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class HookManager:
    """Runs registered hooks around a framework's real tool-dispatch call.

    Hooks listed in ``dt_arena/src/hooks/hooks.json`` are auto-loaded on
    every construction, so any agent built via ``build_agent`` or directly
    inherits them without further wiring.
    """

    def __init__(self, hooks: Optional[List[ToolCallHook]] = None):
        """Create a manager.

        Args:
            hooks: Extra per-agent hooks, appended after the config-file hooks.
        """
        # Config-file hooks run first; explicit per-agent hooks run after.
        self._hooks: List[ToolCallHook] = _load_hooks_from_config() + list(hooks or [])

    def register(self, hook: ToolCallHook) -> None:
        """Append `hook` so it runs after all currently registered hooks."""
        self._hooks.append(hook)

    def clear(self) -> None:
        """Remove every registered hook, including config-file ones."""
        self._hooks.clear()

    @property
    def hooks(self) -> List[ToolCallHook]:
        """Return a copy of the hook list; mutating it does not affect the manager."""
        return list(self._hooks)

    async def _run_pre(self, ctx: ToolCallContext) -> ToolCallContext:
        """Thread `ctx` through each hook's `on_pre_tool_call`, in order.

        Hooks without the method are skipped. A non-None return value
        replaces the context passed to subsequent hooks.
        """
        for hook in self._hooks:
            fn = getattr(hook, "on_pre_tool_call", None)
            if fn is None:
                continue
            new_ctx = await fn(ctx)
            if new_ctx is not None:
                ctx = new_ctx
        return ctx

    async def _run_post(
        self, ctx: ToolCallContext, result: ToolCallResult
    ) -> ToolCallResult:
        """Thread `result` through each hook's `on_post_tool_call`, in order.

        Hooks without the method are skipped. A non-None return value
        replaces the result passed to subsequent hooks.
        """
        for hook in self._hooks:
            fn = getattr(hook, "on_post_tool_call", None)
            if fn is None:
                continue
            new_result = await fn(ctx, result)
            if new_result is not None:
                result = new_result
        return result

    async def wrap(
        self,
        ctx: ToolCallContext,
        call: Callable[[Dict[str, Any]], Awaitable[Any]],
    ) -> Any:
        """Run pre-hooks, dispatch `call(arguments)`, run post-hooks.

        `call` is a framework-specific async callable that performs the real
        MCP dispatch given the (possibly rewritten) arguments and returns
        whatever that SDK returns. Post-hooks run even on exception.

        Args:
            ctx: Description of the call; pre-hooks may replace it.
            call: Async callable invoked with the final `ctx.arguments`.

        Returns:
            The `raw` value of the result after post-hooks have run (or the
            direct return value of `call` when no hooks are registered).

        Raises:
            BaseException: Whatever `call` raised, re-raised unchanged after
                post-hooks have observed it.
            BaseException: The exception a post-hook placed on the result of
                a call that itself succeeded.
        """
        # Fast path: no hooks registered.
        if not self._hooks:
            return await call(ctx.arguments)

        ctx = await self._run_pre(ctx)

        start = time.perf_counter()
        result = ToolCallResult()
        try:
            result.raw = await call(ctx.arguments)
        except BaseException as exc:
            result.exception = exc
            result.is_error = True
            result.duration = time.perf_counter() - start
            # Post-hooks only observe the failure here: their return value is
            # discarded and the original exception always propagates. A hook
            # that itself fails must not mask the real error.
            try:
                await self._run_post(ctx, result)
            except Exception:
                traceback.print_exc()
            raise
        result.duration = time.perf_counter() - start

        final = await self._run_post(ctx, result)
        # The call succeeded, so any exception present now was put there by a
        # post-hook -- either on a replacement result or by mutating `result`
        # in place. Both forms must surface to the caller.
        if final.exception is not None:
            raise final.exception
        return final.raw
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Tuple, Dict, Any, Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class BaseJudge(ABC):
    """Abstract parent of all judges that score agent behavior.

    DEPRECATION NOTICE:
        `eval_task` and `eval_attack` no longer take a `trajectory`
        argument; it is deprecated and unsupported. Judges should base
        their evaluation on `agent_responses` only.
    """

    def __init__(self, domain: str, judge_model: str = "gpt-4o"):
        # Store both settings as plain attributes for subclasses to use.
        self.judge_model = judge_model
        self.domain = domain

    @abstractmethod
    def eval_task(self, agent_responses: List[str]) -> Tuple[bool, Dict[str, Any]]:
        """
        Decide whether the agent completed the task.

        Args:
            agent_responses: The agent's responses, one per turn.

        Returns:
            A ``(success, metadata)`` tuple; ``metadata`` carries extra
            details about the evaluation.

        Note:
            DEPRECATED: the `trajectory` parameter has been removed and is
            no longer supported.
        """

    @abstractmethod
    def eval_attack(self, agent_responses: List[str]) -> Optional[Tuple[bool, Dict[str, Any]]]:
        """
        Decide whether an attack against the agent succeeded.

        Args:
            agent_responses: The agent's responses, one per turn.

        Returns:
            A ``(success, metadata)`` tuple for adversarial tasks, or
            ``None`` for benign tasks that need no attack evaluation.

        Note:
            DEPRECATED: the `trajectory` parameter has been removed and is
            no longer supported.
        """
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Optional, Any, List, Dict, Literal
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class RedTeamingResult:
    """
    Result from a red-teaming attack run.

    This class encapsulates the outcome of a red-teaming attack,
    including success status, detailed results, and the complete trajectory.
    """
    # Whether the attack succeeded.
    success: bool
    # Detailed result payload.
    result: Dict[str, Any]
    # Full attack trajectory; may be absent.
    trajectory: Optional['RedTeamingTrajectory'] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the result to a dictionary.

        Returns:
            A dict with keys ``success``, ``result`` and ``trajectory``.
            ``trajectory`` is the trajectory's own ``to_dict()`` output, or
            ``None`` when no trajectory is attached.
        """
        # `trajectory` defaults to None, so it must not be dereferenced blindly.
        trajectory = self.trajectory.to_dict() if self.trajectory is not None else None
        return {
            "success": self.success,
            "result": self.result,
            "trajectory": trajectory,
        }
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class RedTeamingTrajectory:
|
|
30
|
+
"""
|
|
31
|
+
A class for building and manipulating red-teaming attack trajectories.
|
|
32
|
+
|
|
33
|
+
This class tracks the complete lifecycle of a red-teaming attack:
|
|
34
|
+
- Attack configuration and metadata
|
|
35
|
+
- Decision-making process
|
|
36
|
+
- Query attempts and target responses
|
|
37
|
+
- Judge evaluations
|
|
38
|
+
- Environment/MCP injections
|
|
39
|
+
- Skills loaded and used
|
|
40
|
+
- Final attack outcome
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
domain: str,
|
|
46
|
+
task_id: str,
|
|
47
|
+
risk_category: str,
|
|
48
|
+
threat_model: Literal["indirect", "direct"],
|
|
49
|
+
original_task: Optional[str] = None,
|
|
50
|
+
malicious_goal: Optional[str] = None,
|
|
51
|
+
agent_model: Optional[str] = None,
|
|
52
|
+
victim_model: Optional[str] = None,
|
|
53
|
+
victim_arch: Optional[str] = None,
|
|
54
|
+
judge_model: Optional[str] = None,
|
|
55
|
+
max_attack_iterations: int = 10,
|
|
56
|
+
):
|
|
57
|
+
"""
|
|
58
|
+
Initialize a new red-teaming trajectory.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
domain: Task domain (e.g., "travel", "crm")
|
|
62
|
+
risk_category: Risk category (e.g., "accommodation-preference-violation")
|
|
63
|
+
task_id: Unique task identifier
|
|
64
|
+
threat_model: Threat model type - must be "indirect" or "direct"
|
|
65
|
+
original_task: The original benign task description
|
|
66
|
+
malicious_goal: The malicious objective to achieve
|
|
67
|
+
agent_model: LLM model used by red-teaming agent
|
|
68
|
+
victim_model: LLM model used by victim agent
|
|
69
|
+
victim_arch: Architecture type of victim agent (e.g., "pocketflow", "openaisdk", "langchain")
|
|
70
|
+
judge_model: LLM model used by judge
|
|
71
|
+
max_attack_iterations: Maximum attack attempts allowed
|
|
72
|
+
"""
|
|
73
|
+
# Validate threat_model
|
|
74
|
+
if threat_model not in ["indirect", "direct"]:
|
|
75
|
+
raise ValueError(f"threat_model must be 'indirect' or 'direct', got: {threat_model}")
|
|
76
|
+
|
|
77
|
+
self.data = {
|
|
78
|
+
"task_info": {
|
|
79
|
+
"task_id": task_id,
|
|
80
|
+
"domain": domain,
|
|
81
|
+
"risk_category": risk_category,
|
|
82
|
+
"threat_model": threat_model,
|
|
83
|
+
"original_task": original_task,
|
|
84
|
+
"malicious_goal": malicious_goal,
|
|
85
|
+
},
|
|
86
|
+
"attack_config": {
|
|
87
|
+
"agent_model": agent_model,
|
|
88
|
+
"victim_model": victim_model,
|
|
89
|
+
"victim_arch": victim_arch,
|
|
90
|
+
"judge_model": judge_model,
|
|
91
|
+
"max_attack_iterations": max_attack_iterations,
|
|
92
|
+
},
|
|
93
|
+
"attack_result": {
|
|
94
|
+
"success": False,
|
|
95
|
+
"duration": 0.0,
|
|
96
|
+
"timestamp": datetime.now().isoformat(),
|
|
97
|
+
"successful_attack_sequence": [],
|
|
98
|
+
},
|
|
99
|
+
"attack_trajectory": []
|
|
100
|
+
}
|
|
101
|
+
self._start_time: Optional[float] = None
|
|
102
|
+
self._next_step_id: int = 1
|
|
103
|
+
|
|
104
|
+
@classmethod
|
|
105
|
+
def load(cls, filepath: str) -> 'RedTeamingTrajectory':
|
|
106
|
+
"""
|
|
107
|
+
Load trajectory from a JSON file.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
filepath: Path to trajectory JSON file
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
RedTeamingTrajectory instance with loaded data
|
|
114
|
+
"""
|
|
115
|
+
with open(filepath, 'r', encoding='utf-8') as f:
|
|
116
|
+
data = json.load(f)
|
|
117
|
+
|
|
118
|
+
# Support both old and new format
|
|
119
|
+
task_info = data.get("task_info") or data.get("attack_info", {})
|
|
120
|
+
attack_config = data.get("attack_config", {})
|
|
121
|
+
|
|
122
|
+
instance = cls(
|
|
123
|
+
domain=task_info.get("domain", ""),
|
|
124
|
+
risk_category=task_info.get("risk_category", ""),
|
|
125
|
+
task_id=task_info.get("task_id", ""),
|
|
126
|
+
threat_model=task_info.get("threat_model", "indirect"), # Default to indirect for backward compatibility
|
|
127
|
+
original_task=task_info.get("original_task"),
|
|
128
|
+
malicious_goal=task_info.get("malicious_goal"),
|
|
129
|
+
agent_model=attack_config.get("agent_model"),
|
|
130
|
+
victim_model=attack_config.get("victim_model") or attack_config.get("target_model"), # backward compat
|
|
131
|
+
victim_arch=attack_config.get("victim_arch"),
|
|
132
|
+
judge_model=attack_config.get("judge_model"),
|
|
133
|
+
max_attack_iterations=attack_config.get("max_attack_iterations", 10),
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# Load full data
|
|
137
|
+
instance.data = data
|
|
138
|
+
return instance
|
|
139
|
+
|
|
140
|
+
def append_action(
    self,
    role: str,
    action: Optional[str] = None,
    state: Optional[Any] = None,
    reasoning: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
):
    """
    Record one step of any kind in the attack trajectory.

    Every other ``append_*`` recorder funnels into this method. The stored
    step always has ``step_id`` and ``role``; the remaining keys are added
    in the order reasoning, action, state, metadata when they qualify.

    Args:
        role: Role of the step ("attacker", "victim", "judge", "tool")
        action: Action name; stored only when truthy
        state: State/response; stored whenever it is not None (so falsy
            values such as False, "" or {} are kept)
        reasoning: Reasoning behind the action; stored only when truthy
        metadata: Additional metadata; stored only when truthy
    """
    entry: Dict[str, Any] = {"step_id": self._next_step_id, "role": role}

    # (key, value, keep?) in the order the keys must appear in the step.
    optional_fields = (
        ("reasoning", reasoning, bool(reasoning)),
        ("action", action, bool(action)),
        ("state", state, state is not None),
        ("metadata", metadata, bool(metadata)),
    )
    for key, value, keep in optional_fields:
        if keep:
            entry[key] = value

    self.data["attack_trajectory"].append(entry)
    self._next_step_id += 1
|
|
182
|
+
|
|
183
|
+
def append_attacker_action(
    self,
    action: str,
    reasoning: Optional[str] = None,
    **kwargs
):
    """
    Record a step taken by the attacker.

    Args:
        action: Action name (dynamically determined from action space)
        reasoning: Reasoning behind the action
        **kwargs: Action-specific values, stored as the step's metadata
    """
    # No extra keyword arguments means "no metadata" rather than an empty dict.
    extra = kwargs or None
    self.append_action(role="attacker", action=action, reasoning=reasoning, metadata=extra)
|
|
203
|
+
|
|
204
|
+
def append_victim_response(
    self,
    response: str,
    query: str,
    trajectory: Optional[List[Dict[str, Any]]] = None
):
    """
    Record what the victim agent answered.

    The response becomes the step's state; the query (and the victim's own
    trajectory, when one is given and non-empty) go into the metadata.

    Args:
        response: Victim agent's response
        query: Query sent to victim
        trajectory: Victim agent's execution trajectory
    """
    victim_trace = {"victim_trajectory": trajectory} if trajectory else {}
    details = {"query": query, **victim_trace}

    self.append_action(role="victim", state=response, metadata=details)
|
|
227
|
+
|
|
228
|
+
def append_verifiable_judge(
    self,
    attack_result: Optional[Dict[str, Any]] = None,
    task_result: Optional[Dict[str, Any]] = None,
):
    """
    Record the outcome of the verifiable (programmatic) judge.

    Attack and task verdicts share a single step: the step's state maps
    "attack"/"task" to the success flag, and its metadata holds the full
    breakdown for each verdict that was supplied.

    Args:
        attack_result: Full result from the verifiable attack judge
            (keys: success, message, metadata)
        task_result: Full result from the verifiable task judge
            (keys: success, message, metadata). Only present for indirect model.
    """
    verdicts: Dict[str, Any] = {}
    breakdown: Dict[str, Any] = {}

    for label, result in (("attack", attack_result), ("task", task_result)):
        # A missing or empty result contributes nothing to the step.
        if not result:
            continue
        succeeded = result.get("success", False)
        verdicts[label] = succeeded
        breakdown[label] = {
            "success": succeeded,
            "explanation": result.get("message", ""),
            "metadata": result.get("metadata", {}),
        }

    self.append_action(role="verifiable_judge", state=verdicts, metadata=breakdown)
|
|
268
|
+
|
|
269
|
+
def append_feedback_judge(
    self,
    failure_reason: str = "",
    improvement_suggestions: str = "",
    feedback_details: Optional[Dict[str, Any]] = None,
):
    """
    Record the LLM feedback judge's assessment.

    The step is always stored with state ``False``; the judge's text goes
    into the step metadata.

    Args:
        failure_reason: Why the attack failed according to the LLM judge
        improvement_suggestions: Suggestions for improving the attack
        feedback_details: Full parsed structured feedback from LLM judge;
            included only when truthy
    """
    details: Dict[str, Any] = dict(
        failure_reason=failure_reason,
        improvement_suggestions=improvement_suggestions,
    )
    if feedback_details:
        details["feedback_details"] = feedback_details

    self.append_action(role="feedback_judge", state=False, metadata=details)
|
|
294
|
+
|
|
295
|
+
def set_success(self, success: bool):
    """Record the final verdict on whether the attack succeeded."""
    outcome = self.data["attack_result"]
    outcome["success"] = success
|
|
298
|
+
|
|
299
|
+
def set_successful_attack_sequence(self, attack_sequence: List[Dict[str, Any]]):
    """
    Store the injection steps that produced a successful attack.

    The list is stored as given (no copy is made).

    Args:
        attack_sequence: List of injection steps from InjectionBuffer.to_attack_instance()
    """
    outcome = self.data["attack_result"]
    outcome["successful_attack_sequence"] = attack_sequence
|
|
307
|
+
|
|
308
|
+
def set_original_task(self, task: str):
    """Store the description of the original task under task_info."""
    info = self.data["task_info"]
    info["original_task"] = task
|
|
311
|
+
|
|
312
|
+
def set_malicious_goal(self, goal: str):
    """Store the malicious goal under task_info."""
    info = self.data["task_info"]
    info["malicious_goal"] = goal
|
|
315
|
+
|
|
316
|
+
def start_timer(self):
    """Remember the current wall-clock time (epoch seconds) as the attack's start."""
    started_at = datetime.now()
    self._start_time = started_at.timestamp()
|
|
319
|
+
|
|
320
|
+
def stop_timer(self):
|
|
321
|
+
"""Stop timing and update duration."""
|
|
322
|
+
# Duration is the wall-clock difference between now and the value stored
# by start_timer(), rounded to milliseconds.
# NOTE(review): the truthiness test also treats a start time of 0.0 as
# "not started"; `is not None` would be the stricter check — confirm.
# NOTE(review): indentation was lost in this view, so it is unclear whether
# the timestamp refresh below is inside this `if` — verify against the file.
if self._start_time:
|
|
323
|
+
end_time = datetime.now().timestamp()
|
|
324
|
+
self.data["attack_result"]["duration"] = round(end_time - self._start_time, 3)
|
|
325
|
+
self.data["attack_result"]["timestamp"] = datetime.now().isoformat()
|
|
326
|
+
|
|
327
|
+
def get_summary(self) -> Dict[str, Any]:
    """
    Get a summary of the attack trajectory.

    Tolerates data in the older layout (``attack_info`` instead of
    ``task_info``) and missing sections, falling back to the same defaults
    that ``load`` uses instead of raising ``KeyError``.

    Returns:
        Dictionary with summary statistics: task identification fields,
        the success flag, per-type step counts, duration and the total
        number of trajectory steps.
    """
    data = self.data
    # Same old/new section fallback as load().
    task_info = data.get("task_info") or data.get("attack_info") or {}
    attack_result = data.get("attack_result") or {}
    trajectory = data.get("attack_trajectory") or []

    # Count different action types in a single pass.
    # NOTE(review): steps built by append_action() carry "role"/"action" but
    # no "type" key, so these counters only match steps that were recorded
    # with an explicit "type" — confirm whether "action" should be counted too.
    counts = dict.fromkeys(
        ("query_attempt", "skill_load", "env_injection", "mcp_injection"), 0
    )
    for step in trajectory:
        kind = step.get("type")
        if isinstance(kind, str) and kind in counts:
            counts[kind] += 1

    return {
        "task_id": task_info.get("task_id", ""),
        "domain": task_info.get("domain", ""),
        "risk_category": task_info.get("risk_category", ""),
        "threat_model": task_info.get("threat_model", "indirect"),
        "success": attack_result.get("success", False),
        "num_attempts": counts["query_attempt"],
        "num_skills_loaded": counts["skill_load"],
        "num_env_injections": counts["env_injection"],
        "num_mcp_injections": counts["mcp_injection"],
        "duration": attack_result.get("duration", 0.0),
        "trajectory_steps": len(trajectory)
    }
|
|
354
|
+
|
|
355
|
+
def to_dict(self) -> Dict[str, Any]:
    """
    Expose the trajectory as a plain dictionary.

    Returns:
        The underlying data dictionary itself (not a copy), so changes made
        by the caller are visible to this object.
    """
    payload = self.data
    return payload
|
|
358
|
+
|
|
359
|
+
def save(self, filepath: str) -> str:
    """
    Save trajectory to a JSON file.

    The data is serialized in memory before the target file is opened, so
    a serialization failure (e.g. a step whose state is not
    JSON-serializable) leaves an existing file untouched instead of
    truncating it to partial JSON.

    Args:
        filepath: Output file path

    Returns:
        Path to saved file

    Raises:
        TypeError: If the trajectory contains values json cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    payload = json.dumps(self.data, indent=2, ensure_ascii=False)

    # Ensure directory exists (a bare filename needs no directory).
    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(payload)

    return filepath
|
|
376
|
+
|
|
377
|
+
def __repr__(self) -> str:
    """Return a short debug string built from get_summary(): task id, success, attempts, steps."""
    info = self.get_summary()
    fields = ", ".join((
        f"task_id={info['task_id']}",
        f"success={info['success']}",
        f"attempts={info['num_attempts']}",
        f"steps={info['trajectory_steps']}",
    ))
    return f"RedTeamingTrajectory({fields})"
|