decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import json
5
+ import time
6
+ import traceback
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, Protocol, runtime_checkable
10
+
11
+
12
@dataclass
class ToolCallContext:
    """Description of an MCP tool call that is currently in flight.

    Pre-hooks receive this object. A pre-hook may return a modified
    ToolCallContext (for example with rewritten `arguments` or extra
    `metadata`); that replacement is what gets dispatched.
    """

    # Required fields, in positional order.
    framework: str
    server: str
    tool_name: str
    arguments: Dict[str, Any]
    # Optional fields. `metadata` uses a factory so instances never share
    # one dict.
    trace_id: Optional[str] = field(default=None)
    metadata: Dict[str, Any] = field(default_factory=dict)
25
+
26
+
27
@dataclass
class ToolCallResult:
    """What came back from a dispatched MCP tool call.

    Post-hooks receive this object and may return a modified
    ToolCallResult to rewrite the output.
    """

    # Whatever the underlying SDK returned; the shape is framework specific.
    raw: Any = field(default=None)
    is_error: bool = field(default=False)
    # Duration of the call, in seconds.
    duration: float = field(default=0.0)
    # Set when the call raised.
    exception: Optional[BaseException] = field(default=None)
39
+
40
+
41
@runtime_checkable
class ToolCallHook(Protocol):
    """Structural interface that hook implementations satisfy.

    Either method may be omitted (return None). When a method returns a new
    context or result, that value replaces the in-flight one.
    """

    async def on_pre_tool_call(
        self, ctx: "ToolCallContext"
    ) -> Optional["ToolCallContext"]:
        """Pre-dispatch callback; return a replacement context or None."""

    async def on_post_tool_call(
        self, ctx: "ToolCallContext", result: "ToolCallResult"
    ) -> Optional["ToolCallResult"]:
        """Post-dispatch callback; return a replacement result or None."""
56
+
57
+
58
_HOOKS_CONFIG_PATH = Path(__file__).resolve().parents[1] / "hooks" / "hooks.json"


def _load_hooks_from_config() -> List["ToolCallHook"]:
    """Instantiate hooks listed in `dt_arena/src/hooks/hooks.json`.

    Expected format::

        {"hooks": ["dt_arena.src.hooks.audit_log:AuditHook", ...]}

    Each spec is ``module.path:ClassName``; the class is imported and
    instantiated with no arguments.

    Every failure mode degrades gracefully: a missing file, unparseable
    JSON, a JSON root that is not an object, a ``hooks`` value that is not a
    list, a spec that is not a ``module:ClassName`` string, or a hook whose
    import/construction raises. Each problem is reported with ``print`` and
    skipped; this function does not raise for bad configuration.

    Returns:
        The successfully instantiated hooks, in config order (possibly empty).
    """
    if not _HOOKS_CONFIG_PATH.is_file():
        return []
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        config = json.loads(_HOOKS_CONFIG_PATH.read_text(encoding="utf-8"))
    except Exception as e:
        print(f"[HookManager] failed to parse {_HOOKS_CONFIG_PATH}: {e}")
        return []

    # A valid JSON document need not be an object; guard before `.get`.
    if not isinstance(config, dict):
        print(
            f"[HookManager] failed to parse {_HOOKS_CONFIG_PATH}: "
            f"expected a JSON object, got {type(config).__name__}"
        )
        return []

    specs = config.get("hooks") or []
    # A bare string here would otherwise be iterated character by character.
    if not isinstance(specs, list):
        print(
            f"[HookManager] failed to parse {_HOOKS_CONFIG_PATH}: "
            f"'hooks' must be a list, got {type(specs).__name__}"
        )
        return []

    hooks: List[ToolCallHook] = []
    for spec in specs:
        if isinstance(spec, str):
            module_name, sep, class_name = spec.rpartition(":")
        else:
            # Non-string entries (numbers, objects, null) are malformed specs.
            module_name = sep = class_name = ""
        if not sep or not module_name or not class_name:
            print(f"[HookManager] invalid hook spec '{spec}' (expected 'module:ClassName')")
            continue
        try:
            cls = getattr(importlib.import_module(module_name), class_name)
            hooks.append(cls())
        except Exception as e:
            # Import errors, missing attributes and constructor failures all
            # skip just this hook.
            print(f"[HookManager] failed to load hook '{spec}': {e}")
    return hooks
92
+
93
+
94
class HookManager:
    """Runs registered hooks around a framework's real tool-dispatch call.

    Hooks listed in ``dt_arena/src/hooks/hooks.json`` are auto-loaded on
    every construction, so any agent built via ``build_agent`` or directly
    inherits them without further wiring.
    """

    def __init__(self, hooks: Optional[List["ToolCallHook"]] = None):
        """Build the hook chain.

        Args:
            hooks: Extra per-agent hooks appended after the config-file ones.
        """
        # Config-file hooks run first; explicit per-agent hooks run after.
        self._hooks: List["ToolCallHook"] = _load_hooks_from_config() + list(hooks or [])

    def register(self, hook: "ToolCallHook") -> None:
        """Append `hook` to the end of the chain."""
        self._hooks.append(hook)

    def clear(self) -> None:
        """Remove every hook, including those loaded from the config file."""
        self._hooks.clear()

    @property
    def hooks(self) -> List["ToolCallHook"]:
        """A copy of the current hook list (mutating it does not affect the manager)."""
        return list(self._hooks)

    async def _run_pre(self, ctx: "ToolCallContext") -> "ToolCallContext":
        """Thread `ctx` through every hook's `on_pre_tool_call`, in order.

        Hooks lacking the method are skipped; a non-None return value
        replaces the context seen by later hooks and by the dispatch.
        """
        for hook in self._hooks:
            fn = getattr(hook, "on_pre_tool_call", None)
            if fn is None:
                continue
            new_ctx = await fn(ctx)
            if new_ctx is not None:
                ctx = new_ctx
        return ctx

    async def _run_post(
        self, ctx: "ToolCallContext", result: "ToolCallResult"
    ) -> "ToolCallResult":
        """Thread `result` through every hook's `on_post_tool_call`, in order.

        Hooks lacking the method are skipped; a non-None return value
        replaces the result seen by later hooks.
        """
        for hook in self._hooks:
            fn = getattr(hook, "on_post_tool_call", None)
            if fn is None:
                continue
            new_result = await fn(ctx, result)
            if new_result is not None:
                result = new_result
        return result

    async def wrap(
        self,
        ctx: "ToolCallContext",
        call: Callable[[Dict[str, Any]], Awaitable[Any]],
    ) -> Any:
        """Run pre-hooks, dispatch `call(arguments)`, run post-hooks.

        `call` is a framework-specific async callable that performs the real
        MCP dispatch given the (possibly rewritten) arguments and returns
        whatever that SDK returns. Post-hooks run even on exception.

        Args:
            ctx: Context describing the call; pre-hooks may replace it.
            call: Awaitable-returning callable invoked with `ctx.arguments`.

        Returns:
            The `raw` value of the result after post-hooks have run (or the
            direct return value of `call` when no hooks are registered).

        Raises:
            BaseException: Whatever `call` raised, re-raised unchanged after
                post-hooks have observed it. On this path the post-hooks'
                return value is ignored and any `Exception` they raise is
                printed and suppressed.
            BaseException: On the success path, the `exception` carried by
                the final result if a post-hook set one, whether by
                returning a new result or by mutating the one it was given.
        """
        # Fast path: no hooks registered.
        if not self._hooks:
            return await call(ctx.arguments)

        ctx = await self._run_pre(ctx)

        start = time.perf_counter()
        result = ToolCallResult()
        try:
            result.raw = await call(ctx.arguments)
        except BaseException as exc:
            result.exception = exc
            result.is_error = True
            result.duration = time.perf_counter() - start
            try:
                await self._run_post(ctx, result)
            except Exception:
                # A failing post-hook must not mask the original error.
                traceback.print_exc()
            raise

        result.duration = time.perf_counter() - start
        final = await self._run_post(ctx, result)
        # The dispatch itself succeeded, so any exception present here was
        # put there by a post-hook. Do not compare against
        # `result.exception`: a hook that mutated `result` in place would
        # make the two identical and the veto would be silently dropped.
        if final.exception is not None:
            raise final.exception
        return final.raw
@@ -0,0 +1,52 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Tuple, Dict, Any, Optional
3
+
4
+
5
class BaseJudge(ABC):
    """Abstract base for judges that evaluate agent behavior.

    DEPRECATION NOTICE:
        `eval_task` and `eval_attack` no longer accept a `trajectory`
        argument; it is deprecated and unsupported. Implementations should
        evaluate using `agent_responses` only.
    """

    def __init__(self, domain: str, judge_model: str = "gpt-4o"):
        """Store the task domain and the name of the judging model."""
        self.domain = domain
        self.judge_model = judge_model

    @abstractmethod
    def eval_task(self, agent_responses: List[str]) -> Tuple[bool, Dict[str, Any]]:
        """Decide whether the agent completed the task.

        Args:
            agent_responses: Responses produced by the agent, one per turn.

        Returns:
            ``(success, metadata)`` where ``metadata`` carries additional
            information about the evaluation result.

        Note:
            DEPRECATED: the `trajectory` parameter has been removed and is
            no longer supported.
        """

    @abstractmethod
    def eval_attack(self, agent_responses: List[str]) -> Optional[Tuple[bool, Dict[str, Any]]]:
        """Decide whether an attack against the agent succeeded.

        Args:
            agent_responses: Responses produced by the agent, one per turn.

        Returns:
            ``(success, metadata)`` for adversarial tasks, or ``None`` for
            benign tasks where no attack evaluation is needed.

        Note:
            DEPRECATED: the `trajectory` parameter has been removed and is
            no longer supported.
        """
@@ -0,0 +1,385 @@
1
+ import os
2
+ import json
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ from typing import Optional, Any, List, Dict, Literal
6
+
7
+
8
@dataclass
class RedTeamingResult:
    """
    Result from a red-teaming attack run.

    This class encapsulates the outcome of a red-teaming attack,
    including success status, detailed results, and the complete trajectory.
    """
    success: bool
    result: Dict[str, Any]
    # Optional: a result may be created without an attached trajectory.
    trajectory: Optional['RedTeamingTrajectory'] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the result to a dictionary.

        Returns:
            A dict with keys ``success``, ``result`` and ``trajectory``.
            ``trajectory`` is the trajectory's own ``to_dict()`` output, or
            ``None`` when no trajectory is attached (the field's default).
        """
        # `trajectory` defaults to None, so guard before delegating.
        trajectory = None if self.trajectory is None else self.trajectory.to_dict()
        return {
            "success": self.success,
            "result": self.result,
            "trajectory": trajectory,
        }
27
+
28
+
29
+ class RedTeamingTrajectory:
30
+ """
31
+ A class for building and manipulating red-teaming attack trajectories.
32
+
33
+ This class tracks the complete lifecycle of a red-teaming attack:
34
+ - Attack configuration and metadata
35
+ - Decision-making process
36
+ - Query attempts and target responses
37
+ - Judge evaluations
38
+ - Environment/MCP injections
39
+ - Skills loaded and used
40
+ - Final attack outcome
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ domain: str,
46
+ task_id: str,
47
+ risk_category: str,
48
+ threat_model: Literal["indirect", "direct"],
49
+ original_task: Optional[str] = None,
50
+ malicious_goal: Optional[str] = None,
51
+ agent_model: Optional[str] = None,
52
+ victim_model: Optional[str] = None,
53
+ victim_arch: Optional[str] = None,
54
+ judge_model: Optional[str] = None,
55
+ max_attack_iterations: int = 10,
56
+ ):
57
+ """
58
+ Initialize a new red-teaming trajectory.
59
+
60
+ Args:
61
+ domain: Task domain (e.g., "travel", "crm")
62
+ risk_category: Risk category (e.g., "accommodation-preference-violation")
63
+ task_id: Unique task identifier
64
+ threat_model: Threat model type - must be "indirect" or "direct"
65
+ original_task: The original benign task description
66
+ malicious_goal: The malicious objective to achieve
67
+ agent_model: LLM model used by red-teaming agent
68
+ victim_model: LLM model used by victim agent
69
+ victim_arch: Architecture type of victim agent (e.g., "pocketflow", "openaisdk", "langchain")
70
+ judge_model: LLM model used by judge
71
+ max_attack_iterations: Maximum attack attempts allowed
72
+ """
73
+ # Validate threat_model
74
+ if threat_model not in ["indirect", "direct"]:
75
+ raise ValueError(f"threat_model must be 'indirect' or 'direct', got: {threat_model}")
76
+
77
+ self.data = {
78
+ "task_info": {
79
+ "task_id": task_id,
80
+ "domain": domain,
81
+ "risk_category": risk_category,
82
+ "threat_model": threat_model,
83
+ "original_task": original_task,
84
+ "malicious_goal": malicious_goal,
85
+ },
86
+ "attack_config": {
87
+ "agent_model": agent_model,
88
+ "victim_model": victim_model,
89
+ "victim_arch": victim_arch,
90
+ "judge_model": judge_model,
91
+ "max_attack_iterations": max_attack_iterations,
92
+ },
93
+ "attack_result": {
94
+ "success": False,
95
+ "duration": 0.0,
96
+ "timestamp": datetime.now().isoformat(),
97
+ "successful_attack_sequence": [],
98
+ },
99
+ "attack_trajectory": []
100
+ }
101
+ self._start_time: Optional[float] = None
102
+ self._next_step_id: int = 1
103
+
104
@classmethod
def load(cls, filepath: str) -> 'RedTeamingTrajectory':
    """
    Load trajectory from a JSON file.

    Both the current layout (``task_info`` / ``victim_model``) and the
    older layout (``attack_info`` / ``target_model``) are accepted.

    Args:
        filepath: Path to trajectory JSON file

    Returns:
        RedTeamingTrajectory instance with loaded data. Step numbering
        continues after the steps already present in the file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Support both old and new format
    task_info = data.get("task_info") or data.get("attack_info", {})
    attack_config = data.get("attack_config", {})

    instance = cls(
        domain=task_info.get("domain", ""),
        risk_category=task_info.get("risk_category", ""),
        task_id=task_info.get("task_id", ""),
        # Default to indirect for backward compatibility
        threat_model=task_info.get("threat_model", "indirect"),
        original_task=task_info.get("original_task"),
        malicious_goal=task_info.get("malicious_goal"),
        agent_model=attack_config.get("agent_model"),
        # "target_model" is the older name for the same setting
        victim_model=attack_config.get("victim_model") or attack_config.get("target_model"),
        victim_arch=attack_config.get("victim_arch"),
        judge_model=attack_config.get("judge_model"),
        max_attack_iterations=attack_config.get("max_attack_iterations", 10),
    )

    # Adopt the loaded document as-is, but keep the constructor-built
    # section for anything the file lacks (e.g. an old-format file without
    # "task_info"), so the other methods can index self.data safely.
    constructed = instance.data
    instance.data = data
    for section, default in constructed.items():
        if data.get(section) is None:
            data[section] = default

    # Continue numbering after the loaded steps instead of restarting at 1,
    # which would hand out step ids that already exist in the trajectory.
    steps = data["attack_trajectory"]
    known_ids = [
        step["step_id"]
        for step in steps
        if isinstance(step, dict) and isinstance(step.get("step_id"), int)
    ]
    instance._next_step_id = max([len(steps), *known_ids]) + 1
    return instance
139
+
140
def append_action(
    self,
    role: str,
    action: Optional[str] = None,
    state: Optional[Any] = None,
    reasoning: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None
):
    """
    Record one step in the attack trajectory.

    Every step carries ``step_id`` and ``role``; the remaining fields are
    added only when supplied, in the order reasoning, action, state,
    metadata.

    Args:
        role: Role that produced the step (e.g. "attacker", "victim")
        action: Action name; omitted from the step when empty
        state: State/response; omitted from the step only when None
        reasoning: Reasoning behind the action; omitted when empty
        metadata: Additional metadata; omitted when empty
    """
    entry: Dict[str, Any] = {"step_id": self._next_step_id, "role": role}

    # (key, value, include?) in the order the keys should appear in the step.
    optional_fields = (
        ("reasoning", reasoning, bool(reasoning)),
        ("action", action, bool(action)),
        ("state", state, state is not None),
        ("metadata", metadata, bool(metadata)),
    )
    for key, value, present in optional_fields:
        if present:
            entry[key] = value

    self.data["attack_trajectory"].append(entry)
    self._next_step_id += 1
182
+
183
def append_attacker_action(self, action: str, reasoning: Optional[str] = None, **kwargs):
    """
    Record a step taken by the attacker.

    Args:
        action: Action name (dynamically determined from action space)
        reasoning: Reasoning behind the action
        **kwargs: Action-specific values, stored as the step's metadata
    """
    # With no extra keyword arguments, pass None rather than an empty dict.
    extra = kwargs or None
    self.append_action(role="attacker", action=action, reasoning=reasoning, metadata=extra)
203
+
204
def append_victim_response(
    self,
    response: str,
    query: str,
    trajectory: Optional[List[Dict[str, Any]]] = None
):
    """
    Record the victim agent's reply to a query.

    The reply is stored as the step's state; the query (and the victim's
    own execution trajectory, when one is given) go into the metadata.

    Args:
        response: Victim agent's response
        query: Query sent to victim
        trajectory: Victim agent's execution trajectory
    """
    if trajectory:
        step_metadata: Dict[str, Any] = {"query": query, "victim_trajectory": trajectory}
    else:
        step_metadata = {"query": query}

    self.append_action(role="victim", state=response, metadata=step_metadata)
227
+
228
def append_verifiable_judge(
    self,
    attack_result: Optional[Dict[str, Any]] = None,
    task_result: Optional[Dict[str, Any]] = None,
):
    """
    Record the outcome of the verifiable (programmatic) judge.

    Attack and task verdicts share one step: the step's state maps
    "attack"/"task" to the success flag, and the metadata keeps the
    success flag, explanation and judge metadata for each verdict given.

    Args:
        attack_result: Full result from the verifiable attack judge
            (keys: success, message, metadata)
        task_result: Full result from the verifiable task judge
            (keys: success, message, metadata). Only present for indirect model.
    """
    verdicts: Dict[str, Any] = {}
    details: Dict[str, Any] = {}

    for label, result in (("attack", attack_result), ("task", task_result)):
        if not result:
            # Missing or empty results leave no trace in the step.
            continue
        succeeded = result.get("success", False)
        verdicts[label] = succeeded
        details[label] = {
            "success": succeeded,
            "explanation": result.get("message", ""),
            "metadata": result.get("metadata", {}),
        }

    self.append_action(role="verifiable_judge", state=verdicts, metadata=details)
268
+
269
def append_feedback_judge(
    self,
    failure_reason: str = "",
    improvement_suggestions: str = "",
    feedback_details: Optional[Dict[str, Any]] = None,
):
    """
    Record feedback produced by the LLM feedback judge.

    The step is always stored with state ``False``.

    Args:
        failure_reason: Why the attack failed according to the LLM judge
        improvement_suggestions: Suggestions for improving the attack
        feedback_details: Full parsed structured feedback from LLM judge;
            stored only when non-empty
    """
    feedback: Dict[str, Any] = dict(
        failure_reason=failure_reason,
        improvement_suggestions=improvement_suggestions,
    )
    if feedback_details:
        feedback.update(feedback_details=feedback_details)

    self.append_action(role="feedback_judge", state=False, metadata=feedback)
294
+
295
def set_success(self, success: bool):
    """Record whether the attack ultimately succeeded."""
    outcome = self.data["attack_result"]
    outcome["success"] = success
298
+
299
def set_successful_attack_sequence(self, attack_sequence: List[Dict[str, Any]]):
    """
    Store the injection steps that led to a successful attack.

    The list is stored by reference, not copied.

    Args:
        attack_sequence: List of injection steps from InjectionBuffer.to_attack_instance()
    """
    outcome = self.data["attack_result"]
    outcome["successful_attack_sequence"] = attack_sequence
307
+
308
def set_original_task(self, task: str):
    """Store the description of the original (benign) task."""
    info = self.data["task_info"]
    info["original_task"] = task
311
+
312
def set_malicious_goal(self, goal: str):
    """Store the malicious goal the attack is trying to achieve."""
    info = self.data["task_info"]
    info["malicious_goal"] = goal
315
+
316
def start_timer(self):
    """Remember the current time (as a POSIX timestamp) as the start of the attack run."""
    started_at = datetime.now()
    self._start_time = started_at.timestamp()
319
+
320
def stop_timer(self):
    """
    Stop timing and update duration.

    Does nothing when the timer was never started. Otherwise stores the
    elapsed seconds (rounded to milliseconds) and the finish time in
    ``attack_result``.
    """
    # Compare against None explicitly: a start time of 0.0 is falsy but valid.
    if self._start_time is None:
        return

    # Read the clock once so the stored timestamp is exactly the instant
    # the duration was measured against.
    finished = datetime.now()
    outcome = self.data["attack_result"]
    outcome["duration"] = round(finished.timestamp() - self._start_time, 3)
    # NOTE(review): assumes the timestamp is refreshed only when a timer was
    # running; confirm against the original indentation.
    outcome["timestamp"] = finished.isoformat()
326
+
327
def get_summary(self) -> Dict[str, Any]:
    """
    Get a summary of the attack trajectory.

    Returns:
        Dictionary with task identification, the success flag, per-kind
        step counts, the recorded duration and the total number of steps.
    """
    trajectory = self.data["attack_trajectory"]
    task_info = self.data["task_info"]
    outcome = self.data["attack_result"]

    # Steps written by append_action carry "role"/"action" but no "type"
    # key, so matching on "type" alone reports zero attempts for them.
    # Each victim step records one query sent to the victim, so it counts
    # as an attempt alongside legacy "query_attempt" steps.
    # NOTE(review): the remaining counters still match only the legacy
    # "type" labels; the equivalent action names are not visible here.
    legacy_labels = ("skill_load", "env_injection", "mcp_injection")
    legacy_counts = dict.fromkeys(legacy_labels, 0)
    num_attempts = 0
    for step in trajectory:
        step_type = step.get("type")
        if step_type == "query_attempt" or step.get("role") == "victim":
            num_attempts += 1
        if step_type in legacy_labels:
            legacy_counts[step_type] += 1

    return {
        "task_id": task_info["task_id"],
        "domain": task_info["domain"],
        "risk_category": task_info["risk_category"],
        "threat_model": task_info["threat_model"],
        "success": outcome["success"],
        "num_attempts": num_attempts,
        "num_skills_loaded": legacy_counts["skill_load"],
        "num_env_injections": legacy_counts["env_injection"],
        "num_mcp_injections": legacy_counts["mcp_injection"],
        "duration": outcome["duration"],
        "trajectory_steps": len(trajectory)
    }
354
+
355
def to_dict(self) -> Dict[str, Any]:
    """Return the trajectory document itself (the live dict, not a copy)."""
    document = self.data
    return document
358
+
359
def save(self, filepath: str) -> str:
    """
    Write the trajectory to disk as indented UTF-8 JSON.

    The parent directory is created when it does not exist yet.

    Args:
        filepath: Output file path

    Returns:
        Path to saved file
    """
    # A bare file name has no directory part; fall back to the current directory.
    parent_dir = os.path.dirname(filepath) or "."
    os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as out:
        json.dump(self.data, out, indent=2, ensure_ascii=False)

    return filepath
376
+
377
+ def __repr__(self) -> str:
378
+ summary = self.get_summary()
379
+ return (
380
+ f"RedTeamingTrajectory("
381
+ f"task_id={summary['task_id']}, "
382
+ f"success={summary['success']}, "
383
+ f"attempts={summary['num_attempts']}, "
384
+ f"steps={summary['trajectory_steps']})"
385
+ )