decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,395 @@
1
+ from typing import Any, Dict, Optional, List
2
+
3
+ from agents.mcp.server import (
4
+ MCPServerSse as _MCPServerSse,
5
+ MCPServerStreamableHttp as _MCPServerStreamableHttp,
6
+ MCPServerStdio as _MCPServerStdio,
7
+ )
8
+ from agents.run_context import RunContextWrapper
9
+ from mcp import Tool as MCPTool
10
+
11
+ from dt_arena.src.types.hooks import HookManager, ToolCallContext
12
+
13
+
14
class _HookedCallToolMixin:
    """Mixin that sends `call_tool` through a HookManager when one is attached.

    Meant to be listed before an MCP server class in the bases so that
    `super().call_tool` resolves to the real server implementation. When no
    hook manager is set, the call is forwarded untouched.
    """

    _hook_manager: Optional[HookManager]

    async def call_tool(self, tool_name, arguments, meta=None):
        """Invoke a tool, wrapping the call in the hook manager if present.

        Args:
            tool_name: Name of the tool to invoke.
            arguments: Tool arguments. A dict copy is placed in the hook
                context (an empty dict when `arguments` is falsy).
            meta: Forwarded unchanged to the underlying `call_tool`.

        Returns:
            The underlying `call_tool` result when no hook manager is set,
            otherwise whatever `hook_manager.wrap(...)` returns.
        """
        manager = getattr(self, "_hook_manager", None)
        if manager is not None:
            server_label = self.name or "unknown"
            call_arguments = dict(arguments) if arguments else {}
            context = ToolCallContext(
                framework="openaisdk",
                server=server_label,
                tool_name=tool_name,
                arguments=call_arguments,
            )

            def _dispatch(args):
                # Hands back the un-awaited coroutine; the hook manager is
                # given the (possibly rewritten) arguments to call with.
                return super(_HookedCallToolMixin, self).call_tool(tool_name, args, meta)

            return await manager.wrap(context, _dispatch)

        # No hooks configured: behave exactly like the wrapped server.
        return await super().call_tool(tool_name, arguments, meta)
34
+
35
+
36
+ def _is_cancel_scope_error(e: BaseException) -> bool:
37
+ """Check if an exception is a benign cancel scope error from anyio.
38
+
39
+ These errors occur when async cleanup runs in a different task context
40
+ (e.g., via AsyncHelper) and are harmless - cleanup still completes.
41
+ """
42
+ return "cancel scope" in str(e).lower()
43
+
44
+
45
async def _cleanup_with_cancel_scope_suppression(server) -> None:
    """Close an MCP server's exit stack without logging cancel scope errors.

    The SDK's cleanup() method logs errors but doesn't re-raise them, so this
    helper closes `server.exit_stack` itself in order to keep cancel scope
    errors out of the log. Any other error is logged and swallowed.

    Args:
        server: Server object exposing `_cleanup_lock`, `exit_stack` and
            `session`. `session` is reset to None whether or not closing
            succeeded.
    """
    async with server._cleanup_lock:
        try:
            await server.exit_stack.aclose()
        except BaseException as exc:
            if _is_cancel_scope_error(exc):
                # Benign: nothing to report (the finally clause still runs).
                return
            # Anything else is logged the same way the SDK logs it.
            from agents.logger import logger as sdk_logger
            sdk_logger.error(f"Error cleaning up server: {exc}")
        finally:
            server.session = None
62
+
63
+
64
class MCPServerSse(_HookedCallToolMixin, _MCPServerSse):
    """
    SSE MCP server wrapper with tool description injection.

    The result of list_tools() is post-processed so that every tool named in
    the injection configuration gets its description suffixed or overridden.
    Tool calls go through `_HookedCallToolMixin`.
    """

    def __init__(
        self,
        params,
        cache_tools_list: bool = False,
        name: str | None = None,
        client_session_timeout_seconds: float | None = 5,
        tool_filter=None,
        use_structured_content: bool = False,
        max_retry_attempts: int = 0,
        retry_backoff_seconds_base: float = 1.0,
        message_handler=None,
        # Injection configuration
        tool_injections: Optional[Dict[str, Any]] = None,
        hook_manager: Optional[HookManager] = None,
    ):
        """
        Create the SSE server wrapper.

        Args:
            params: SSE server parameters (url, headers, etc.)
            cache_tools_list: Whether to cache the tools list
            name: Readable name for the server
            client_session_timeout_seconds: Read timeout for client session
            tool_filter: Optional tool filter
            use_structured_content: Whether to use structured content
            max_retry_attempts: Number of retry attempts
            retry_backoff_seconds_base: Base delay for exponential backoff
            message_handler: Optional message handler
            tool_injections: Dict mapping tool_name -> ToolInjection for this server
            hook_manager: Optional HookManager; when set, call_tool is routed through it
        """
        # Everything except the two wrapper-specific options goes to the SDK class.
        sdk_options = dict(
            params=params,
            cache_tools_list=cache_tools_list,
            name=name,
            client_session_timeout_seconds=client_session_timeout_seconds,
            tool_filter=tool_filter,
            use_structured_content=use_structured_content,
            max_retry_attempts=max_retry_attempts,
            retry_backoff_seconds_base=retry_backoff_seconds_base,
            message_handler=message_handler,
        )
        super().__init__(**sdk_options)
        self._hook_manager = hook_manager
        self._tool_injections = tool_injections if tool_injections else {}

    async def list_tools(
        self,
        run_context: RunContextWrapper[Any] | None = None,
        agent=None,
    ) -> List[MCPTool]:
        """
        Return the server's tools with configured description injections applied.
        """
        upstream_tools = await super().list_tools(run_context, agent)
        return self._apply_injections(upstream_tools)

    def _apply_injections(self, tools: List[MCPTool]) -> List[MCPTool]:
        """Return `tools` with each configured injection applied by tool name."""
        injections = self._tool_injections
        if not injections:
            # Nothing configured: hand back the list untouched.
            return tools

        result = []
        for tool in tools:
            injection = injections.get(tool.name)
            result.append(self._inject_tool_description(tool, injection) if injection else tool)
        return result

    def _inject_tool_description(self, tool: MCPTool, injection) -> MCPTool:
        """
        Build a copy of `tool` whose description reflects `injection`.

        Args:
            tool: The original MCPTool
            injection: ToolInjection object with type and content

        Returns:
            A new MCPTool with the modified description, or `tool` itself
            when the injection type is not recognised.
        """
        base_description = tool.description or ""
        kind = injection.type

        if kind not in ("suffix", "override"):
            print(f"[WARNING] Unknown injection type '{injection.type}' for tool '{tool.name}' on server '{self.name}'")
            return tool

        if kind == "suffix":
            description = f"{base_description}\n{injection.content}"
        else:
            description = injection.content

        # MCPTool is a Pydantic model, so a modified copy comes from model_copy.
        return tool.model_copy(update={"description": description})

    async def cleanup(self):
        """Clean up the server, suppressing cancel scope errors."""
        await _cleanup_with_cancel_scope_suppression(self)
174
+
175
+
176
class MCPServerStreamableHttp(_HookedCallToolMixin, _MCPServerStreamableHttp):
    """
    Streamable HTTP MCP server wrapper with tool description injection.

    The result of list_tools() is post-processed so that every tool named in
    the injection configuration gets its description suffixed or overridden.
    Tool calls go through `_HookedCallToolMixin`.
    """

    def __init__(
        self,
        params,
        cache_tools_list: bool = False,
        name: str | None = None,
        client_session_timeout_seconds: float | None = 5,
        tool_filter=None,
        use_structured_content: bool = False,
        max_retry_attempts: int = 0,
        retry_backoff_seconds_base: float = 1.0,
        message_handler=None,
        # Injection configuration
        tool_injections: Optional[Dict[str, Any]] = None,
        hook_manager: Optional[HookManager] = None,
    ):
        """
        Create the Streamable HTTP server wrapper.

        Args:
            params: Streamable HTTP server parameters (url, headers, etc.)
            cache_tools_list: Whether to cache the tools list
            name: Readable name for the server
            client_session_timeout_seconds: Read timeout for client session
            tool_filter: Optional tool filter
            use_structured_content: Whether to use structured content
            max_retry_attempts: Number of retry attempts
            retry_backoff_seconds_base: Base delay for exponential backoff
            message_handler: Optional message handler
            tool_injections: Dict mapping tool_name -> ToolInjection for this server
            hook_manager: Optional HookManager; when set, call_tool is routed through it
        """
        # Everything except the two wrapper-specific options goes to the SDK class.
        sdk_options = dict(
            params=params,
            cache_tools_list=cache_tools_list,
            name=name,
            client_session_timeout_seconds=client_session_timeout_seconds,
            tool_filter=tool_filter,
            use_structured_content=use_structured_content,
            max_retry_attempts=max_retry_attempts,
            retry_backoff_seconds_base=retry_backoff_seconds_base,
            message_handler=message_handler,
        )
        super().__init__(**sdk_options)
        self._hook_manager = hook_manager
        self._tool_injections = tool_injections if tool_injections else {}

    async def list_tools(
        self,
        run_context: RunContextWrapper[Any] | None = None,
        agent=None,
    ) -> List[MCPTool]:
        """
        Return the server's tools with configured description injections applied.
        """
        upstream_tools = await super().list_tools(run_context, agent)
        return self._apply_injections(upstream_tools)

    def _apply_injections(self, tools: List[MCPTool]) -> List[MCPTool]:
        """Return `tools` with each configured injection applied by tool name."""
        injections = self._tool_injections
        if not injections:
            # Nothing configured: hand back the list untouched.
            return tools

        result = []
        for tool in tools:
            injection = injections.get(tool.name)
            result.append(self._inject_tool_description(tool, injection) if injection else tool)
        return result

    def _inject_tool_description(self, tool: MCPTool, injection) -> MCPTool:
        """
        Build a copy of `tool` whose description reflects `injection`.

        Args:
            tool: The original MCPTool
            injection: ToolInjection object with type and content

        Returns:
            A new MCPTool with the modified description, or `tool` itself
            when the injection type is not recognised.
        """
        base_description = tool.description or ""
        kind = injection.type

        if kind not in ("suffix", "override"):
            print(f"[WARNING] Unknown injection type '{injection.type}' for tool '{tool.name}' on server '{self.name}'")
            return tool

        if kind == "suffix":
            description = f"{base_description}\n{injection.content}"
        else:
            description = injection.content

        # MCPTool is a Pydantic model, so a modified copy comes from model_copy.
        return tool.model_copy(update={"description": description})

    async def cleanup(self):
        """Clean up the server, suppressing cancel scope errors."""
        await _cleanup_with_cancel_scope_suppression(self)
285
+
286
+
287
class MCPServerStdio(_HookedCallToolMixin, _MCPServerStdio):
    """
    Stdio MCP server wrapper with tool description injection.

    The result of list_tools() is post-processed so that every tool named in
    the injection configuration gets its description suffixed or overridden.
    Tool calls go through `_HookedCallToolMixin`.
    """

    def __init__(
        self,
        params,
        cache_tools_list: bool = False,
        name: str | None = None,
        client_session_timeout_seconds: float | None = 5,
        tool_filter=None,
        use_structured_content: bool = False,
        max_retry_attempts: int = 0,
        retry_backoff_seconds_base: float = 1.0,
        message_handler=None,
        # Injection configuration
        tool_injections: Optional[Dict[str, Any]] = None,
        hook_manager: Optional[HookManager] = None,
    ):
        """
        Create the stdio server wrapper.

        Args:
            params: Stdio server parameters (command, args, env, etc.)
            cache_tools_list: Whether to cache the tools list
            name: Readable name for the server
            client_session_timeout_seconds: Read timeout for client session
            tool_filter: Optional tool filter
            use_structured_content: Whether to use structured content
            max_retry_attempts: Number of retry attempts
            retry_backoff_seconds_base: Base delay for exponential backoff
            message_handler: Optional message handler
            tool_injections: Dict mapping tool_name -> ToolInjection for this server
            hook_manager: Optional HookManager; when set, call_tool is routed through it
        """
        # Everything except the two wrapper-specific options goes to the SDK class.
        sdk_options = dict(
            params=params,
            cache_tools_list=cache_tools_list,
            name=name,
            client_session_timeout_seconds=client_session_timeout_seconds,
            tool_filter=tool_filter,
            use_structured_content=use_structured_content,
            max_retry_attempts=max_retry_attempts,
            retry_backoff_seconds_base=retry_backoff_seconds_base,
            message_handler=message_handler,
        )
        super().__init__(**sdk_options)
        self._hook_manager = hook_manager
        self._tool_injections = tool_injections if tool_injections else {}

    async def list_tools(
        self,
        run_context: RunContextWrapper[Any] | None = None,
        agent=None,
    ) -> List[MCPTool]:
        """
        Return the server's tools with configured description injections applied.
        """
        upstream_tools = await super().list_tools(run_context, agent)
        return self._apply_injections(upstream_tools)

    def _apply_injections(self, tools: List[MCPTool]) -> List[MCPTool]:
        """Return `tools` with each configured injection applied by tool name."""
        injections = self._tool_injections
        if not injections:
            # Nothing configured: hand back the list untouched.
            return tools

        result = []
        for tool in tools:
            injection = injections.get(tool.name)
            result.append(self._inject_tool_description(tool, injection) if injection else tool)
        return result

    def _inject_tool_description(self, tool: MCPTool, injection) -> MCPTool:
        """
        Build a copy of `tool` whose description reflects `injection`.

        Args:
            tool: The original MCPTool
            injection: ToolInjection object with type and content

        Returns:
            A new MCPTool with the modified description, or `tool` itself
            when the injection type is not recognised.
        """
        base_description = tool.description or ""
        kind = injection.type

        if kind not in ("suffix", "override"):
            print(f"[WARNING] Unknown injection type '{injection.type}' for tool '{tool.name}' on server '{self.name}'")
            return tool

        if kind == "suffix":
            description = f"{base_description}\n{injection.content}"
        else:
            description = injection.content

        # MCPTool is a Pydantic model, so a modified copy comes from model_copy.
        return tool.model_copy(update={"description": description})

    async def cleanup(self):
        """Clean up the server, suppressing cancel scope errors."""
        await _cleanup_with_cancel_scope_suppression(self)