decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,409 @@
1
+ import asyncio
2
+ import os
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from mcp import ClientSession, Tool
6
+ from mcp.client.sse import sse_client
7
+ from mcp.client.stdio import StdioServerParameters, stdio_client
8
+ from mcp.client.streamable_http import streamablehttp_client
9
+
10
+ from mcp.server import Server
11
+ from mcp.types import TextContent, Tool as McpTool
12
+
13
+ from claude_code_sdk import (
14
+ SdkMcpTool,
15
+ McpSdkServerConfig,
16
+ )
17
+
18
+ from dt_arena.src.types.hooks import HookManager, ToolCallContext
19
+
20
+
21
class MCPProxyServer:
    """
    Proxy server that connects to a real MCP server, applies tool description
    injections, and exposes the modified tools via Claude SDK's in-process
    MCP server mechanism.

    Key features:
    - connect() either succeeds within a timeout or raises (fail-fast)
    - Supports http, sse, and stdio transports
    - Applies tool description injections (suffix or override)
    - Forwards tool calls to the real MCP server

    The transport and the client session are opened inside a single background
    task, which keeps them open until disconnect() is called.
    """

    # Seconds connect() waits for the handshake and the initial tool listing.
    _CONNECT_TIMEOUT = 30.0

    def __init__(
        self,
        name: str,
        transport: str,
        url: Optional[str] = None,
        command: Optional[str] = None,
        args: Optional[List[str]] = None,
        env: Optional[Dict[str, str]] = None,
        tool_injections: Optional[Dict[str, Any]] = None,
        hook_manager: Optional[HookManager] = None,
    ):
        """
        Initialize the MCP Proxy Server.

        Args:
            name: Unique name for this server
            transport: Transport type ("http", "sse", or "stdio")
            url: URL for http/sse transports
            command: Command for stdio transport
            args: Arguments for stdio transport
            env: Environment variables for stdio transport
            tool_injections: Dict mapping tool_name -> ToolInjection for description modification
            hook_manager: Optional hook manager whose wrap() is placed around every
                forwarded tool call. When omitted, calls are forwarded directly.
        """
        self.name = name
        self.transport = transport
        self.url = url
        self.command = command
        self.args = args or []
        self.env = env or {}
        self._tool_injections = tool_injections or {}
        self._hook_manager = hook_manager

        # Connection state
        self._session: Optional[ClientSession] = None
        self._connected = False
        self._tools: List[Tool] = []
        self._sdk_tools: List[SdkMcpTool] = []

        # Background task for keeping connection alive
        self._connection_task: Optional[asyncio.Task] = None
        self._session_ready = asyncio.Event()
        self._shutdown_event = asyncio.Event()

    async def connect(self) -> None:
        """
        Connect to the real MCP server and fetch tools.

        Returns immediately if the proxy is already connected.

        Raises:
            ValueError: If connection parameters are invalid
            ConnectionError: If connection to the MCP server fails; the
                underlying error, when available, is attached as __cause__
            TimeoutError: If connection times out
        """
        if self._connected:
            return

        # Validate parameters
        if self.transport in ("http", "sse") and not self.url:
            raise ValueError(f"URL is required for {self.transport} transport")
        if self.transport == "stdio" and not self.command:
            raise ValueError("Command is required for stdio transport")

        # A previous failed or dropped attempt may have left the events set;
        # without clearing them the wait below would return immediately.
        self._session_ready.clear()
        self._shutdown_event.clear()

        # Start connection in background task
        task = asyncio.create_task(self._maintain_connection())
        self._connection_task = task

        # Wait for session to be ready with timeout
        try:
            await asyncio.wait_for(self._session_ready.wait(), timeout=self._CONNECT_TIMEOUT)
        except asyncio.TimeoutError:
            await self._cancel_connection_task()
            raise TimeoutError(f"Timeout connecting to MCP server '{self.name}'")

        if not self._connected:
            # _maintain_connection sets the event and re-raises on failure, so
            # the task is normally finished here. Reading its exception both
            # preserves the root cause and marks it as retrieved for asyncio.
            cause = None
            if task.done() and not task.cancelled():
                cause = task.exception()
            self._connection_task = None
            raise ConnectionError(f"Failed to connect to MCP server '{self.name}'") from cause

        # Create SDK tools from the fetched tools
        self._create_sdk_tools()

    async def _cancel_connection_task(self) -> None:
        """Cancel the background connection task (if any) and wait for it to finish."""
        task = self._connection_task
        self._connection_task = None
        if task is None:
            return

        if task.done():
            # Consume a stored exception so asyncio does not report it as
            # never retrieved when the task object is garbage collected.
            if not task.cancelled():
                task.exception()
            return

        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        except Exception:
            # Teardown errors were already printed by _maintain_connection;
            # cleanup is best-effort and must not fail the caller.
            pass

    async def _maintain_connection(self) -> None:
        """Background task to maintain the MCP connection."""
        connectors = {
            "sse": self._connect_sse,
            "http": self._connect_http,
            "stdio": self._connect_stdio,
        }
        try:
            connector = connectors.get(self.transport)
            if connector is None:
                raise ValueError(f"Unsupported transport type: {self.transport}")
            await connector()
        except Exception as e:
            print(f"[ERROR] Connection to MCP server '{self.name}' failed: {e}")
            self._session_ready.set()  # Unblock waiters
            raise

    async def _run_session(self, read: Any, write: Any) -> None:
        """
        Run a client session over an already opened transport.

        Initializes the session, fetches the tool list, signals readiness and
        then blocks until the shutdown event is set. Connection state is reset
        when the session ends for any reason.
        """
        session = ClientSession(read, write)
        self._session = session
        try:
            async with session:
                await session.initialize()

                # Fetch tools
                response = await session.list_tools()
                self._tools = response.tools
                self._connected = True
                self._session_ready.set()

                # Keep connection alive until shutdown
                await self._shutdown_event.wait()
        finally:
            # The session is unusable once its context has exited.
            self._connected = False
            self._session = None

    async def _connect_sse(self) -> None:
        """Connect using SSE transport."""
        async with sse_client(self.url) as (read, write):
            await self._run_session(read, write)

    async def _connect_http(self) -> None:
        """Connect using HTTP (streamable) transport."""
        # The third element (session id getter) is not used by the proxy.
        async with streamablehttp_client(self.url) as (read, write, _get_session_id):
            await self._run_session(read, write)

    async def _connect_stdio(self) -> None:
        """Connect using stdio transport."""
        # Build environment with PATH; explicitly configured variables win.
        env = {"PATH": os.environ.get("PATH", ""), **self.env}

        server_params = StdioServerParameters(
            command=self.command,
            args=self.args,
            env=env,
        )

        async with stdio_client(server_params) as (read, write):
            await self._run_session(read, write)

    def _create_sdk_tools(self) -> None:
        """Create SDK tools from the fetched MCP tools with injected descriptions."""
        self._sdk_tools = [
            self._create_sdk_tool(
                tool,
                self._apply_injection(tool.name, tool.description or ""),
            )
            for tool in self._tools
        ]

    def _create_sdk_tool(self, tool: Tool, description: str) -> SdkMcpTool:
        """Create a single SDK tool with proper closure for the handler."""
        # Bind the name in this scope so each handler forwards to its own tool.
        tool_name = tool.name

        async def handler(args: Dict[str, Any]) -> Dict[str, Any]:
            return await self._call_tool(tool_name, args)

        return SdkMcpTool(
            name=tool.name,
            description=description,
            input_schema=tool.inputSchema,
            handler=handler,
        )

    def _apply_injection(self, tool_name: str, original_description: str) -> str:
        """
        Apply description injection to a tool.

        The injection may be a dict or an object exposing ``type`` and
        ``content``. A missing type defaults to "suffix" and missing content
        is treated as an empty string, for both shapes.

        Returns:
            The description followed by a newline and the content ("suffix"),
            the content alone ("override"), or the original description when
            there is no injection or its type is unknown.
        """
        injection = self._tool_injections.get(tool_name)
        if not injection:
            return original_description

        if isinstance(injection, dict):
            injection_type = injection.get("type", "suffix")
            injection_content = injection.get("content")
        else:
            injection_type = getattr(injection, "type", "suffix")
            injection_content = getattr(injection, "content", "")
        if injection_content is None:
            # Avoid rendering the literal text "None" into a description.
            injection_content = ""

        if injection_type == "suffix":
            return f"{original_description}\n{injection_content}"
        if injection_type == "override":
            return injection_content

        print(f"[WARNING] Unknown injection type '{injection_type}' for tool '{tool_name}' on server '{self.name}'")
        return original_description

    async def _call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """
        Forward a tool call to the real MCP server.

        Returns:
            Dict with "content" (list of {"type": "text", "text": ...} items)
            and "is_error". Failures while calling the tool are reported as an
            error result rather than raised.

        Raises:
            ConnectionError: If the proxy is not connected
        """
        session = self._session
        if not session or not self._connected:
            raise ConnectionError(f"Not connected to MCP server '{self.name}'")

        async def _dispatch(args: Dict[str, Any]):
            return await session.call_tool(tool_name, arguments=args)

        try:
            if self._hook_manager is None:
                # No hooks configured: forward the call directly.
                result = await _dispatch(arguments)
            else:
                ctx = ToolCallContext(
                    framework="claudesdk",
                    server=self.name,
                    tool_name=tool_name,
                    arguments=arguments,
                )
                result = await self._hook_manager.wrap(ctx, _dispatch)

            # Convert MCP result to SDK format. Items without a ``text``
            # attribute are represented by their string form.
            content = [
                {"type": "text", "text": item.text if hasattr(item, "text") else str(item)}
                for item in getattr(result, "content", [])
            ]

            return {
                "content": content,
                "is_error": getattr(result, 'isError', False),
            }
        except Exception as e:
            return {
                "content": [{"type": "text", "text": f"Error calling tool '{tool_name}': {e}"}],
                "is_error": True,
            }

    def get_sdk_server_config(self) -> McpSdkServerConfig:
        """
        Get the SDK server configuration for use with ClaudeCodeOptions.

        Returns:
            McpSdkServerConfig that can be passed to ClaudeCodeOptions.mcp_servers

        Raises:
            RuntimeError: If the server is not connected
        """
        if not self._connected:
            raise RuntimeError(f"MCP Proxy Server '{self.name}' is not connected. Call connect() first.")

        # Create MCP server instance
        server = Server(self.name, version="1.0.0")

        # Name -> tool lookup used to route calls; captured at creation time.
        tool_map = {tool.name: tool for tool in self._sdk_tools}

        # Register list_tools handler
        @server.list_tools()
        async def list_tools() -> List[McpTool]:
            """Return the list of available tools with injected descriptions."""
            return [
                McpTool(
                    name=sdk_tool.name,
                    description=sdk_tool.description,
                    inputSchema=sdk_tool.input_schema if isinstance(sdk_tool.input_schema, dict) else {},
                )
                for sdk_tool in self._sdk_tools
            ]

        # Register call_tool handler that forwards to real MCP server
        @server.call_tool()
        async def call_tool(name: str, arguments: Dict[str, Any]) -> Any:
            """Execute a tool by forwarding to the real MCP server."""
            if name not in tool_map:
                raise ValueError(f"Tool '{name}' not found")

            result = await tool_map[name].handler(arguments)

            # Convert to MCP format.
            # NOTE(review): only text items are returned and the handler's
            # "is_error" flag is not forwarded from here.
            return [
                TextContent(type="text", text=item["text"])
                for item in result.get("content", [])
                if item.get("type") == "text"
            ]

        return McpSdkServerConfig(type="sdk", name=self.name, instance=server)

    async def disconnect(self) -> None:
        """Disconnect from the MCP server and clean up resources."""
        self._shutdown_event.set()

        try:
            await self._cancel_connection_task()
        finally:
            # Always leave the proxy in a clean, reconnectable state.
            self._session = None
            self._connected = False
            self._tools = []
            self._sdk_tools = []
            self._session_ready.clear()
            self._shutdown_event.clear()

    @property
    def tools(self) -> List[Tool]:
        """Get the list of tools from the real MCP server."""
        return self._tools

    @property
    def sdk_tools(self) -> List[SdkMcpTool]:
        """Get the list of SDK tools with injected descriptions."""
        return self._sdk_tools

    @property
    def is_connected(self) -> bool:
        """Check if the proxy is connected to the real MCP server."""
        return self._connected
362
+
363
+
364
async def create_proxy_servers(
    mcp_configs: Dict[str, Dict[str, Any]],
    tool_injections: Optional[Dict[str, Dict[str, Any]]] = None,
    hook_manager: Optional[HookManager] = None,
) -> Dict[str, MCPProxyServer]:
    """
    Create and connect proxy servers for multiple MCP configurations.

    Servers are connected one at a time, in the iteration order of
    ``mcp_configs``. If any connection fails, the proxies connected so far
    are disconnected before the error is raised.

    Args:
        mcp_configs: Dict mapping server_name -> config dict with:
            - transport: "http", "sse", or "stdio" (defaults to "sse")
            - url: URL for http/sse
            - command: Command for stdio
            - args: Args for stdio
            - env: Env vars for stdio
        tool_injections: Dict mapping server_name -> Dict[tool_name -> ToolInjection]
        hook_manager: Optional hook manager handed to every proxy that is created

    Returns:
        Dict mapping server_name -> connected MCPProxyServer

    Raises:
        ConnectionError: If any server fails to connect
    """
    proxies: Dict[str, MCPProxyServer] = {}
    tool_injections = tool_injections or {}

    for server_name, config in mcp_configs.items():
        proxy = MCPProxyServer(
            name=server_name,
            transport=config.get("transport", "sse"),
            url=config.get("url"),
            command=config.get("command"),
            args=config.get("args"),
            env=config.get("env"),
            tool_injections=tool_injections.get(server_name, {}),
            hook_manager=hook_manager,
        )

        try:
            await proxy.connect()
        except Exception as e:
            # Clean up already connected proxies on failure. A failing
            # disconnect must neither stop the remaining cleanup nor replace
            # the connection error reported to the caller.
            for connected_name, connected in proxies.items():
                try:
                    await connected.disconnect()
                except Exception as cleanup_error:
                    print(
                        f"[WARNING] Failed to disconnect MCP server '{connected_name}' "
                        f"during cleanup: {cleanup_error}"
                    )
            raise ConnectionError(f"Failed to connect to MCP server '{server_name}': {e}") from e

        proxies[server_name] = proxy

    return proxies