decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374)
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
utils/logging.py ADDED
@@ -0,0 +1,45 @@
1
+ from typing import Any, Dict, List, Optional
2
+
3
+
4
async def log_mcp_tools_to_file(
    log_file: str,
    mcp_servers: List[Any],
) -> None:
    """
    Log all loaded MCP tools and their descriptions to a file for debugging.

    Supports:
    - OpenAI SDK (list_tools method)
    - Google ADK (get_tools method)
    - Claude SDK proxy (sdk_tools property)

    The log file is overwritten on every call and is written as UTF-8 so that
    tool descriptions containing non-ASCII text do not fail on platforms whose
    default encoding is narrower. A failure while listing or writing one
    server's tools is recorded in the log and does not stop the remaining
    servers from being processed.

    Args:
        log_file: Path to the log file
        mcp_servers: List of MCP server instances (should have name and list_tools()/get_tools()/sdk_tools)
    """
    with open(log_file, "w", encoding="utf-8") as f:
        f.write("Tool List Log\n")
        f.write(f"{'='*80}\n\n")
        for server in mcp_servers:
            # Resolve the display name once, with a fallback, so that the error
            # path below can never itself raise on a server lacking `name`.
            server_name = getattr(server, "name", repr(server))
            try:
                # Support OpenAI SDK (list_tools), Google ADK (get_tools), and Claude SDK proxy (sdk_tools)
                if hasattr(server, 'list_tools'):
                    tools = await server.list_tools()
                elif hasattr(server, 'get_tools'):
                    tools = await server.get_tools()
                elif hasattr(server, 'sdk_tools'):
                    # Claude SDK proxy - sdk_tools is a property, not async
                    tools = server.sdk_tools
                else:
                    f.write(f"MCP Server: {server_name} - No list_tools, get_tools, or sdk_tools method\n\n")
                    continue

                f.write(f"MCP Server: {server_name}\n")
                f.write(f"Tools loaded: {len(tools)}\n")
                f.write(f"{'-'*40}\n")
                for tool in tools:
                    f.write(f"\nTool: {tool.name}\n")
                    f.write(f"Description:\n{tool.description}\n")
                f.write(f"\n{'='*80}\n\n")
            except Exception as e:
                # Best-effort debugging aid: note the failure and keep going.
                f.write(f"MCP Server: {server_name} - Error listing tools: {e}\n\n")
utils/mcp_helpers.py ADDED
@@ -0,0 +1,232 @@
1
+ import os
2
+ from pathlib import Path
3
+ import yaml
4
+ from typing import Dict, Optional, Any
5
+ import tempfile
6
+ import shutil
7
+
8
+ from .config import MCP_CONFIG_PATH
9
+ from .template_helpers import (
10
+ allocate_server_port,
11
+ resolve_server_env_vars,
12
+ build_server_name_map,
13
+ wait_for_servers_ready,
14
+ )
15
+ from .mcp_manager import MCPServerManager
16
+
17
+
18
def _start_mcp_servers_impl(
    config_path: str,
    server_list: list[str],
    resource_mgr: Any,
    task_id: str,
    host: str,
    prefix: str,
    log_prefix: str,
    task_env_overrides: Optional[Dict[str, Dict[str, str]]] = None,
    task_dir: Optional[Any] = None,
) -> tuple[Optional[MCPServerManager], Dict[str, str]]:
    """Internal implementation for starting MCP servers.

    Args:
        config_path: Path to MCP config YAML file
        server_list: List of server names to start
        resource_mgr: ResourceManager for port allocation
        task_id: Unique task identifier
        host: Host address for URLs
        prefix: Prefix for port allocation (e.g., "mcp", "injection")
        log_prefix: Prefix for log messages (e.g., "[MCP]", "[INJECTION MCP]")
        task_env_overrides: Optional dict of {server_name_lower: {env_key: env_val}}
                            to merge into server env BEFORE starting servers.
                            Used by start_task_mcp_servers() to pass task-specific
                            env vars like USER_ACCESS_TOKEN from config.yaml.
        task_dir: Optional task directory (used with the ``/`` path operator).
                  When given, ``metadata/<server>_mcp_config.json`` is copied to
                  a temp file for each target server that has one,
                  ``<DT_ENV>VAR</DT_ENV>`` placeholders are replaced with the
                  resolved ports, and the temp path is exported as
                  ``<SERVER>_MCP_CONFIG`` in both the server env and this
                  process's environment.

    Returns:
        Tuple of (MCPServerManager or None, dict of {server_name: url}).
        The manager is None when nothing was requested, resolved, or started;
        in the "nothing started" case the computed URLs are still returned.

    Raises:
        ValueError: If a target server declares a transport other than "http".
    """
    if not server_list:
        print(f"{log_prefix} No servers requested")
        return None, {}

    # Initialize manager and load available servers
    manager = MCPServerManager(config_path)
    available = manager.list_servers()
    name_map = build_server_name_map(available)

    # Load global configuration from the same config file for template resolution
    try:
        # Context manager so the config handle is closed deterministically.
        with open(config_path) as config_file:
            mcp_cfg = yaml.safe_load(config_file.read()) or {}
        global_config = {srv["name"].lower(): srv for srv in (mcp_cfg.get("servers") or [])}
    except Exception as exc:
        print(f"[WARN] Failed to load config for template resolution: {exc}")
        global_config = {}

    # Resolve server names and validate (case-insensitive, de-duplicated, order preserved)
    targets = []
    for raw_name in server_list:
        resolved = name_map.get(raw_name.lower())
        if not resolved:
            print(f"[WARN] Server '{raw_name}' not found in config; skipping.")
            continue
        if resolved not in targets:
            targets.append(resolved)

    if not targets:
        print(f"[WARN] No {prefix} servers to start.")
        return None, {}

    # Cache for resolved port variables. It is never written directly in this
    # function; presumably resolve_server_env_vars() fills it in - TODO confirm.
    resolved_ports: Dict[str, int] = {}
    server_urls: Dict[str, str] = {}

    # Process each server: allocate port, resolve env vars, configure
    for server_name in targets:
        key = server_name.lower()

        # Get server config from manager
        cfg = manager.get_server_config(server_name)
        if not cfg:
            continue

        # Set up environment variables (mutates the manager's config in place)
        env = cfg.setdefault("env", {})

        # Find the port env var for this server's own port
        port_key = "PORT"
        for k in env:
            if "PORT" in k.upper() and prefix.upper() in k.upper():
                port_key = k
                break

        # Allocate port for this server
        port = allocate_server_port(server_name, resource_mgr, task_id, prefix=prefix)
        env[port_key] = str(port)

        # Resolve template variables from global config
        srv_cfg = global_config.get(key) or {}
        srv_env = srv_cfg.get("env") or {}
        resolved_env = resolve_server_env_vars(srv_env, resolved_ports, server_name)

        # Merge resolved env vars, never clobbering the port just allocated
        for env_key, env_val in resolved_env.items():
            if env_key != port_key:
                env[env_key] = env_val

        # Merge task-specific env overrides
        if task_env_overrides and key in task_env_overrides:
            for env_key, env_val in task_env_overrides[key].items():
                if env_val:  # Only set non-empty values
                    env[env_key] = env_val
                    # NOTE(review): the first 20 characters of the value are
                    # still printed; confirm that is acceptable for tokens.
                    display_val = env_val[:20] + "..." if len(env_val) > 20 else env_val
                    print(f"{log_prefix} {server_name}: {env_key}={display_val}")

        # Determine URL - only /mcp is supported
        transport = cfg.get("transport", "http")
        if transport != "http":
            raise ValueError(f"Unsupported transport '{transport}' for server '{server_name}'. Only 'http' is supported.")

        url = f"http://{host}:{port}/mcp"
        server_urls[server_name] = url

    # MCP config file
    if task_dir is not None:
        for server_name in targets:
            mcp_config_file = task_dir / "metadata" / f"{server_name}_mcp_config.json"
            if not mcp_config_file.exists():
                continue
            # mkstemp() returns an already-open descriptor; close it right away
            # (the file is reopened by path below) so one fd is not leaked per server.
            temp_fd, temp_path = tempfile.mkstemp(prefix=f"dt_mcp_{server_name}_config_", suffix=".json")
            os.close(temp_fd)
            temp_mcp_config_file = Path(temp_path)
            shutil.copy(mcp_config_file, temp_mcp_config_file)
            with open(temp_mcp_config_file, "r") as f:
                content = f.read()
            for var_name, port in resolved_ports.items():
                content = content.replace(f"<DT_ENV>{var_name}</DT_ENV>", str(port))
            with open(temp_mcp_config_file, "w") as f:
                f.write(content)
            config_env_key = f"{server_name.upper()}_MCP_CONFIG"
            manager.get_server_config(server_name)["env"][config_env_key] = str(temp_mcp_config_file)
            # Also expose to the current process so judge.py (loaded in-process) can read it
            os.environ[config_env_key] = str(temp_mcp_config_file)
            print(f"[MCP] {server_name}: {config_env_key}={temp_mcp_config_file}")

    # Start all servers
    print(f"{log_prefix} Starting {len(targets)} server(s)...")
    started = 0
    for name in targets:
        if manager.start_server(name, foreground=False):
            started += 1
            print(f"  Started: {name} -> {server_urls.get(name, 'unknown')}")

    if started == 0:
        print(f"[WARN] No {prefix} servers were started.")
        return None, server_urls

    print(f"{log_prefix} Started {started}/{len(targets)} server(s)")

    # Wait for servers to be ready
    if server_urls:
        print(f"{log_prefix} Waiting for servers to be ready...")
        wait_for_servers_ready(server_urls)

    return manager, server_urls
171
+
172
+
173
def start_task_mcp_servers(
    agent_cfg: Any,  # AgentConfig
    task_id: str,
    task_dir: Any,  # Path
    resource_mgr: Any,  # ResourceManager
    host: str = "127.0.0.1",
) -> Optional[MCPServerManager]:
    """Set up and start MCP servers for a task.

    On success, each entry of ``agent_cfg.mcp_servers`` whose name matches a
    started server gets its ``url`` attribute updated in place.

    Args:
        agent_cfg: Agent configuration with MCP server list
        task_id: Unique task identifier for resource tracking
        task_dir: Task directory, forwarded to the server start-up helper
        resource_mgr: Resource manager for port allocation
        host: Host address for MCP server URLs (default: 127.0.0.1)

    Returns:
        MCPServerManager instance or None if no servers started
    """
    configured_servers = agent_cfg.mcp_servers or []

    # Extract server list from agent config
    requested_servers = [s.name for s in configured_servers if s.enabled]

    if not requested_servers:
        print("[MCP] No MCP servers requested")
        return None

    if not MCP_CONFIG_PATH.exists():
        print(f"[WARN] MCP config not found: {MCP_CONFIG_PATH}")
        return None

    # Build task-specific env overrides from agent config (keyed by lower-cased name)
    task_env_overrides: Dict[str, Dict[str, str]] = {}
    for server in configured_servers:
        if server.env:
            task_env_overrides[server.name.lower()] = {
                k: str(v) for k, v in server.env.items() if v
            }

    # Start MCP servers with task-specific env overrides
    manager, server_urls = _start_mcp_servers_impl(
        config_path=str(MCP_CONFIG_PATH),
        server_list=requested_servers,
        resource_mgr=resource_mgr,
        task_id=task_id,
        host=host,
        prefix="mcp",
        log_prefix="[MCP]",
        task_env_overrides=task_env_overrides,
        task_dir=task_dir,
    )

    if not manager:
        return None

    # Update agent config with URLs. Requested names are resolved against the
    # registry case-insensitively, so the returned keys may differ in case from
    # the names in the agent config; match them the same way here, otherwise a
    # started server would silently be left without a URL.
    urls_by_key = {name.lower(): url for name, url in server_urls.items()}
    for server in configured_servers:
        url = urls_by_key.get(server.name.lower())
        if url is not None:
            server.url = url
            print(f"[MCP] Server '{server.name}' URL -> {server.url}")

    return manager
utils/mcp_manager.py ADDED
@@ -0,0 +1,235 @@
1
+ import os
2
+ import subprocess
3
+ import sys
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional
7
+ import yaml
8
+
9
+
10
class MCPServerManager:
    """Manager for MCP server lifecycle.

    Loads a YAML registry describing servers and starts/stops them as
    subprocesses. Processes started in background mode are tracked in
    ``self.processes``, keyed by server name.
    """

    def __init__(self, registry_path: str = "mcp_registry.yaml"):
        """Load the registry at ``registry_path``.

        Raises:
            FileNotFoundError: If the registry file does not exist.
            ValueError: If the registry is empty or has no 'servers' section.
        """
        self.registry_path = Path(registry_path)
        # Relative paths in the registry are resolved against this directory.
        self.base_dir = self.registry_path.parent
        self.config = self._load_config()
        self.processes: Dict[str, subprocess.Popen] = {}

    def _load_config(self) -> Dict[str, Any]:
        """Load and validate the registry configuration"""
        if not self.registry_path.exists():
            raise FileNotFoundError(f"Registry not found: {self.registry_path}")

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(self.registry_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        if not config or "servers" not in config:
            raise ValueError("Invalid registry: missing 'servers' section")

        return config

    def _global_settings(self) -> Dict[str, Any]:
        """Return the registry's 'global' section, or {} if absent or null."""
        return self.config.get("global") or {}

    def list_servers(self, enabled_only: bool = False) -> List[Dict[str, Any]]:
        """List all servers in the registry (optionally only enabled ones)"""
        # `or []` also covers a registry that declares `servers:` with no value.
        servers = self.config.get("servers") or []
        if enabled_only:
            servers = [s for s in servers if s.get("enabled", False)]
        return servers

    def get_server_config(self, name: str) -> Optional[Dict[str, Any]]:
        """Get configuration for a specific server, or None if not listed"""
        for server in self.config.get("servers") or []:
            if server.get("name") == name:
                return server
        return None

    def _resolve_path(self, relative_path: str) -> Path:
        """Resolve server path relative to registry file"""
        base = self._global_settings().get("base_dir", ".")
        return (self.base_dir / base / relative_path).resolve()

    def _setup_environment(self, server: Dict[str, Any]) -> Dict[str, str]:
        """Setup environment variables for a server.

        Inherits parent process env vars and overrides with server-specific
        values. Entries whose value is None are skipped; all other values are
        converted with ``str``.
        """
        env = os.environ.copy()
        # A server entry may declare `env:` with no value, which loads as None.
        overrides = server.get("env") or {}
        env.update({k: str(v) for k, v in overrides.items() if v is not None})
        return env

    @staticmethod
    def _expand_command(command: List[Any], env: Dict[str, str]) -> List[str]:
        """Expand ``$NAME`` / ``${NAME}`` references in each command part.

        Each part is scanned once and whole variable names are matched, so
        ``$PORT_EXTRA`` is never treated as ``$PORT`` followed by ``_EXTRA``
        and substituted values are not expanded a second time. References to
        names missing from ``env`` are left untouched. Non-string parts
        (e.g. a bare YAML integer) are converted with ``str``.
        """
        import re  # local import: this is the only place the module is needed

        reference = re.compile(r"\$\{([^}]+)\}|\$([A-Za-z_][A-Za-z0-9_]*)")

        def substitute(match):
            key = match.group(1) or match.group(2)
            if key in env:
                return str(env[key])
            return match.group(0)

        return [reference.sub(substitute, str(part)) for part in command]

    def _create_log_dir(self) -> Path:
        """Create log directory if it doesn't exist"""
        log_dir = self.base_dir / self._global_settings().get("log_dir", "logs")
        log_dir.mkdir(parents=True, exist_ok=True)
        return log_dir

    def start_server(
        self,
        name: str,
        dry_run: bool = False,
        foreground: bool = True
    ) -> bool:
        """Start a single MCP server (runs in foreground by default).

        Args:
            name: Server name as listed in the registry.
            dry_run: Only print the command, working directory and the
                server's own env entries; nothing is executed.
            foreground: When True, block until the server exits. When False,
                spawn it in a new session with stdout/stderr redirected to
                per-server log files and record it in ``self.processes``.

        Returns:
            True on a successful dry run, a foreground run that exits with
            code 0, or a background process still alive after 0.5 seconds.
            False if the server is unknown, disabled, its script is missing,
            or it could not be launched / exited with an error.
        """
        server = self.get_server_config(name)
        if not server:
            print(f"[ERROR] Server '{name}' not found in registry", file=sys.stderr)
            return False

        if not server.get("enabled", False):
            print(f"[WARN] Server '{name}' is disabled in registry", file=sys.stderr)
            return False

        # Resolve server path ("path" is required even with a custom command,
        # because its parent directory is used as the working directory)
        main_path = self._resolve_path(server["path"])
        if not main_path.exists():
            print(f"[ERROR] Server script not found: {main_path}", file=sys.stderr)
            return False

        # Setup environment
        env = self._setup_environment(server)

        # Build command
        if "command" in server:
            # Use custom command from registry (supports complex setups like
            # supergateway), expanding environment variables such as $PORT
            cmd = self._expand_command(server["command"], env)
        else:
            # Default: run with python executable
            python_exe = self._global_settings().get("python_executable", "python3")
            cmd = [python_exe, str(main_path)]

        # Add any additional options as `--key-with-dashes value` pairs
        options = server.get("options") or {}
        for key, value in options.items():
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

        if dry_run:
            print(f"\n[DRY RUN] Would start '{name}':")
            print(f" Command: {' '.join(cmd)}")
            print(f" Working dir: {main_path.parent}")
            print(f" Environment variables:")
            for key, value in (server.get("env") or {}).items():
                # Long values are truncated to keep the listing readable
                display_value = value if len(str(value)) < 50 else f"{str(value)[:47]}..."
                print(f" {key}={display_value}")
            return True

        # Create log files (only written to in background mode)
        log_dir = self._create_log_dir()
        stdout_log = log_dir / f"{name}_stdout.log"
        stderr_log = log_dir / f"{name}_stderr.log"

        print(f"[MCP] Starting '{name}'...")
        print(f" Command: {' '.join(cmd)}")
        print(f" Logs: {stdout_log} / {stderr_log}")

        if foreground:
            # Run in foreground (blocking)
            try:
                subprocess.run(
                    cmd,
                    cwd=main_path.parent,
                    env=env,
                    check=True
                )
                return True
            except subprocess.CalledProcessError as e:
                print(f"[ERROR] Server '{name}' exited with code {e.returncode}", file=sys.stderr)
                return False
            except KeyboardInterrupt:
                print(f"\n[WARN] Server '{name}' interrupted", file=sys.stderr)
                return False
            except OSError as e:
                # e.g. executable not found; report it like the background
                # path does instead of letting the exception escape
                print(f"[ERROR] Failed to start '{name}': {e}", file=sys.stderr)
                return False
        else:
            # Run in background
            try:
                with open(stdout_log, "w") as stdout, open(stderr_log, "w") as stderr:
                    process = subprocess.Popen(
                        cmd,
                        cwd=main_path.parent,
                        env=env,
                        stdout=stdout,
                        stderr=stderr,
                        start_new_session=True
                    )

                    self.processes[name] = process

                    # Give it a moment to start
                    time.sleep(0.5)

                    # Check if it's still running
                    if process.poll() is None:
                        print(f"[MCP] Server '{name}' started (PID: {process.pid})")
                        return True
                    else:
                        print(f"[ERROR] Server '{name}' failed to start", file=sys.stderr)
                        return False

            except Exception as e:
                print(f"[ERROR] Failed to start '{name}': {e}", file=sys.stderr)
                return False

    def start_all(self, dry_run: bool = False, foreground: bool = True) -> int:
        """Start all enabled servers (runs in foreground by default).

        Returns:
            The number of servers for which ``start_server`` returned True.
        """
        enabled_servers = self.list_servers(enabled_only=True)

        if not enabled_servers:
            print("[WARN] No enabled servers found in registry", file=sys.stderr)
            return 0

        print(f"\n[MCP] Starting {len(enabled_servers)} server(s)...\n")

        success_count = 0
        for server in enabled_servers:
            name = server["name"]
            if self.start_server(name, dry_run=dry_run, foreground=foreground):
                success_count += 1
            print()

        if not dry_run:
            if foreground:
                print(f"[MCP] Started {success_count}/{len(enabled_servers)} server(s) in foreground")
                print(" Press Ctrl+C to stop")
            else:
                print(f"[MCP] Started {success_count}/{len(enabled_servers)} server(s) in background")
                if self.processes:
                    print("\n[MCP] Servers are running in the background.")
                    print(" Check logs in the 'logs/' directory for output.")
                    print(" Use Ctrl+C or kill the processes to stop them.")

        return success_count

    def stop_all(self):
        """Stop all running servers.

        Sends terminate to every tracked process, gives them up to one second
        in total to exit, then kills and reaps whatever is still running.
        The process table is cleared afterwards.
        """
        if not self.processes:
            print("[WARN] No servers running", file=sys.stderr)
            return

        print(f"\n[MCP] Stopping {len(self.processes)} server(s)...\n")

        for name, process in self.processes.items():
            try:
                process.terminate()
                print(f"[MCP] Stopped '{name}' (PID: {process.pid})")
            except Exception as e:
                print(f"[ERROR] Failed to stop '{name}': {e}", file=sys.stderr)

        # Wait for processes to terminate: one shared grace period rather
        # than a fixed sleep, so quick exits do not cost a full second.
        deadline = time.monotonic() + 1.0

        for name, process in self.processes.items():
            try:
                process.wait(timeout=max(0.0, deadline - time.monotonic()))
                continue
            except subprocess.TimeoutExpired:
                pass

            # Force kill any remaining, then reap so no zombie is left behind
            try:
                process.kill()
                process.wait(timeout=5)
                print(f"[WARN] Force killed '{name}'", file=sys.stderr)
            except Exception as e:
                print(f"[ERROR] Failed to kill '{name}': {e}", file=sys.stderr)

        self.processes.clear()
utils/memory_guard.py ADDED
@@ -0,0 +1,18 @@
1
+ """Memory guard to prevent OOM from too many Docker containers."""
2
+
3
+ import psutil
4
+
5
+
6
def check_memory_before_launch(required_gb: float = 4, reserve_gb: float = 64) -> None:
    """Refuse to launch a new container when the host is short on memory.

    Compares the currently available system memory (as reported by
    ``psutil.virtual_memory().available``) against the sum of the two
    arguments.

    Args:
        required_gb: Estimated memory needed by the new container.
        reserve_gb: Memory to keep free for the OS and other processes.

    Raises:
        RuntimeError: If available memory is below
            ``required_gb + reserve_gb``.
    """
    bytes_per_gb = 1024 ** 3
    available_gb = psutil.virtual_memory().available / bytes_per_gb

    # Total headroom that must be present before another container may start.
    minimum_free_gb = required_gb + reserve_gb

    if available_gb < minimum_free_gb:
        message = (
            f"Available memory {available_gb:.1f}GB < needed {required_gb}GB + reserve {reserve_gb}GB. "
            f"Refusing to start new container."
        )
        raise RuntimeError(message)