decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,338 @@
1
+ """
2
+ Telegram MCP Server - thin client that proxies to the Telegram ENV REST API.
3
+ Independent environment with its own PostgreSQL + FastAPI backend.
4
+ """
5
+ import os
6
+ import sys
7
+ import asyncio
8
+ from typing import Any, Dict, Optional
9
+ from urllib.parse import quote
10
+
11
+ from fastmcp import FastMCP
12
+ import httpx
13
+
14
# Base URL of the Telegram ENV REST API; override with TELEGRAM_API_URL.
API_URL = os.environ.get("TELEGRAM_API_URL", "http://127.0.0.1:8038")

# Bearer token sent with every ENV API request; override with
# TELEGRAM_USER_ACCESS_TOKEN. The fallback is a hard-coded development token.
USER_ACCESS_TOKEN = os.environ.get(
    "TELEGRAM_USER_ACCESS_TOKEN", "telegram_token_tom_dev_123"
)
16
+
17
+
18
def _headers() -> Dict[str, str]:
    """Build the HTTP headers attached to every ENV API request.

    Returns:
        A dict with a single ``Authorization`` entry carrying
        USER_ACCESS_TOKEN as a bearer token.
    """
    bearer = f"Bearer {USER_ACCESS_TOKEN}"
    return {"Authorization": bearer}
20
+
21
+
22
async def _get(path: str, params: Optional[Dict[str, Any]] = None) -> Any:
    """GET request to the ENV API with retry.

    Failures with a status code of 500 or above, and any other exception
    raised while performing the request, are retried for up to 10 attempts
    with a delay of ``0.5 * attempt`` seconds capped at 3 seconds. An HTTP
    status error with a code below 500 is not retried.

    Args:
        path: URL path appended to API_URL.
        params: Optional query parameters; entries whose value is None are
            dropped before the request is sent.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: On a non-retryable HTTP status error, or once every
            attempt has failed.
    """
    max_attempts = 10
    last_exc: Optional[Exception] = None
    # strip None values from params
    if params:
        params = {k: v for k, v in params.items() if v is not None}
    for attempt in range(1, max_attempts + 1):
        try:
            async with httpx.AsyncClient(timeout=20) as client:
                resp = await client.get(f"{API_URL}{path}", params=params, headers=_headers())
                resp.raise_for_status()
                return resp.json()
        except httpx.HTTPStatusError as e:
            if e.response.status_code < 500:
                raise RuntimeError(f"GET {path} failed: {e}") from e
            last_exc = e
        except Exception as e:
            last_exc = e
        # Back off only when another attempt follows; sleeping after the
        # final failure would merely delay the error below.
        if attempt < max_attempts:
            await asyncio.sleep(min(0.5 * attempt, 3.0))
    raise RuntimeError(f"GET {path} failed after retries: {last_exc}") from last_exc
43
+
44
+
45
async def _post(path: str, json: Optional[Dict[str, Any]] = None) -> Any:
    """POST request to the ENV API with retry.

    Failures with a status code of 500 or above, and any other exception
    raised while performing the request, are retried for up to 10 attempts
    with a delay of ``0.5 * attempt`` seconds capped at 3 seconds. An HTTP
    status error with a code below 500 is not retried. Once the server has
    answered successfully the request is never re-sent, even if the body
    cannot be decoded.

    Args:
        path: URL path appended to API_URL.
        json: JSON payload; an empty object is sent when omitted or empty.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: On a non-retryable HTTP status error, on a successful
            response whose body is not valid JSON, or once every attempt has
            failed.
    """
    max_attempts = 10
    last_exc: Optional[Exception] = None
    for attempt in range(1, max_attempts + 1):
        try:
            async with httpx.AsyncClient(timeout=20) as client:
                resp = await client.post(f"{API_URL}{path}", json=json or {}, headers=_headers())
                resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code < 500:
                raise RuntimeError(f"POST {path} failed: {e}") from e
            last_exc = e
        except Exception as e:
            last_exc = e
        else:
            # The POST has already been accepted. Decoding happens outside the
            # retried section so a non-JSON body cannot trigger a duplicate
            # (non-idempotent) request.
            try:
                return resp.json()
            except ValueError as e:
                raise RuntimeError(f"POST {path} returned a non-JSON response: {e}") from e
        # Back off only when another attempt follows.
        if attempt < max_attempts:
            await asyncio.sleep(min(0.5 * attempt, 3.0))
    raise RuntimeError(f"POST {path} failed after retries: {last_exc}") from last_exc
63
+
64
+
65
async def _delete(path: str) -> Any:
    """DELETE request to the ENV API with retry.

    Failures with a status code of 500 or above, and any other exception
    raised while performing the request, are retried for up to 10 attempts
    with a delay of ``0.5 * attempt`` seconds capped at 3 seconds. An HTTP
    status error with a code below 500 is not retried. Once the server has
    answered successfully the request is never re-sent, even if the body
    cannot be decoded.

    Args:
        path: URL path appended to API_URL.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: On a non-retryable HTTP status error, on a successful
            response whose body is not valid JSON, or once every attempt has
            failed.
    """
    max_attempts = 10
    last_exc: Optional[Exception] = None
    for attempt in range(1, max_attempts + 1):
        try:
            async with httpx.AsyncClient(timeout=20) as client:
                resp = await client.delete(f"{API_URL}{path}", headers=_headers())
                resp.raise_for_status()
        except httpx.HTTPStatusError as e:
            if e.response.status_code < 500:
                raise RuntimeError(f"DELETE {path} failed: {e}") from e
            last_exc = e
        except Exception as e:
            last_exc = e
        else:
            # The DELETE has already succeeded. Decoding happens outside the
            # retried section so a non-JSON body cannot re-issue the request
            # against a resource that is now gone.
            try:
                return resp.json()
            except ValueError as e:
                raise RuntimeError(f"DELETE {path} returned a non-JSON response: {e}") from e
        # Back off only when another attempt follows.
        if attempt < max_attempts:
            await asyncio.sleep(min(0.5 * attempt, 3.0))
    raise RuntimeError(f"DELETE {path} failed after retries: {last_exc}") from last_exc
83
+
84
+
85
# FastMCP server instance; the @mcp.tool-decorated functions in this module
# register themselves on it, and main() runs it.
mcp = FastMCP("Telegram MCP Server")
86
+
87
+
88
+ # ═════════════════════════════════════════════════════════════════════════════
89
+ # Contacts
90
+ # ═════════════════════════════════════════════════════════════════════════════
91
+
92
@mcp.tool(name="telegram_list_contacts")
async def list_contacts(search: Optional[str] = None, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """List contacts from your Telegram address book.

    Args:
        search: Search by name, email, phone, or company
        limit: Max results (default 100)
        offset: Pagination offset
    """
    # _get drops None-valued entries, so an unset `search` is left out of
    # the query string.
    query_params = {"search": search, "limit": limit, "offset": offset}
    return await _get("/contacts", query_params)
102
+
103
+
104
@mcp.tool(name="telegram_get_contact")
async def get_contact(contact_id: str) -> Dict[str, Any]:
    """Get full details for a Telegram contact.

    Args:
        contact_id: The contact identifier (REQUIRED)
    """
    # Percent-encode the identifier so characters such as "/" or "?" cannot
    # alter the request path; get_chat and delete_chat do the same for phone
    # numbers.
    encoded_id = quote(str(contact_id), safe="")
    return await _get(f"/contacts/{encoded_id}")
112
+
113
+
114
@mcp.tool(name="telegram_create_contact")
async def create_contact(first_name: str, last_name: Optional[str] = None,
                         email: Optional[str] = None, phone: Optional[str] = None,
                         company: Optional[str] = None, job_title: Optional[str] = None,
                         address: Optional[str] = None, notes: Optional[str] = None,
                         favorite: bool = False) -> Dict[str, Any]:
    """Create a new contact.

    Args:
        first_name: First name (REQUIRED)
        last_name: Last name
        email: Email address
        phone: Phone number
        company: Company name
        job_title: Job title
        address: Mailing address
        notes: Free-form notes
        favorite: Mark as favorite
    """
    # Every field is forwarded as given; unset optional fields stay None in
    # the payload.
    payload = {
        "first_name": first_name,
        "last_name": last_name,
        "email": email,
        "phone": phone,
        "company": company,
        "job_title": job_title,
        "address": address,
        "notes": notes,
        "favorite": favorite,
    }
    return await _post("/contacts", payload)
139
+
140
+
141
@mcp.tool(name="telegram_search_contacts")
async def search_contacts(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search Telegram contacts by name, email, phone, or company.

    Args:
        query: Search term (REQUIRED)
        limit: Max results (default 20)
    """
    # Both values travel as query-string parameters of the search endpoint.
    query_params = {"query": query, "limit": limit}
    return await _get("/contacts/search", query_params)
150
+
151
+
152
+ # ═════════════════════════════════════════════════════════════════════════════
153
+ # Messaging
154
+ # ═════════════════════════════════════════════════════════════════════════════
155
+
156
@mcp.tool()
async def send_message(phone_number: str, body: str, contact_id: Optional[str] = None,
                       media_url: Optional[str] = None) -> Dict[str, Any]:
    """Send a message to a phone number.

    Args:
        phone_number: Recipient phone number (REQUIRED)
        body: Message text (REQUIRED)
        contact_id: Associated contact ID
        media_url: URL of photo/video/file attachment
    """
    # The message type is derived from whether a (truthy) media URL was given.
    if media_url:
        message_type = "media"
    else:
        message_type = "text"
    payload = {
        "phone_number": phone_number,
        "body": body,
        "contact_id": contact_id,
        "media_url": media_url,
        "message_type": message_type,
    }
    return await _post("/messages/send", payload)
172
+
173
+
174
@mcp.tool()
async def list_chats(limit: int = 50, offset: int = 0,
                     unread_only: bool = False) -> Dict[str, Any]:
    """List conversation threads (chats).

    Args:
        limit: Max results
        offset: Pagination offset
        unread_only: Only return chats with unread messages
    """
    # All three values are sent as query-string parameters.
    query_params = {"limit": limit, "offset": offset, "unread_only": unread_only}
    return await _get("/messages/chats", query_params)
185
+
186
+
187
@mcp.tool()
async def get_chat(phone_number: str, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """Get the full conversation with a phone number. Marks incoming as read.

    Args:
        phone_number: Phone number for the conversation (REQUIRED)
        limit: Max messages
        offset: Pagination offset
    """
    # safe='' percent-encodes every reserved character, so a leading "+" or
    # any "/" in the number stays inside a single path segment.
    encoded_number = quote(phone_number, safe='')
    paging = {"limit": limit, "offset": offset}
    return await _get(f"/messages/chat/{encoded_number}", paging)
197
+
198
+
199
@mcp.tool()
async def get_message(message_id: str) -> Dict[str, Any]:
    """Get a single message by ID.

    Args:
        message_id: Message identifier (REQUIRED)
    """
    # Percent-encode the identifier so characters such as "/" or "?" cannot
    # alter the request path; get_chat and delete_chat do the same for phone
    # numbers.
    encoded_id = quote(str(message_id), safe="")
    return await _get(f"/messages/{encoded_id}")
207
+
208
+
209
@mcp.tool()
async def reply_to_message(message_id: str, body: str,
                           media_url: Optional[str] = None) -> Dict[str, Any]:
    """Reply to a message.

    Args:
        message_id: Original message ID to reply to (REQUIRED)
        body: Reply text (REQUIRED)
        media_url: Optional media attachment
    """
    # Percent-encode the identifier so characters such as "/" or "?" cannot
    # redirect the POST to a different endpoint; get_chat and delete_chat do
    # the same for phone numbers.
    encoded_id = quote(str(message_id), safe="")
    return await _post(f"/messages/{encoded_id}/reply", {"body": body, "media_url": media_url})
220
+
221
+
222
@mcp.tool()
async def forward_message(message_id: str, to_phone_number: str,
                          additional_text: Optional[str] = None) -> Dict[str, Any]:
    """Forward a message to another phone number.

    Args:
        message_id: Message to forward (REQUIRED)
        to_phone_number: Recipient phone number (REQUIRED)
        additional_text: Optional text to add before forwarded content
    """
    # Percent-encode the identifier so characters such as "/" or "?" cannot
    # redirect the POST to a different endpoint; get_chat and delete_chat do
    # the same for phone numbers.
    encoded_id = quote(str(message_id), safe="")
    return await _post(f"/messages/{encoded_id}/forward", {
        "to_phone_number": to_phone_number, "additional_text": additional_text,
    })
235
+
236
+
237
@mcp.tool()
async def delete_message(message_id: str) -> Dict[str, Any]:
    """Delete a message by ID.

    Args:
        message_id: Message identifier (REQUIRED)
    """
    # Percent-encode the identifier so characters such as "/" or "?" cannot
    # point the DELETE at a different resource; get_chat and delete_chat do
    # the same for phone numbers.
    encoded_id = quote(str(message_id), safe="")
    return await _delete(f"/messages/{encoded_id}")
245
+
246
+
247
@mcp.tool()
async def delete_chat(phone_number: str) -> Dict[str, Any]:
    """Delete an entire conversation with a phone number.

    Args:
        phone_number: Phone number of the conversation to delete (REQUIRED)
    """
    # safe='' percent-encodes every reserved character, keeping the number
    # inside a single path segment.
    encoded_number = quote(phone_number, safe='')
    chat_path = f"/messages/chat/{encoded_number}"
    return await _delete(chat_path)
255
+
256
+
257
@mcp.tool()
async def search_messages(query: str, phone_number: Optional[str] = None,
                          direction: Optional[str] = None, limit: int = 50) -> Dict[str, Any]:
    """Search messages by text content.

    Args:
        query: Search term (REQUIRED)
        phone_number: Limit search to a specific conversation
        direction: Filter by 'incoming' or 'outgoing'
        limit: Max results
    """
    # _get drops None-valued entries, so unset filters are left out of the
    # query string.
    query_params = {
        "query": query,
        "phone_number": phone_number,
        "direction": direction,
        "limit": limit,
    }
    return await _get("/messages/search", query_params)
271
+
272
+
273
@mcp.tool()
async def get_unread_messages(limit: int = 50) -> Dict[str, Any]:
    """Get all unread incoming messages.

    Args:
        limit: Max results
    """
    # The limit is the only query-string parameter of this endpoint call.
    query_params = {"limit": limit}
    return await _get("/messages/unread", query_params)
281
+
282
+
283
@mcp.tool()
async def mark_message_read(message_id: str) -> Dict[str, Any]:
    """Mark a message as read.

    Args:
        message_id: Message identifier (REQUIRED)
    """
    # Percent-encode the identifier so characters such as "/" or "?" cannot
    # redirect the POST to a different endpoint; get_chat and delete_chat do
    # the same for phone numbers.
    encoded_id = quote(str(message_id), safe="")
    return await _post(f"/messages/{encoded_id}/read")
291
+
292
+
293
+ # ═════════════════════════════════════════════════════════════════════════════
294
+ # Calls
295
+ # ═════════════════════════════════════════════════════════════════════════════
296
+
297
@mcp.tool()
async def make_call(phone_number: str, contact_id: Optional[str] = None,
                    duration: int = 0) -> Dict[str, Any]:
    """Make a phone call to a phone number.

    Args:
        phone_number: Phone number to call (REQUIRED)
        contact_id: Associated contact ID
        duration: Call duration in seconds (default 0)
    """
    # The call is recorded by POSTing its details to the /calls endpoint.
    payload = {
        "phone_number": phone_number,
        "contact_id": contact_id,
        "duration": duration,
    }
    return await _post("/calls", payload)
310
+
311
+
312
@mcp.tool()
async def list_calls(phone_number: Optional[str] = None, limit: int = 50,
                     offset: int = 0) -> Dict[str, Any]:
    """List call history.

    Args:
        phone_number: Filter by phone number
        limit: Max results
        offset: Pagination offset
    """
    # _get drops None-valued entries, so an unset phone filter is left out of
    # the query string.
    query_params = {"phone_number": phone_number, "limit": limit, "offset": offset}
    return await _get("/calls", query_params)
323
+
324
+
325
+ # ═════════════════════════════════════════════════════════════════════════════
326
+ # Entrypoint
327
+ # ═════════════════════════════════════════════════════════════════════════════
328
+
329
def main() -> None:
    """Run the Telegram MCP server over the HTTP transport.

    The bind host comes from TELEGRAM_MCP_HOST (default "localhost"). The port
    comes from PORT when it is set to a non-blank value, otherwise from
    TELEGRAM_MCP_PORT (default "8864").
    """
    print("Starting Telegram MCP Server (proxying to ENV API)...", file=sys.stderr)
    host = os.getenv("TELEGRAM_MCP_HOST", "localhost")
    # A blank or whitespace-only PORT counts as unset.
    port_override = os.getenv("PORT", "").strip()
    if port_override:
        port_str = port_override
    else:
        port_str = os.getenv("TELEGRAM_MCP_PORT", "8864")
    mcp.run(transport="http", host=host, port=int(port_str))


if __name__ == "__main__":
    main()
@@ -0,0 +1,163 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Terminal MCP server for executing commands in Docker environment.
4
+ Provides a single tool: execute_command(command: str, timeout: int = 180) -> str
5
+ """
6
+ import os
7
+ import asyncio
8
+ import json
9
+ import subprocess
10
+ import sys
11
+ from typing import Any, Dict, Optional
12
+
13
+ from mcp.server.fastmcp import FastMCP
14
+ from dt_arena.utils.terminal.helpers import get_terminal_container_name
15
+
16
# Module configuration, resolved once at import time.
TERMINAL_CONTAINER_NAME = get_terminal_container_name()
DOCKER_HOST = os.getenv("DOCKER_HOST", "unix:///var/run/docker.sock")
MAX_OUTPUT_CHARS = 200_000  # cap applied separately to captured stdout and stderr

# Debug: report the resolved configuration on stderr at startup. The four
# banner lines go out in a single print call and are flushed immediately.
print(
    "[Terminal MCP Server] ===== STARTING =====",
    f"[Terminal MCP Server] TERMINAL_CONTAINER_NAME: {TERMINAL_CONTAINER_NAME}",
    f"[Terminal MCP Server] DOCKER_HOST: {DOCKER_HOST}",
    "[Terminal MCP Server] ==================",
    sep="\n",
    file=sys.stderr,
    flush=True,
)
26
+
27
# Create a FastMCP server (host/port from env, set by evaluation system before launch).
# A PORT variable that is set but blank falls back to the default port rather
# than making int("") raise while the module is being imported.
mcp = FastMCP(
    "Terminal Client",
    host=os.getenv("HOST", "0.0.0.0"),
    port=int(os.getenv("PORT", "").strip() or "8845"),
)
33
+
34
+
35
def _truncate_output(text: str) -> tuple:
    """Cap *text* at MAX_OUTPUT_CHARS characters; return ``(text, was_truncated)``."""
    if len(text) > MAX_OUTPUT_CHARS:
        return text[:MAX_OUTPUT_CHARS] + "\n... [truncated]", True
    return text, False


def _kill_quietly(process) -> None:
    """Kill *process* if it is still running, tolerating a concurrent exit."""
    if process.returncode is not None:
        return
    try:
        process.kill()
    except ProcessLookupError:
        # The process finished between the returncode check and the kill.
        pass


async def _execute_command_in_container(command: str, timeout: int = 180) -> Dict[str, Any]:
    """Execute a command in the terminal Docker container.

    The command is run through ``docker exec -u root <container> bash -c``
    and both output streams are captured and decoded as UTF-8 (undecodable
    bytes are replaced).

    Args:
        command: The command to execute
        timeout: Timeout in seconds (default: 180)

    Returns:
        Dictionary with ``stdout``, ``stderr``, ``return_code``, ``success``
        and ``command``. ``truncated`` is added (as True) when either stream
        exceeded MAX_OUTPUT_CHARS and was cut. On timeout or on a failure to
        run the command, ``return_code`` is -1, ``success`` is False and an
        ``error`` entry describes the cause.
    """
    try:
        # Use docker exec to run command in the container
        # Run as root user for full system access
        docker_cmd = [
            "docker", "exec",
            "-u", "root",
            TERMINAL_CONTAINER_NAME,
            "bash", "-c", command
        ]

        print(f"[Terminal MCP Server] Executing: {' '.join(docker_cmd)}", file=sys.stderr)

        process = await asyncio.create_subprocess_exec(
            *docker_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        try:
            stdout, stderr = await asyncio.wait_for(
                process.communicate(),
                timeout=timeout
            )
        except asyncio.TimeoutError:
            # Kill the process if it times out. A process that exits at the
            # same moment must not turn the timeout report into a generic
            # "Failed to execute command" error, hence the tolerant kill.
            # NOTE(review): this stops the local `docker exec` client; whether
            # the command inside the container also stops is not established
            # here - confirm.
            _kill_quietly(process)
            await process.wait()
            return {
                "stdout": "",
                "stderr": f"Command timed out after {timeout} seconds",
                "return_code": -1,
                "success": False,
                "command": command,
                "error": "timeout"
            }
        except BaseException:
            # Any other interruption (for example task cancellation): do not
            # leave the child process running, then let the error propagate.
            _kill_quietly(process)
            raise

        stdout_str, stdout_cut = _truncate_output(stdout.decode('utf-8', errors='replace'))
        stderr_str, stderr_cut = _truncate_output(stderr.decode('utf-8', errors='replace'))
        return_code = process.returncode

        result = {
            "stdout": stdout_str,
            "stderr": stderr_str,
            "return_code": return_code,
            "success": return_code == 0,
            "command": command,
        }
        if stdout_cut or stderr_cut:
            result["truncated"] = True
        return result

    except Exception as e:
        # Best-effort contract: failures are reported in the result dictionary
        # rather than raised to the caller.
        return {
            "stdout": "",
            "stderr": f"Failed to execute command: {str(e)}",
            "return_code": -1,
            "success": False,
            "command": command,
            "error": str(e)
        }
115
+
116
+
117
@mcp.tool()
async def execute_command(command: str, timeout: int = 180) -> str:
    """Execute a command in the terminal Docker environment.

    Args:
        command: The command to execute (e.g., "ls", "pwd", "cat file.txt")
        timeout: Timeout in seconds (default: 180, max: 300)

    Returns:
        JSON string containing the command output, stderr, return code, and success status

    Example:
        execute_command("ls -la") -> Returns directory listing
        execute_command("pwd") -> Returns current working directory
        execute_command("echo 'Hello World'") -> Returns "Hello World"
    """
    # Guard clause: a missing or whitespace-only command never reaches the
    # container.
    trimmed = command.strip() if command else ""
    if not trimmed:
        rejection = {
            "error": "Command cannot be empty",
            "success": False
        }
        return json.dumps(rejection)

    # Limit timeout to prevent abuse: clamp into the 1..300 second range.
    bounded_timeout = min(max(timeout, 1), 300)

    try:
        outcome = await _execute_command_in_container(trimmed, bounded_timeout)
        return json.dumps(outcome, ensure_ascii=False, indent=2)
    except Exception as e:
        # The failure report echoes the command exactly as the caller sent it.
        failure = {
            "error": f"Failed to execute command: {str(e)}",
            "success": False,
            "command": command
        }
        return json.dumps(failure)
151
+
152
+
153
+
154
+
155
def main():
    """Announce the configured bind address on stderr, then serve over streamable HTTP."""
    banner = f"Starting Terminal MCP server on {mcp.settings.host}:{mcp.settings.port}..."
    # flush=True pushes the banner out before the server starts running.
    print(banner, file=sys.stderr, flush=True)

    mcp.run(transport="streamable-http")


if __name__ == "__main__":
    main()
@@ -0,0 +1,16 @@
1
+ import os
2
+ from fastmcp import Client
3
+ import asyncio
4
+
5
# Address of the travel MCP server; both parts can be overridden through the
# environment.
TRAVEL_MCP_HOST = os.getenv("TRAVEL_MCP_HOST", "localhost")
# The default is given as a string: os.getenv returns a str when the variable
# is set, so the constant now has the same type in both cases.
TRAVEL_MCP_PORT = os.getenv("TRAVEL_MCP_PORT", "10301")
7
+
8
async def main():
    """Connect to the travel MCP server, list its tools, and run one sample query.

    The server URL is built from TRAVEL_MCP_HOST and TRAVEL_MCP_PORT. The tool
    list and the first content item of the ``query_accommodation`` result are
    printed to stdout.
    """
    server_url = f"http://{TRAVEL_MCP_HOST}:{TRAVEL_MCP_PORT}/mcp"
    async with Client(server_url) as client:
        tools = await client.list_tools()
        print(f"Available tools: {tools}")

        result = await client.call_tool("query_accommodation", {"city": "Chicago"})
        # An empty content list, or a first item without a `.text` attribute,
        # previously raised IndexError / AttributeError. Print whatever is
        # available instead; the output is unchanged when `.text` exists.
        first_item = result.content[0] if result.content else None
        print(f"Result: {getattr(first_item, 'text', first_item)}")


if __name__ == "__main__":
    asyncio.run(main())