decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,249 @@
1
+ """HTTP helpers for the local GitHub sandbox.
2
+
3
+ Intended for judges that need to inspect the final state of the GitHub
4
+ environment (repositories, commits, issues, blobs) and for seeders that need
5
+ to reset or bulk-initialize the sandbox via its admin endpoints.
6
+
7
+ All functions talk to the sandbox through plain HTTP; they do not require a
8
+ logged-in user token because read endpoints are public.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import http.client
14
+ import json
15
+ import os
16
+ import urllib.parse as urlparse
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional, Tuple
19
+
20
+ import yaml
21
+
22
+
23
def _get_registry() -> dict:
    """Load the environment registry YAML, or return an empty dict.

    The registry is looked up at ``<root>/dt_arena/envs/registry.yaml``,
    where ``<root>`` is three directory levels above the directory that
    holds this module.

    Returns:
        The parsed registry when the file exists, parses, and has a mapping
        at its top level; otherwise ``{}``.
    """
    root = Path(__file__).resolve().parents[3]
    registry_file = root / "dt_arena" / "envs" / "registry.yaml"
    if not registry_file.exists():
        return {}
    try:
        loaded = yaml.safe_load(registry_file.read_text())
    except Exception:
        # Deliberately best-effort: an unreadable or malformed registry must
        # not stop callers from falling back to their built-in defaults.
        return {}
    # Valid YAML may still have a list or scalar at the top level; callers
    # call ``.get`` on the result, so only a mapping is passed through.
    return loaded if isinstance(loaded, dict) else {}
32
+
33
+
34
+ def _get_github_host_port() -> Tuple[str, int]:
35
+ env_port = os.environ.get("GITHUB_API_PORT")
36
+ if env_port:
37
+ return "127.0.0.1", int(env_port)
38
+ reg = _get_registry()
39
+ base = ((reg.get("services") or {}).get("github") or {}).get(
40
+ "api_base_url", "http://127.0.0.1:8045"
41
+ )
42
+ parsed = urlparse.urlparse(base)
43
+ host = parsed.hostname or "127.0.0.1"
44
+ port = parsed.port or (443 if (parsed.scheme or "http") == "https" else 80)
45
+ return host, port
46
+
47
+
48
def _request(
    method: str,
    path: str,
    *,
    body: Optional[Dict[str, Any]] = None,
    timeout: int = 15,
) -> Tuple[int, Any]:
    """Send one HTTP request to the GitHub sandbox and decode the reply.

    Args:
        method: HTTP verb; upper-cased before sending.
        path: Request target (path plus optional query string).
        body: Optional payload, serialized as JSON when not ``None``.
        timeout: Connection timeout passed to ``HTTPConnection``.

    Returns:
        ``(status, payload)`` where ``payload`` is ``{}`` for an empty
        response body, the parsed JSON when the body is valid JSON, and the
        decoded text otherwise.
    """
    target_host, target_port = _get_github_host_port()
    request_headers = {"Accept": "application/json"}
    encoded_body: Optional[str] = None
    if body is not None:
        request_headers["Content-Type"] = "application/json"
        encoded_body = json.dumps(body)

    connection = http.client.HTTPConnection(target_host, target_port, timeout=timeout)
    try:
        connection.request(
            method.upper(), path, body=encoded_body, headers=request_headers
        )
        response = connection.getresponse()
        raw = response.read()
        code = response.status
    finally:
        # Always release the socket, including when the request raised.
        connection.close()

    if not raw:
        return code, {}
    decoded = raw.decode("utf-8", errors="replace")
    try:
        parsed = json.loads(decoded)
    except json.JSONDecodeError:
        # Non-JSON bodies are handed back verbatim as text.
        return code, decoded
    return code, parsed
76
+
77
+
78
def reset_sandbox() -> Dict[str, Any]:
    """Restore the GitHub sandbox to its empty baseline.

    Returns:
        The response payload when it is a dict, otherwise ``{}``.

    Raises:
        RuntimeError: If the reset endpoint answers with status >= 400.
    """
    code, result = _request("POST", "/api/v1/reset", body={})
    if code >= 400:
        raise RuntimeError(f"GitHub reset_sandbox failed ({code}): {result}")
    if isinstance(result, dict):
        return result
    return {}
84
+
85
+
86
def init_from_json(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Bulk-seed the GitHub sandbox through ``/api/v1/init-json``.

    Args:
        payload: Seed document posted as the JSON request body.

    Returns:
        The response when it is a dict; any other response is wrapped as
        ``{"raw": <response>}``.

    Raises:
        RuntimeError: If the endpoint answers with status >= 400.
    """
    code, response = _request("POST", "/api/v1/init-json", body=payload)
    if code >= 400:
        raise RuntimeError(f"GitHub init_from_json failed ({code}): {response}")
    if isinstance(response, dict):
        return response
    return {"raw": response}
92
+
93
+
94
def list_repositories(*, limit: int = 100, q: Optional[str] = None) -> List[Dict[str, Any]]:
    """List repositories known to the sandbox.

    Args:
        limit: Value sent as the ``limit`` query parameter.
        q: Optional search string; sent as ``q`` only when truthy.

    Returns:
        The ``items`` list of the response, or ``[]`` when the status is
        not 200, the payload is not a dict, or ``items`` is missing/empty.
    """
    search_params: Dict[str, Any] = {"limit": str(limit)}
    if q:
        search_params["q"] = q
    encoded = urlparse.urlencode(search_params)
    code, data = _request("GET", f"/api/repos?{encoded}")
    if code == 200 and isinstance(data, dict):
        return data.get("items") or []
    return []
104
+
105
+
106
def list_tree(
    owner: str,
    repo: str,
    *,
    branch: Optional[str] = None,
    path: str = "",
) -> List[Dict[str, Any]]:
    """List the entries of one directory of a repository tree.

    Args:
        owner: Repository owner.
        repo: Repository name.
        branch: Optional branch; left out of the query when falsy.
        path: Directory inside the repo; left out of the query when empty.

    Returns:
        The response's ``items`` list (or ``tree`` when ``items`` is
        missing/empty). ``[]`` on a 404 or when the payload is not a dict.

    Raises:
        RuntimeError: On any status >= 400 other than 404.
    """
    params: Dict[str, Any] = {}
    if branch:
        params["branch"] = branch
    if path:
        params["path"] = path
    suffix = f"?{urlparse.urlencode(params)}" if params else ""
    # Percent-encode the path segments so names containing spaces, "#", "?"
    # or "/" cannot corrupt the request target.
    owner_seg = urlparse.quote(owner, safe="")
    repo_seg = urlparse.quote(repo, safe="")
    status, payload = _request(
        "GET", f"/api/repos/{owner_seg}/{repo_seg}/tree{suffix}"
    )
    if status == 404:
        return []
    if status >= 400:
        raise RuntimeError(
            f"GitHub list_tree {owner}/{repo} failed ({status}): {payload}"
        )
    if isinstance(payload, dict):
        return payload.get("items") or payload.get("tree") or []
    return []
131
+
132
+
133
def list_tree_recursive(owner: str, repo: str, *, branch: Optional[str] = None) -> List[str]:
    """Return every file path in the repo (best-effort walk).

    Walks the tree depth-first from the repository root via ``list_tree``.
    An entry counts as a directory when its ``isDir`` field is truthy or
    its ``type`` is ``"dir"`` or ``"tree"``; everything else is a file.

    Args:
        owner: Repository owner.
        repo: Repository name.
        branch: Optional branch forwarded to ``list_tree``.

    Returns:
        File paths in the order they were encountered.
    """
    collected: List[str] = []
    visited = set()  # directories already descended into

    def walk(sub_path: str) -> None:
        for item in list_tree(owner, repo, branch=branch, path=sub_path):
            p = item.get("path") or item.get("name")
            if not p:
                continue
            if sub_path and not item.get("path"):
                # Only a bare ``name`` was supplied, presumably relative to
                # the listed directory: re-attach the parent so recursion
                # and the reported path stay repo-relative.
                prefix = sub_path.rstrip("/") + "/"
                if not p.startswith(prefix):
                    p = prefix + p
            is_dir = item.get("isDir") or item.get("type") in ("dir", "tree")
            if not is_dir:
                collected.append(p)
            elif p not in visited:
                # Never descend into the same directory twice; a listing
                # that echoes an already-seen directory would otherwise
                # recurse without end.
                visited.add(p)
                walk(p)

    walk("")
    return collected
151
+
152
+
153
def get_file_contents(
    owner: str,
    repo: str,
    path: str,
    *,
    branch: Optional[str] = None,
) -> Optional[str]:
    """Fetch file contents from the sandbox.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: File path, sent as the ``path`` query parameter.
        branch: Optional branch; left out of the query when falsy.

    Returns:
        The ``content`` field of the blob (taken from the payload's
        ``blob`` key when present, otherwise from the payload itself).
        ``None`` when the file is absent (404) or the response does not
        have the expected dict shape.

    Raises:
        RuntimeError: On any status >= 400 other than 404.
    """
    params: Dict[str, Any] = {"path": path}
    if branch:
        params["branch"] = branch
    query = urlparse.urlencode(params)
    # Percent-encode the path segments so unusual owner/repo names cannot
    # corrupt the request target.
    owner_seg = urlparse.quote(owner, safe="")
    repo_seg = urlparse.quote(repo, safe="")
    status, payload = _request(
        "GET", f"/api/repos/{owner_seg}/{repo_seg}/blob?{query}"
    )
    if status == 404:
        return None
    if status >= 400:
        raise RuntimeError(
            f"GitHub get_file_contents {owner}/{repo}:{path} failed ({status}): {payload}"
        )
    if not isinstance(payload, dict):
        return None
    blob = payload.get("blob") if "blob" in payload else payload
    if isinstance(blob, dict):
        return blob.get("content")
    return None
178
+
179
+
180
def list_commits(
    owner: str,
    repo: str,
    *,
    branch: Optional[str] = None,
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """List commits of a repository.

    Args:
        owner: Repository owner.
        repo: Repository name.
        branch: Optional branch; left out of the query when falsy.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        The response's ``items`` list (or ``commits`` when ``items`` is
        missing/empty). ``[]`` on a 404 or when the payload is not a dict.

    Raises:
        RuntimeError: On any status >= 400 other than 404.
    """
    params: Dict[str, Any] = {"limit": str(limit)}
    if branch:
        params["branch"] = branch
    # Percent-encode the path segments so unusual owner/repo names cannot
    # corrupt the request target.
    owner_seg = urlparse.quote(owner, safe="")
    repo_seg = urlparse.quote(repo, safe="")
    status, payload = _request(
        "GET",
        f"/api/repos/{owner_seg}/{repo_seg}/commits?{urlparse.urlencode(params)}",
    )
    if status == 404:
        return []
    if status >= 400:
        raise RuntimeError(
            f"GitHub list_commits {owner}/{repo} failed ({status}): {payload}"
        )
    if isinstance(payload, dict):
        return payload.get("items") or payload.get("commits") or []
    return []
202
+
203
+
204
def list_issues(
    owner: str,
    repo: str,
    *,
    state: Optional[str] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """List issues of a repository.

    Args:
        owner: Repository owner.
        repo: Repository name.
        state: Optional state filter; left out of the query when falsy.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        The response's ``items`` list (or ``issues`` when ``items`` is
        missing/empty). ``[]`` on a 404 or when the payload is not a dict.

    Raises:
        RuntimeError: On any status >= 400 other than 404.
    """
    params: Dict[str, Any] = {"limit": str(limit)}
    if state:
        params["state"] = state
    # Percent-encode the path segments so unusual owner/repo names cannot
    # corrupt the request target.
    owner_seg = urlparse.quote(owner, safe="")
    repo_seg = urlparse.quote(repo, safe="")
    status, payload = _request(
        "GET",
        f"/api/repos/{owner_seg}/{repo_seg}/issues?{urlparse.urlencode(params)}",
    )
    if status == 404:
        return []
    if status >= 400:
        raise RuntimeError(
            f"GitHub list_issues {owner}/{repo} failed ({status}): {payload}"
        )
    if isinstance(payload, dict):
        return payload.get("items") or payload.get("issues") or []
    return []
226
+
227
+
228
def list_pull_requests(
    owner: str,
    repo: str,
    *,
    state: Optional[str] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """List pull requests of a repository.

    Args:
        owner: Repository owner.
        repo: Repository name.
        state: Optional state filter; left out of the query when falsy.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        The response's ``items`` list (or ``pulls`` when ``items`` is
        missing/empty). ``[]`` on a 404 or when the payload is not a dict.

    Raises:
        RuntimeError: On any status >= 400 other than 404.
    """
    params: Dict[str, Any] = {"limit": str(limit)}
    if state:
        params["state"] = state
    # Percent-encode the path segments so unusual owner/repo names cannot
    # corrupt the request target.
    owner_seg = urlparse.quote(owner, safe="")
    repo_seg = urlparse.quote(repo, safe="")
    status, payload = _request(
        "GET",
        f"/api/repos/{owner_seg}/{repo_seg}/pulls?{urlparse.urlencode(params)}",
    )
    if status == 404:
        return []
    if status >= 400:
        raise RuntimeError(
            f"GitHub list_pull_requests {owner}/{repo} failed ({status}): {payload}"
        )
    if isinstance(payload, dict):
        return payload.get("items") or payload.get("pulls") or []
    return []
@@ -0,0 +1 @@
1
+ """Gmail helpers package."""
@@ -0,0 +1,344 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import http.client
4
+ import time
5
+ import re
6
+ import urllib.parse as urlparse
7
+ from pathlib import Path
8
+ from typing import List, Dict, Any, Tuple, Optional
9
+ import yaml
10
+
11
+
12
+ def _normalize_amounts(text: str) -> str:
13
+ """Remove commas from numbers in text for flexible comparison.
14
+
15
+ This allows matching both '$385,000' and '$385000'.
16
+ """
17
+ # Remove commas between digits (e.g., 1,000 -> 1000)
18
+ return re.sub(r'(\d),(\d)', r'\1\2', text)
19
+
20
+
21
+ def _get_registry() -> dict:
22
+ root = Path(__file__).resolve().parents[3]
23
+ registry_file = root / "dt_arena" / "envs" / "registry.yaml"
24
+ if registry_file.exists():
25
+ try:
26
+ return yaml.safe_load(registry_file.read_text()) or {}
27
+ except Exception:
28
+ return {}
29
+ return {}
30
+
31
+
32
+ def _get_gmail_auth_port() -> Tuple[str, int]:
33
+ """Get Gmail auth API port (for login)."""
34
+ import os
35
+ env_port = os.environ.get("GMAIL_AUTH_PORT")
36
+ if env_port:
37
+ return "127.0.0.1", int(env_port)
38
+
39
+ reg = _get_registry()
40
+ gmail_base = ((reg.get("services") or {}).get("gmail") or {}).get(
41
+ "api_base_url", "http://127.0.0.1:8030"
42
+ )
43
+ parsed = urlparse.urlparse(gmail_base)
44
+ host = parsed.hostname or "127.0.0.1"
45
+ port = parsed.port or (443 if (parsed.scheme or "http") == "https" else 80)
46
+ return host, port
47
+
48
+
49
+ def _get_gmail_proxy_port() -> Tuple[str, int]:
50
+ """Get Gmail proxy API port (for search/messages)."""
51
+ import os
52
+ env_port = os.environ.get("GMAIL_PROXY_PORT")
53
+ if env_port:
54
+ return "127.0.0.1", int(env_port)
55
+
56
+ # Fall back to auth port + 1 (default proxy is 8031, auth is 8030)
57
+ host, auth_port = _get_gmail_auth_port()
58
+ return host, auth_port + 1
59
+
60
+
61
+ def _safe_read_response(resp: http.client.HTTPResponse) -> bytes:
62
+ """Read HTTP response body with tolerance for truncated transfers."""
63
+ try:
64
+ return resp.read()
65
+ except http.client.IncompleteRead as exc:
66
+ # Return the partial body so callers can still attempt JSON parsing.
67
+ return exc.partial or b""
68
+
69
+
70
+ def _normalize_message_shape(msg: Dict[str, Any]) -> Dict[str, Any]:
71
+ """Normalize Mailpit/proxy message fields for judge/helper consumers."""
72
+ def _to_text(value: Any) -> str:
73
+ if value is None:
74
+ return ""
75
+ if isinstance(value, str):
76
+ return value
77
+ if isinstance(value, (int, float, bool)):
78
+ return str(value)
79
+ if isinstance(value, list):
80
+ return ", ".join(_to_text(v) for v in value if _to_text(v))
81
+ if isinstance(value, dict):
82
+ # Common email object layouts:
83
+ # {"Address": "..."} / {"address": "..."} / {"email": "..."}
84
+ for k in ("Address", "address", "email", "Email", "Name", "name"):
85
+ if k in value:
86
+ text = _to_text(value.get(k))
87
+ if text:
88
+ return text
89
+ # Last resort: compact JSON string
90
+ try:
91
+ return json.dumps(value, ensure_ascii=False)
92
+ except Exception:
93
+ return str(value)
94
+ return str(value)
95
+
96
+ out = dict(msg)
97
+ # Ensure commonly-used aliases exist.
98
+ out["id"] = _to_text(out.get("id") or out.get("ID") or out.get("Id") or "")
99
+ out["subject"] = _to_text(out.get("subject") or out.get("Subject") or "")
100
+ out["Subject"] = out["subject"]
101
+ # Normalize recipient field to stable string for substring checks.
102
+ to_value = out.get("To") if "To" in out else out.get("to")
103
+ out["To"] = _to_text(to_value)
104
+ if "to" not in out:
105
+ out["to"] = out["To"]
106
+ # Normalize sender field to stable string for substring checks.
107
+ from_value = out.get("From") if "From" in out else out.get("from")
108
+ out["From"] = _to_text(from_value)
109
+ if "from" not in out:
110
+ out["from"] = out["From"]
111
+ return out
112
+
113
+
114
def login(email: str, password: str) -> str:
    """Log in to the Gmail auth API and return the access token.

    Args:
        email: Account email, sent in the JSON request body.
        password: Account password, sent in the JSON request body.

    Returns:
        The ``access_token`` from a 200 response, or ``""`` when the status
        is not 200, the body is not parseable JSON, or the token is missing
        or empty.

    Raises:
        Exceptions raised by ``http.client`` while connecting or exchanging
        the request (e.g. a refused connection or a timeout) are not caught
        here and propagate to the caller.
    """
    host, port = _get_gmail_auth_port()
    # Gmail auth API uses JSON request with email field
    body = json.dumps({"email": email, "password": password})
    headers = {"Content-Type": "application/json"}
    conn = http.client.HTTPConnection(host, port, timeout=5)
    try:
        conn.request("POST", "/api/v1/auth/login", body=body, headers=headers)
        resp = conn.getresponse()
        data = _safe_read_response(resp)
    finally:
        # Release the socket even when the request or read raises.
        conn.close()

    if resp.status != 200:
        return ""
    try:
        return json.loads(data.decode()).get("access_token") or ""
    except Exception:
        # Best-effort: any malformed payload counts as a failed login.
        return ""
130
+
131
+
132
def get_message_body(token: str, message_id: str) -> str:
    """Get the full body of a message by ID.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        message_id: Message identifier, placed in the request path as-is.

    Returns:
        The first non-empty value among the ``Text``, ``text`` and ``body``
        fields of the JSON response, or ``""`` when the status is not 200,
        the payload cannot be parsed, or none of those fields is set.

    Raises:
        Exceptions raised by ``http.client`` while connecting or exchanging
        the request are not caught here and propagate to the caller.
    """
    host, port = _get_gmail_proxy_port()
    headers = {"Authorization": f"Bearer {token}"}
    conn = http.client.HTTPConnection(host, port, timeout=5)
    try:
        # Note: singular "message" endpoint, not plural "messages"
        conn.request("GET", f"/api/v1/message/{message_id}", headers=headers)
        resp = conn.getresponse()
        raw = _safe_read_response(resp)
    finally:
        # Release the socket even when the request or read raises.
        conn.close()

    if resp.status != 200:
        return ""
    try:
        data = json.loads(raw.decode())
        # Mailpit returns body in "Text" field
        return data.get("Text") or data.get("text") or data.get("body") or ""
    except Exception:
        # Best-effort: an unparseable payload is reported as "no body".
        return ""
151
+
152
+
153
def get_message_subject(token: str, message_id: str) -> str:
    """Get the subject of a message by ID.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        message_id: Message identifier, placed in the request path as-is.

    Returns:
        The ``Subject`` (or ``subject``) field of the JSON response, or
        ``""`` when the status is not 200, the payload cannot be parsed, or
        neither field is set.

    Raises:
        Exceptions raised by ``http.client`` while connecting or exchanging
        the request are not caught here and propagate to the caller.
    """
    host, port = _get_gmail_proxy_port()
    headers = {"Authorization": f"Bearer {token}"}
    conn = http.client.HTTPConnection(host, port, timeout=5)
    try:
        # Note: singular "message" endpoint, not plural "messages"
        conn.request("GET", f"/api/v1/message/{message_id}", headers=headers)
        resp = conn.getresponse()
        raw = _safe_read_response(resp)
    finally:
        # Release the socket even when the request or read raises.
        conn.close()

    if resp.status != 200:
        return ""
    try:
        data = json.loads(raw.decode())
        return data.get("Subject") or data.get("subject") or ""
    except Exception:
        # Best-effort: an unparseable payload is reported as "no subject".
        return ""
171
+
172
+
173
def search_emails_as(
    email: str,
    password: str,
    *,
    to_address: Optional[str] = None,
    from_address: Optional[str] = None,
    subject_contains: Optional[str | List[str]] = None,
    body_contains: Optional[str | List[str]] = None,
    limit: int = 50,
    include_body: bool = True,
) -> List[Dict[str, Any]]:
    """Authenticate as ``email`` and run a mailbox search in one call.

    Meant for judges that only want to look at the final mailbox state and
    have no reason to handle auth tokens themselves. All filter arguments
    are forwarded unchanged to ``search_messages``.

    Returns:
        The search results, or ``[]`` when ``login`` yields no token.
    """
    token = login(email, password)
    if token:
        filters = {
            "to_address": to_address,
            "from_address": from_address,
            "subject_contains": subject_contains,
            "body_contains": body_contains,
            "limit": limit,
            "include_body": include_body,
        }
        return search_messages(token=token, **filters)
    return []
201
+
202
+
203
def search_messages(
    token: str,
    to_address: Optional[str] = None,
    from_address: Optional[str] = None,
    subject_contains: Optional[str | List[str]] = None,
    body_contains: Optional[str | List[str]] = None,
    limit: int = 50,
    include_body: bool = False,
) -> List[Dict[str, Any]]:
    """Search messages via Gmail proxy API with user authentication.

    The first subject/body term is sent to the server; any further terms are
    checked client-side against the full subject/body so that ALL terms must
    appear in the same message. Because mail delivery/indexing is eventually
    consistent in the sandbox, the search is attempted up to four times with
    a 0.4 s pause between attempts, stopping at the first non-empty result.

    Args:
        token: User authentication token
        to_address: Filter by recipient email
        from_address: Filter by sender email
        subject_contains: Filter by subject containing this string, or a list of strings
            that must ALL appear in the same message subject
        body_contains: Filter by body containing this string, or a list of strings
            that must ALL appear in the same message body (extra terms are
            compared with commas inside numbers removed on both sides)
        limit: Maximum number of results
        include_body: Accepted for interface compatibility. The full body is
            always fetched and attached, regardless of this flag, because
            list snippets can omit terms that judges look for.

    Returns:
        List of matching, shape-normalized messages, each with a 'body'
        field. Messages without an ID are skipped. ``[]`` when every attempt
        fails or matches nothing.

    Raises:
        Exceptions raised by ``http.client`` while connecting or exchanging
        a request are not caught here and propagate to the caller.
    """
    host, port = _get_gmail_proxy_port()

    def _as_terms(value: Any) -> List[str]:
        """Normalize a ``str | list | None`` filter into a list of terms."""
        if not value:
            return []
        return [value] if isinstance(value, str) else list(value)

    subject_terms = _as_terms(subject_contains)
    body_terms = _as_terms(body_contains)

    # Build query params (without token - that goes in header)
    query: Dict[str, Any] = {"limit": str(limit)}
    if to_address:
        query["to"] = to_address
    if from_address:
        query["from_address"] = from_address
    # For multiple terms, search with the first term server-side,
    # then filter client-side for the remaining terms
    if subject_terms:
        query["subject_contains"] = subject_terms[0]
    if body_terms:
        query["body_contains"] = body_terms[0]
    params = urlparse.urlencode(query)

    # Use Authorization header for token
    headers = {"Authorization": f"Bearer {token}"}
    # When no explicit search terms are provided, list messages directly.
    # Some Mailpit builds return empty results for /search without query.
    has_search_terms = bool(to_address or from_address or subject_terms or body_terms)
    endpoint = "search" if has_search_terms else "messages"
    path = f"/api/v1/{endpoint}?{params}"

    # Loop-invariant: prepare the client-side terms once, not per message.
    remaining_subject_terms = [term.lower() for term in subject_terms[1:]]
    remaining_body_terms = [
        _normalize_amounts(term.lower()) for term in body_terms[1:]
    ]

    # Mail delivery/indexing is eventually consistent in sandbox; retry briefly.
    for attempt in range(4):
        if attempt:
            # Pause only between attempts, never after the last one.
            time.sleep(0.4)

        conn = http.client.HTTPConnection(host, port, timeout=5)
        try:
            conn.request("GET", path, headers=headers)
            resp = conn.getresponse()
            raw = _safe_read_response(resp)
        finally:
            # Release the socket even when the request or read raises.
            conn.close()

        if resp.status != 200:
            continue
        try:
            data = json.loads(raw.decode())
        except Exception:
            # Best-effort: an unparseable payload counts as "no messages".
            data = []

        # Mailpit returns {"messages": [...], "total": N}
        if isinstance(data, dict) and "messages" in data:
            messages = data.get("messages") or []
        else:
            messages = data if isinstance(data, list) else []
        messages = [_normalize_message_shape(m) for m in messages if isinstance(m, dict)]

        filtered = []
        for msg in messages:
            msg_id = msg.get("ID") or msg.get("id")
            if not msg_id:
                continue

            # Check remaining subject terms against the full subject.
            if remaining_subject_terms:
                subject_lower = get_message_subject(token, msg_id).lower()
                if not all(term in subject_lower for term in remaining_subject_terms):
                    continue

            # Fetch full bodies for stable judge checks across tasks.
            # Mailpit snippets can omit key finance/security terms.
            full_body = get_message_body(token, msg_id)

            # Check remaining body terms (with amount normalization for
            # flexible matching, e.g. '$385,000' vs '$385000').
            if remaining_body_terms:
                body_normalized = _normalize_amounts(full_body.lower())
                if not all(term in body_normalized for term in remaining_body_terms):
                    continue

            msg["body"] = full_body
            filtered.append(msg)

        if filtered:
            return filtered
    return []
@@ -0,0 +1,2 @@
1
+ """Google Form helpers package."""
2
+