decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,719 @@
1
+ """
2
+ Salesforce CRM helpers for end-to-end verification in judges.
3
+
4
+ These helpers directly query the Salesforce API to verify that the agent's
5
+ actions have had the expected effect on the environment state.
6
+
7
+ Usage in judges:
8
+ from dt_arena.utils.salesforce.helpers import configure_salesforce, search_opportunities
9
+ from dt_arena.utils.utils import get_mcp_env_var
10
+
11
+ class Judge:
12
+ def __init__(self):
13
+ config = load_task_config(_task_dir)
14
+ configure_salesforce(
15
+ base_url=get_mcp_env_var(config, "salesforce", "SALESFORCE_BASE_URL"),
16
+ grant_type=get_mcp_env_var(config, "salesforce", "SALESFORCE_GRANT_TYPE"),
17
+ client_id=get_mcp_env_var(config, "salesforce", "SALESFORCE_CLIENT_ID"),
18
+ client_secret=get_mcp_env_var(config, "salesforce", "SALESFORCE_CLIENT_SECRET"),
19
+ )
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import http.client
24
+ import json
25
+ import urllib.parse as urlparse
26
+ from typing import List, Dict, Any, Tuple, Optional
27
+
28
+
29
+ # Module-level configuration - set by configure_salesforce()
30
+ _salesforce_config: Dict[str, str] = {}
31
+
32
+
33
def configure_salesforce(
    base_url: Optional[str] = None,
    grant_type: Optional[str] = None,
    client_id: Optional[str] = None,
    client_secret: Optional[str] = None,
    username: Optional[str] = None,
    password: Optional[str] = None,
    access_token: Optional[str] = None,
) -> None:
    """Record the Salesforce connection settings used by the helpers in this module.

    Intended to be called once (e.g. from a judge's ``__init__``) before any
    other helper. Each call replaces the whole module-level configuration,
    so a previously stored access token is discarded unless passed again.

    Args:
        base_url: Salesforce API base URL (e.g., "http://127.0.0.1:8080").
            Trailing slashes are stripped.
        grant_type: OAuth grant type ("client_credentials" or "password").
            Defaults to "client_credentials" when omitted or empty.
        client_id: OAuth client ID.
        client_secret: OAuth client secret.
        username: Username for the password grant.
        password: Password for the password grant.
        access_token: Pre-existing access token (optional).
    """
    global _salesforce_config

    # Missing or empty values are stored as "" so later lookups never see None.
    normalized_url = base_url.rstrip("/") if base_url else ""
    settings: Dict[str, str] = dict(
        base_url=normalized_url,
        grant_type=grant_type if grant_type else "client_credentials",
        client_id=client_id if client_id else "",
        client_secret=client_secret if client_secret else "",
        username=username if username else "",
        password=password if password else "",
        access_token=access_token if access_token else "",
    )
    _salesforce_config = settings
67
+
68
+
69
def _get_config() -> Dict[str, str]:
    """Return the module-level Salesforce configuration.

    Returns:
        The dictionary populated by configure_salesforce().

    Raises:
        RuntimeError: If the configuration is empty or has no base URL.
    """
    configured_url = _salesforce_config.get("base_url") if _salesforce_config else None
    if configured_url:
        return _salesforce_config

    raise RuntimeError(
        "Salesforce not configured. Call configure_salesforce() first in your judge's __init__. "
        "See module docstring for usage example."
    )
81
+
82
+
83
def _get_host_port(base_url: str) -> Tuple[str, int]:
    """Parse the host and port out of a base URL.

    Args:
        base_url: URL such as "http://127.0.0.1:8080". A value without a
            scheme (e.g. "127.0.0.1:8080" or "localhost:8080") is also
            accepted and treated as a bare network location.

    Returns:
        A ``(host, port)`` tuple. The host falls back to "127.0.0.1" when
        none can be parsed; the port falls back to 443 for the "https"
        scheme and 80 otherwise.
    """
    # Without "//", urlparse puts "host:port" into the scheme/path instead of
    # the netloc, which used to lose the configured host and port entirely.
    if "://" not in base_url and not base_url.startswith("//"):
        base_url = f"//{base_url}"

    parsed = urlparse.urlparse(base_url)
    host = parsed.hostname or "127.0.0.1"
    port = parsed.port or (443 if parsed.scheme == "https" else 80)
    return host, port
89
+
90
+
91
def _get_access_token() -> str:
    """Return an access token for the Salesforce API, requesting one if needed.

    A token already present in the module configuration is returned as-is.
    Otherwise a form-encoded POST is sent to ``/Api/access_token`` using the
    configured grant type, and the resulting token is cached in the module
    configuration for later calls.

    Returns:
        The access token string.

    Raises:
        RuntimeError: If Salesforce is not configured, or if the token
            endpoint does not answer 200 with a JSON object containing an
            ``access_token`` or ``token`` value.
    """
    config = _get_config()

    # Use the cached token if available.
    if config["access_token"]:
        return config["access_token"]

    host, port = _get_host_port(config["base_url"])

    form = {
        "grant_type": "client_credentials",
        "client_id": config["client_id"],
        "client_secret": config["client_secret"],
    }
    if config["grant_type"] == "password":
        form = {
            "grant_type": "password",
            "client_id": config["client_id"],
            "client_secret": config["client_secret"],
            "username": config["username"],
            "password": config["password"],
        }
    body = urlparse.urlencode(form)

    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json",
    }

    # Honour the configured scheme: an https:// base URL needs a TLS connection.
    use_tls = urlparse.urlparse(config["base_url"]).scheme == "https"
    connection_cls = http.client.HTTPSConnection if use_tls else http.client.HTTPConnection
    conn = connection_cls(host, port, timeout=30)
    try:
        conn.request("POST", "/Api/access_token", body=body, headers=headers)
        resp = conn.getresponse()
        status = resp.status
        data = resp.read()
    finally:
        # Always release the socket, even when the request itself fails.
        conn.close()

    # Decode leniently so a non-UTF-8 body cannot mask the real failure.
    text = data.decode("utf-8", errors="replace")

    if status == 200:
        try:
            result = json.loads(text)
        except ValueError:
            result = None
        if isinstance(result, dict):
            token = result.get("access_token") or result.get("token")
            if token:
                # Cache in module config.
                _salesforce_config["access_token"] = token
                return token

    raise RuntimeError(f"Failed to obtain Salesforce access token: {status} {text[:200]}")
141
+
142
+
143
def _api_request(
    method: str,
    path: str,
    token: Optional[str] = None,
    body: Optional[Dict[str, Any]] = None,
    params: Optional[Dict[str, Any]] = None,
    _retried: bool = False,
) -> Dict[str, Any]:
    """Make an API request to Salesforce and return the decoded result.

    Args:
        method: HTTP method (e.g. "GET").
        path: Request path (e.g. "/Api/V8/module/Leads").
        token: Bearer token to use; fetched via _get_access_token() when None.
        body: Optional JSON payload; sent only when truthy.
        params: Optional query parameters, URL-encoded onto the path.
        _retried: Internal flag marking the single retry after a 401.

    Returns:
        For 2xx responses: the parsed JSON body, ``{"ok": True, "text": ...}``
        when the body is not valid JSON, or ``{"ok": True}`` when it is empty.
        For any other status: a dict with ``error``, ``status_code`` and the
        first 500 characters of the body under ``text``.

    Raises:
        RuntimeError: If Salesforce is not configured or no token can be
            obtained.
    """
    config = _get_config()
    host, port = _get_host_port(config["base_url"])

    if token is None:
        token = _get_access_token()

    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.api+json",
        "Content-Type": "application/vnd.api+json",
    }

    url = f"{path}?{urlparse.urlencode(params)}" if params else path
    payload = json.dumps(body) if body else None

    # Honour the configured scheme: an https:// base URL needs a TLS connection.
    use_tls = urlparse.urlparse(config["base_url"]).scheme == "https"
    connection_cls = http.client.HTTPSConnection if use_tls else http.client.HTTPConnection
    conn = connection_cls(host, port, timeout=30)
    try:
        conn.request(method, url, body=payload, headers=headers)
        resp = conn.getresponse()
        status = resp.status
        data = resp.read()
    finally:
        # Always release the socket, even when the request itself fails.
        conn.close()

    # On 401, clear cached token and retry once with a fresh token.
    # reset-database drops oauth2tokens, invalidating any cached token.
    if status == 401 and not _retried:
        _salesforce_config["access_token"] = ""
        return _api_request(method, path, token=None, body=body, params=params, _retried=True)

    # Decode leniently so a non-UTF-8 body cannot raise from the fallback paths.
    text = data.decode("utf-8", errors="replace") if data else ""

    if 200 <= status < 300:
        if not data:
            return {"ok": True}
        try:
            return json.loads(text)
        except ValueError:
            return {"ok": True, "text": text}

    return {
        "error": f"HTTP {status}",
        "status_code": status,
        "text": text[:500],
    }
199
+
200
+
201
+ # ============ Lead Operations ============
202
+
203
def list_leads(page: int = 1, page_size: int = 50) -> List[Dict[str, Any]]:
    """Fetch one page of leads from the Salesforce environment.

    Args:
        page: Page number sent as ``page[number]``.
        page_size: Number of records requested, sent as ``page[size]``.

    Returns:
        The ``data`` list from the response, or an empty list when the
        request reports an error or the response carries no ``data``.
    """
    response = _api_request(
        "GET",
        "/Api/V8/module/Leads",
        params={"page[number]": page, "page[size]": page_size},
    )
    return [] if "error" in response else response.get("data", [])
217
+
218
+
219
def get_lead_by_id(lead_id: str) -> Optional[Dict[str, Any]]:
    """Look up a single lead by its ID.

    Args:
        lead_id: The GUID of the lead.

    Returns:
        The ``data`` entry of the response, or None when the request
        reports an error or the response carries no ``data``.
    """
    response = _api_request("GET", f"/Api/V8/module/Leads/{lead_id}")
    return None if "error" in response else response.get("data")
232
+
233
+
234
def search_leads_by_name(
    first_name: Optional[str] = None,
    last_name: Optional[str] = None,
    account_name: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search leads using exact-match filters on name fields.

    Only arguments with a truthy value are sent as filters; the first page
    of up to 50 records is requested.

    Args:
        first_name: First name to search for.
        last_name: Last name to search for.
        account_name: Company/account name to search for.

    Returns:
        The matching lead records, or an empty list when the request
        reports an error.
    """
    candidate_filters = {
        "filter[first_name][eq]": first_name,
        "filter[last_name][eq]": last_name,
        "filter[account_name][eq]": account_name,
    }
    query: Dict[str, Any] = {"page[number]": 1, "page[size]": 50}
    # Keep only the filters the caller actually supplied.
    query.update({key: value for key, value in candidate_filters.items() if value})

    response = _api_request("GET", "/Api/V8/module/Leads", params=query)
    return [] if "error" in response else response.get("data", [])
266
+
267
+
268
def find_lead_by_email(email: str) -> Optional[Dict[str, Any]]:
    """Return the first lead whose ``email1`` field equals the given address.

    Args:
        email: Email address to search for.

    Returns:
        The first matching lead record, or None when the request reports
        an error or nothing matches.
    """
    response = _api_request(
        "GET",
        "/Api/V8/module/Leads",
        params={
            "page[number]": 1,
            "page[size]": 10,
            "filter[email1][eq]": email,
        },
    )
    if "error" in response:
        return None

    matches = response.get("data", [])
    if not matches:
        return None
    return matches[0]
287
+
288
+
289
def check_lead_exists(
    first_name: str,
    last_name: str,
    email: Optional[str] = None,
) -> bool:
    """Report whether a lead with the given name (and optionally email) exists.

    Args:
        first_name: Lead's first name.
        last_name: Lead's last name.
        email: Optional email; when given, a name match must also have this
            exact ``email1`` value.

    Returns:
        True if a matching lead exists, False otherwise.
    """
    matches = search_leads_by_name(first_name=first_name, last_name=last_name)
    if not matches:
        return False

    # A name match alone is enough when no email was requested.
    if not email:
        return True

    return any(
        record.get("attributes", {}).get("email1") == email
        for record in matches
    )
316
+
317
+
318
def check_lead_status(lead_id: str, expected_status: str) -> bool:
    """Report whether the lead's ``status`` attribute equals the expected value.

    Args:
        lead_id: The GUID of the lead.
        expected_status: Expected status value (e.g., "New", "In Process", "Converted").

    Returns:
        True if the lead was found and has the expected status, False otherwise.
    """
    record = get_lead_by_id(lead_id)
    if record:
        return record.get("attributes", {}).get("status") == expected_status
    return False
333
+
334
+
335
def check_lead_description_contains(lead_id: str, text: str) -> bool:
    """Report whether the lead's description contains the text, ignoring case.

    Args:
        lead_id: The GUID of the lead.
        text: Text to search for in the description.

    Returns:
        True if the lead was found and its description contains the text,
        False otherwise.
    """
    record = get_lead_by_id(lead_id)
    if record:
        # A missing or null description is treated as an empty string.
        body_text = record.get("attributes", {}).get("description", "") or ""
        return text.lower() in body_text.lower()
    return False
351
+
352
+
353
+ # ============ Contact Operations ============
354
+
355
def list_contacts(page: int = 1, page_size: int = 50) -> List[Dict[str, Any]]:
    """Fetch one page of contacts from the Salesforce environment.

    Args:
        page: Page number sent as ``page[number]``.
        page_size: Number of records requested, sent as ``page[size]``.

    Returns:
        The ``data`` list from the response, or an empty list when the
        request reports an error or the response carries no ``data``.
    """
    response = _api_request(
        "GET",
        "/Api/V8/module/Contacts",
        params={"page[number]": page, "page[size]": page_size},
    )
    return [] if "error" in response else response.get("data", [])
369
+
370
+
371
def get_contact_by_id(contact_id: str) -> Optional[Dict[str, Any]]:
    """Look up a single contact by its ID.

    Args:
        contact_id: The GUID of the contact.

    Returns:
        The ``data`` entry of the response, or None when the request
        reports an error or the response carries no ``data``.
    """
    response = _api_request("GET", f"/Api/V8/module/Contacts/{contact_id}")
    return None if "error" in response else response.get("data")
384
+
385
+
386
def search_contacts_by_name(
    first_name: Optional[str] = None,
    last_name: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Look up contacts whose first and/or last name equals the given values.

    Only truthy arguments become filters; when neither is supplied the
    request is sent without any name filter. The request asks for page 1
    with a page size of 50.

    Args:
        first_name: First name to match exactly, if any.
        last_name: Last name to match exactly, if any.

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    query: Dict[str, Any] = {"page[number]": 1, "page[size]": 50}
    optional_filters = {
        "filter[first_name][eq]": first_name,
        "filter[last_name][eq]": last_name,
    }
    # Skip filters whose value was not supplied (None or empty string).
    query.update({key: value for key, value in optional_filters.items() if value})

    response = _api_request("GET", "/Api/V8/module/Contacts", params=query)
    return [] if "error" in response else response.get("data", [])
412
+
413
+
414
def check_contact_exists(
    first_name: str,
    last_name: str,
    email: Optional[str] = None,
) -> bool:
    """Check if a contact with the given name (and optionally email) exists.

    Candidates come from ``search_contacts_by_name``, so only the records
    returned by that search are considered.

    Args:
        first_name: Contact's first name.
        last_name: Contact's last name.
        email: Optional email; when given, at least one candidate's
            ``email1`` attribute must equal it exactly.

    Returns:
        True if a matching contact exists, False otherwise.
    """
    contacts = search_contacts_by_name(first_name=first_name, last_name=last_name)
    if not contacts:
        return False

    if not email:
        # The name search already produced at least one candidate.
        return True

    # ``attributes`` may be present but null; fall back to an empty mapping
    # so one malformed record cannot abort the whole check.
    return any(
        (contact.get("attributes") or {}).get("email1") == email
        for contact in contacts
    )
441
+
442
+
443
+ # ============ Account Operations ============
444
+
445
def list_accounts(page: int = 1, page_size: int = 50) -> List[Dict[str, Any]]:
    """Fetch one page of accounts from the Salesforce environment.

    Args:
        page: Page number sent as ``page[number]``.
        page_size: Number of records requested per page (``page[size]``).

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    response = _api_request(
        "GET",
        "/Api/V8/module/Accounts",
        params={"page[number]": page, "page[size]": page_size},
    )
    return [] if "error" in response else response.get("data", [])
459
+
460
+
461
def get_account_by_id(account_id: str) -> Optional[Dict[str, Any]]:
    """Fetch a single account record by its GUID.

    Args:
        account_id: The GUID of the account.

    Returns:
        The ``data`` entry of the response, or None when the response
        carries an ``error`` key or has no ``data``.
    """
    response = _api_request("GET", f"/Api/V8/module/Accounts/{account_id}")
    return None if "error" in response else response.get("data")
474
+
475
+
476
def search_accounts_by_name(name: str) -> List[Dict[str, Any]]:
    """Look up accounts whose name contains ``name``.

    The value is wrapped in ``%`` wildcards and sent as a ``like`` filter,
    requesting page 1 with a page size of 50.

    Args:
        name: Text to look for within account names.

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    query = {
        "page[number]": 1,
        "page[size]": 50,
        "filter[name][like]": f"%{name}%",
    }
    response = _api_request("GET", "/Api/V8/module/Accounts", params=query)
    return [] if "error" in response else response.get("data", [])
494
+
495
+
496
def check_account_exists(name: str) -> bool:
    """Check if an account matching the given name exists.

    Matching is delegated to ``search_accounts_by_name``, which performs a
    substring (``like``) search rather than an exact comparison.

    Args:
        name: Account name (or fragment of it) to search for.

    Returns:
        True if the search returns at least one account, False otherwise.
        An empty or whitespace-only ``name`` always yields False.
    """
    # A blank name would become the pattern "%%" (or "% %") in the substring
    # search and could match unrelated accounts, so reject it up front.
    if not name or not name.strip():
        return False
    return bool(search_accounts_by_name(name))
507
+
508
+
509
+ # ============ Opportunity Operations ============
510
+
511
def list_opportunities(page: int = 1, page_size: int = 50) -> List[Dict[str, Any]]:
    """Fetch one page of opportunities from the Salesforce environment.

    Args:
        page: Page number sent as ``page[number]``.
        page_size: Number of records requested per page (``page[size]``).

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    response = _api_request(
        "GET",
        "/Api/V8/module/Opportunities",
        params={"page[number]": page, "page[size]": page_size},
    )
    return [] if "error" in response else response.get("data", [])
525
+
526
+
527
def get_opportunity_by_id(opportunity_id: str) -> Optional[Dict[str, Any]]:
    """Fetch a single opportunity record by its GUID.

    Args:
        opportunity_id: The GUID of the opportunity.

    Returns:
        The ``data`` entry of the response, or None when the response
        carries an ``error`` key or has no ``data``.
    """
    response = _api_request(
        "GET", f"/Api/V8/module/Opportunities/{opportunity_id}"
    )
    return None if "error" in response else response.get("data")
540
+
541
+
542
def search_opportunities(search_term: str) -> List[Dict[str, Any]]:
    """Look up opportunities whose name contains ``search_term``.

    The term is wrapped in ``%`` wildcards and sent as a ``like`` filter,
    requesting page 1 with a page size of 50.

    Args:
        search_term: Text to look for within opportunity names.

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    query = {
        "page[number]": 1,
        "page[size]": 50,
        "filter[name][like]": f"%{search_term}%",
    }
    response = _api_request("GET", "/Api/V8/module/Opportunities", params=query)
    return [] if "error" in response else response.get("data", [])
560
+
561
+
562
+ # ============ Generic Record Operations ============
563
+
564
def get_record(module: str, record_id: str) -> Optional[Dict[str, Any]]:
    """Get any record by module and ID.

    Args:
        module: Module name (e.g., "Leads", "Contacts", "Accounts").
        record_id: Record GUID.

    Returns:
        Record data, or None if the request reports an error, the response
        has no ``data``, or either argument is empty.
    """
    # An empty segment would produce a path that no longer addresses a single
    # record (e.g. ".../module/Leads/"), so answer "not found" without
    # issuing the request.
    if not module or not record_id:
        return None

    result = _api_request("GET", f"/Api/V8/module/{module}/{record_id}")
    if "error" in result:
        return None
    return result.get("data")
578
+
579
+
580
def record_exists(module: str, record_id: str) -> bool:
    """Report whether ``get_record`` finds a record for the module and ID.

    Args:
        module: Module name.
        record_id: Record GUID.

    Returns:
        True when ``get_record`` returns something other than None,
        False otherwise.
    """
    return get_record(module, record_id) is not None
592
+
593
+
594
def get_relationships(
    module: str,
    record_id: str,
    relationship_name: str,
) -> List[Dict[str, Any]]:
    """Get related records for a given relationship.

    Args:
        module: Source module name.
        record_id: Source record GUID.
        relationship_name: Name of the relationship link.

    Returns:
        List of related records; empty if the request reports an error, the
        response has no ``data``, or any argument is empty.
    """
    # Every argument becomes a path segment; an empty one would yield a
    # malformed URL, so skip the request in that case.
    if not (module and record_id and relationship_name):
        return []

    result = _api_request(
        "GET",
        f"/Api/V8/module/{module}/{record_id}/relationships/{relationship_name}",
    )
    if "error" in result:
        return []
    return result.get("data", [])
616
+
617
+
618
+ # ============ Additional Helper Functions ============
619
+
620
def list_leads_by_company(company_name: str) -> List[Dict[str, Any]]:
    """Return the leads associated with a company.

    Thin wrapper that forwards ``company_name`` to ``search_leads_by_name``
    as its ``account_name`` argument.

    Args:
        company_name: The company/account name to filter by.

    Returns:
        Whatever ``search_leads_by_name`` returns for that account name.
    """
    matching_leads = search_leads_by_name(account_name=company_name)
    return matching_leads
630
+
631
+
632
def find_contact_by_email(email: str) -> Optional[Dict[str, Any]]:
    """Find a contact by email address.

    Sends an exact-match (``eq``) filter on ``email1`` and returns the first
    record of the response.

    Args:
        email: Email address to search for.

    Returns:
        Contact record, or None if nothing matched, the request reported an
        error, or ``email`` is empty.
    """
    # An empty address would turn into an ``eq ""`` filter, which could
    # return an unrelated contact that simply has no email on record.
    if not email:
        return None

    params = {
        "page[number]": 1,
        "page[size]": 10,
        "filter[email1][eq]": email,
    }
    result = _api_request("GET", "/Api/V8/module/Contacts", params=params)
    if "error" in result:
        return None
    data = result.get("data", [])
    return data[0] if data else None
651
+
652
+
653
def list_accounts_by_industry(industry: str) -> List[Dict[str, Any]]:
    """Fetch accounts whose ``industry`` field equals the given value.

    The request uses an exact-match (``eq``) filter and asks for page 1
    with a page size of 50.

    Args:
        industry: The industry to filter by (e.g., "Healthcare", "Technology").

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    query = {
        "page[number]": 1,
        "page[size]": 50,
        "filter[industry][eq]": industry,
    }
    response = _api_request("GET", "/Api/V8/module/Accounts", params=query)
    return [] if "error" in response else response.get("data", [])
671
+
672
+
673
def find_account_by_name(name: str) -> Optional[Dict[str, Any]]:
    """Return the first account whose name equals ``name`` exactly.

    The request uses an exact-match (``eq``) filter and asks for page 1
    with a page size of 10.

    Args:
        name: Account name to match.

    Returns:
        The first record in the response's ``data``, or None when the
        response carries an ``error`` key or ``data`` is empty or missing.
    """
    query = {
        "page[number]": 1,
        "page[size]": 10,
        "filter[name][eq]": name,
    }
    response = _api_request("GET", "/Api/V8/module/Accounts", params=query)
    if "error" in response:
        return None

    matches = response.get("data", [])
    if matches:
        return matches[0]
    return None
692
+
693
+
694
def list_contacts_for_account(account_id: str) -> List[Dict[str, Any]]:
    """Return the contacts linked to an account.

    Thin wrapper around ``get_relationships`` for the ``contacts``
    relationship of the ``Accounts`` module.

    Args:
        account_id: The GUID of the account.

    Returns:
        Whatever ``get_relationships`` returns for that relationship.
    """
    linked_contacts = get_relationships("Accounts", account_id, "contacts")
    return linked_contacts
704
+
705
+
706
def list_cases(page: int = 1, page_size: int = 50) -> List[Dict[str, Any]]:
    """Fetch one page of cases from the Salesforce environment.

    Args:
        page: Page number sent as ``page[number]``.
        page_size: Number of records requested per page (``page[size]``).

    Returns:
        The ``data`` entry of the response, or an empty list when the
        response carries an ``error`` key or has no ``data``.
    """
    response = _api_request(
        "GET",
        "/Api/V8/module/Cases",
        params={"page[number]": page, "page[size]": page_size},
    )
    return [] if "error" in response else response.get("data", [])