decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
utils/agent_helpers.py ADDED
@@ -0,0 +1,308 @@
1
+ from typing import Any, Dict, List, Optional, Type
2
+
3
+
4
# Native Claude SDK tools that are disabled by default for the
# "os-filesystem" domain (see get_default_disallowed_tools).
OS_FILESYSTEM_CLAUDE_SDK_DISALLOWED_TOOLS = [
    "Bash", "Read", "Write", "Edit", "MultiEdit",
    "Glob", "Grep", "NotebookEdit", "AskUserQuestion",
]


# Native OpenClaw tool groups / tools that are disabled by default for the
# "os-filesystem" domain (see get_default_disallowed_tools).
OS_FILESYSTEM_OPENCLAW_DISALLOWED_TOOLS = [
    "group:fs", "group:runtime", "group:web", "group:memory",
    "group:ui", "group:sessions", "group:automation", "group:messaging",
    "agents_list", "image", "nodes", "tts",
]


def get_default_disallowed_tools(agent_type: Optional[str], domain: Optional[str]) -> Optional[List[str]]:
    """Look up the default native-tool deny list for an agent/domain pair.

    Matching is case-insensitive. Only the ``"os-filesystem"`` domain has
    defaults, and only for the ``"claudesdk"`` and ``"openclaw"`` agents.

    Args:
        agent_type: Agent framework name; ``None`` is treated as "".
        domain: Benchmark domain name; ``None`` is treated as "".

    Returns:
        A fresh copy of the matching deny list (safe for the caller to
        mutate), or ``None`` when no defaults apply.
    """
    domain_key = domain.lower() if domain else ""
    if domain_key != "os-filesystem":
        return None

    defaults_by_agent = {
        "claudesdk": OS_FILESYSTEM_CLAUDE_SDK_DISALLOWED_TOOLS,
        "openclaw": OS_FILESYSTEM_OPENCLAW_DISALLOWED_TOOLS,
    }
    agent_key = agent_type.lower() if agent_type else ""
    defaults = defaults_by_agent.get(agent_key)
    # Copy so callers can never modify the module-level tables.
    return None if defaults is None else list(defaults)
44
+
45
# Core configuration types; filled in by _ensure_imports() on first use.
_AgentConfig = None
_RuntimeConfig = None
_TaskConfig = None
_AttackConfig = None

# Agent implementations; each one stays None while its import fails.
_OpenAISDKAgent = None
_LangChainAgent = None
_ClaudeSDKAgent = None
_GoogleADKAgent = None
_MCPReactAgent = None
_OpenClawAgent = None

# Native agent wrappers (lazy imports)
_OpenAISDKNativeWrapper = None
_MCPReactAgentNativeWrapper = None

# Labels of optional components whose import failure was already printed,
# so repeated _ensure_imports() calls do not repeat the same warning.
_REPORTED_IMPORT_FAILURES = set()


def _warn_unavailable_once(label, error):
    """Print the "[WARNING] <label> not available" line at most once per label."""
    if label in _REPORTED_IMPORT_FAILURES:
        return
    _REPORTED_IMPORT_FAILURES.add(label)
    print(f"[WARNING] {label} not available: {error}")


def _ensure_imports():
    """Lazy import of agent-related types.

    The core config types are imported without a guard, so an ImportError
    there propagates to the caller. Every agent class and native wrapper is
    optional: a failed import leaves its module global as ``None`` and is
    attempted again on the next call, but the warning for it is printed only
    the first time.
    """
    global _AgentConfig, _RuntimeConfig, _TaskConfig, _AttackConfig
    global _OpenAISDKAgent, _LangChainAgent, _ClaudeSDKAgent, _GoogleADKAgent, _MCPReactAgent, _OpenClawAgent
    global _OpenAISDKNativeWrapper, _MCPReactAgentNativeWrapper

    if _AgentConfig is None:
        from dt_arena.src.types.agent import AgentConfig, RuntimeConfig
        from dt_arena.src.types.task import AttackConfig, TaskConfig
        _AgentConfig = AgentConfig
        _RuntimeConfig = RuntimeConfig
        _TaskConfig = TaskConfig
        _AttackConfig = AttackConfig

    # Import OpenAI SDK agent
    if _OpenAISDKAgent is None:
        try:
            from agent.openaisdk import OpenAISDKAgent
            _OpenAISDKAgent = OpenAISDKAgent
        except ImportError as e:
            _warn_unavailable_once("OpenAI SDK agent", e)

    # Import LangChain agent
    if _LangChainAgent is None:
        try:
            from agent.langchain import LangChainAgent
            _LangChainAgent = LangChainAgent
        except ImportError as e:
            _warn_unavailable_once("LangChain agent", e)

    # Import Claude SDK agent
    if _ClaudeSDKAgent is None:
        try:
            from agent.claudesdk import ClaudeSDKAgent
            _ClaudeSDKAgent = ClaudeSDKAgent
        except ImportError as e:
            _warn_unavailable_once("Claude SDK agent", e)

    # Import Google ADK agent
    if _GoogleADKAgent is None:
        try:
            from agent.googleadk import GoogleADKAgent
            _GoogleADKAgent = GoogleADKAgent
        except ImportError as e:
            _warn_unavailable_once("Google ADK agent", e)

    # Import PocketFlow agent
    if _MCPReactAgent is None:
        try:
            from agent.pocketflow import MCPReactAgent
            _MCPReactAgent = MCPReactAgent
        except ImportError as e:
            _warn_unavailable_once("PocketFlow agent", e)

    # Import OpenClaw agent
    if _OpenClawAgent is None:
        try:
            from agent.openclaw import OpenClawAgent
            _OpenClawAgent = OpenClawAgent
        except ImportError as e:
            _warn_unavailable_once("OpenClaw agent", e)

    # Import native agent wrappers
    if _OpenAISDKNativeWrapper is None:
        try:
            from agent.openaisdk.src.agent_wrapper import OpenAISDKNativeWrapper
            _OpenAISDKNativeWrapper = OpenAISDKNativeWrapper
        except ImportError as e:
            _warn_unavailable_once("OpenAI SDK native wrapper", e)

    if _MCPReactAgentNativeWrapper is None:
        try:
            from agent.pocketflow.src.agent_wrapper import MCPReactAgentNativeWrapper
            _MCPReactAgentNativeWrapper = MCPReactAgentNativeWrapper
        except ImportError as e:
            _warn_unavailable_once("PocketFlow native wrapper", e)
137
+
138
+
139
# Agent registry - lazily populated by _get_agent_registry().
AGENT_REGISTRY: Dict[str, Type] = {}


def _get_agent_registry() -> Dict[str, Type]:
    """Get the agent registry with lazy imports.

    Runs the lazy imports, then collects every agent class that imported
    successfully under its framework key. The discovered entries are also
    mirrored into the module-level ``AGENT_REGISTRY`` so that it is actually
    populated, as its comment promises.

    Returns:
        A new dict mapping framework name to agent class, containing only
        the frameworks whose import succeeded.
    """
    _ensure_imports()

    # Key order matches the order in which agents were originally registered.
    candidates = (
        ("openaisdk", _OpenAISDKAgent),
        ("langchain", _LangChainAgent),
        ("claudesdk", _ClaudeSDKAgent),
        ("googleadk", _GoogleADKAgent),
        ("pocketflow", _MCPReactAgent),
        ("openclaw", _OpenClawAgent),
    )
    registry = {name: agent_cls for name, agent_cls in candidates if agent_cls is not None}

    # update() rather than rebinding: modules that imported AGENT_REGISTRY
    # by name keep seeing the same dict object.
    AGENT_REGISTRY.update(registry)
    return registry
162
+
163
+
164
def build_agent(
    agent_type: Optional[str] = None,
    agent_cfg: Optional[Any] = None,  # AgentConfig
    runtime_cfg: Optional[Any] = None,  # RuntimeConfig
    *,
    native_agent: Optional[Any] = None,  # Pre-built native SDK agent
) -> Any:
    """Create an agent, either from a registered type or around a native agent.

    Two modes are supported:

    * ``native_agent`` given: the object is handed to ``_wrap_native_agent``
      and ``agent_type`` is ignored.
    * otherwise: ``agent_type`` is lower-cased, looked up in the agent
      registry and the matching class is instantiated.

    Args:
        agent_type: Registry key of the agent to build (e.g. "openaisdk",
            "langchain"). Required when ``native_agent`` is not given.
        agent_cfg: AgentConfig instance, passed through unchanged.
        runtime_cfg: RuntimeConfig instance; a default ``RuntimeConfig()``
            is created when omitted.
        native_agent: Pre-built native SDK agent to wrap for evaluation.

    Returns:
        The constructed agent (or native-agent wrapper) instance.

    Raises:
        ValueError: If neither ``agent_type`` nor ``native_agent`` is given,
            if ``agent_type`` is not in the registry, or if the native agent
            cannot be wrapped.
        NotImplementedError: If the native agent's framework is detected but
            has no wrapper yet.
    """
    _ensure_imports()

    effective_runtime = _RuntimeConfig() if runtime_cfg is None else runtime_cfg

    # Wrapping a pre-built agent takes precedence over building by type.
    if native_agent is not None:
        return _wrap_native_agent(
            native_agent=native_agent,
            agent_cfg=agent_cfg,
            runtime_cfg=effective_runtime,
        )

    if agent_type is None:
        raise ValueError(
            "Either 'agent_type' or 'native_agent' must be provided. "
            "Use agent_type to build from scratch, or native_agent to wrap "
            "a pre-built agent."
        )

    available = _get_agent_registry()
    requested = agent_type.lower()
    agent_cls = available.get(requested)
    if agent_cls is None:
        raise ValueError(f"Unknown agent type '{requested}'. Available: {list(available)}")

    return agent_cls(agent_config=agent_cfg, runtime_config=effective_runtime)
225
+
226
+
227
def _wrap_native_agent(
    native_agent: Any,
    agent_cfg: Optional[Any],
    runtime_cfg: Any,
) -> Any:
    """
    Wrap a pre-built native SDK agent so it can be evaluated.

    The framework is determined with ``detect_native_framework``; wrappers
    currently exist for "openaisdk" and "pocketflow" only.

    Args:
        native_agent: Pre-built agent from a native SDK
        agent_cfg: AgentConfig forwarded to the wrapper as ``agent_config``
        runtime_cfg: RuntimeConfig forwarded to the wrapper as ``runtime_config``

    Returns:
        Wrapped agent instance

    Raises:
        ValueError: If the framework cannot be detected, is unknown, or its
            wrapper class failed to import
        NotImplementedError: For "langchain", "claudesdk" and "googleadk",
            which are detected but have no wrapper yet
    """
    from utils.agent_wrapper import detect_native_framework

    detected = detect_native_framework(native_agent)

    if detected is None:
        raise ValueError(
            f"Could not detect framework for agent of type {type(native_agent).__name__}. "
            f"Supported native agent types: OpenAI SDK Agent, LangChain Agent, "
            f"Claude SDK Client, Google ADK Agent, PocketFlow NativeMCPReactAgent."
        )

    if detected == "openaisdk":
        wrapper_cls = _OpenAISDKNativeWrapper
        unavailable_msg = (
            "OpenAI SDK native wrapper is not available. "
            "Please ensure 'agents' package is installed."
        )
    elif detected in ("langchain", "claudesdk", "googleadk"):
        # TODO: Implement LangChainNativeWrapper, ClaudeSDKNativeWrapper
        # and GoogleADKNativeWrapper
        raise NotImplementedError(
            f"Native agent wrapper for '{detected}' is not yet implemented. "
            f"Currently supported: openaisdk, pocketflow"
        )
    elif detected == "pocketflow":
        wrapper_cls = _MCPReactAgentNativeWrapper
        unavailable_msg = (
            "PocketFlow native wrapper is not available. "
            "Please ensure 'pocketflow' package is installed."
        )
    else:
        raise ValueError(f"No wrapper available for framework '{detected}'.")

    # The wrapper global is None when its lazy import did not succeed.
    if wrapper_cls is None:
        raise ValueError(unavailable_msg)

    return wrapper_cls(
        native_agent=native_agent,
        agent_config=agent_cfg,
        runtime_config=runtime_cfg,
    )
303
+
304
+
305
def get_agent_choices() -> List[str]:
    """Get sorted list of available agent types.

    Returns:
        The registry keys of every agent whose import succeeded, in
        ascending order.
    """
    # _get_agent_registry() already runs the lazy imports, so no separate
    # _ensure_imports() call is needed here.
    return sorted(_get_agent_registry())
utils/agent_wrapper.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Native Agent Detection Utilities
3
+
4
+ This module provides helper functions for detecting the framework of
5
+ pre-built native agents. The actual wrapper classes are located in
6
+ their respective agent folders (e.g., agent/openaisdk/src/agent_wrapper.py).
7
+
8
+ Usage:
9
+ from utils.agent_wrapper import detect_native_framework
10
+
11
+ native_agent = ... # User's pre-built agent
12
+ framework = detect_native_framework(native_agent)
13
+ # Returns: "openaisdk", "langchain", "claudesdk", "googleadk", "pocketflow", or None
14
+ """
15
+
16
+ from typing import Any, Optional
17
+
18
+
19
def is_openai_sdk_native_agent(agent: Any) -> bool:
    """
    Report whether ``agent`` looks like a native OpenAI Agents SDK agent.

    The check is done in two stages: an ``isinstance`` test against
    ``agents.Agent`` when the 'agents' package can be imported, then a
    duck-type test on the attribute set such agents expose.

    Args:
        agent: Object to check

    Returns:
        True if this is an OpenAI Agents SDK agent
    """
    # Stage 1: real type check, only possible when the 'agents' package exists.
    try:
        from agents import Agent as OpenAIAgent
    except ImportError:
        OpenAIAgent = None

    if OpenAIAgent is not None and isinstance(agent, OpenAIAgent):
        return True

    # Stage 2: duck typing on the attributes an OpenAI SDK agent carries.
    expected_attrs = (
        'name',
        'instructions',
        'model',
        'mcp_servers',
        'model_settings',
        'clone',
    )
    return all(hasattr(agent, attr) for attr in expected_attrs)
46
+
47
+
48
def is_langchain_native_agent(agent: Any) -> bool:
    """
    Detect if agent is a native LangChain agent/graph.

    Known graph/executor classes are resolved one at a time, so a class that
    is missing from the installed LangGraph/LangChain version does not
    disable the type check for the classes that are present.

    Args:
        agent: Object to check

    Returns:
        True if this is a LangChain agent
    """
    import importlib

    known_types = []

    # LangGraph compiled graphs (from create_agent). Each name is looked up
    # separately: importing both in one statement fails as a whole when
    # either name is absent from the installed version.
    try:
        state_module = importlib.import_module("langgraph.graph.state")
    except ImportError:
        state_module = None
    for class_name in ("CompiledGraph", "CompiledStateGraph"):
        candidate = getattr(state_module, class_name, None)
        if isinstance(candidate, type):
            known_types.append(candidate)

    # Legacy AgentExecutor
    try:
        from langchain.agents import AgentExecutor
    except ImportError:
        pass
    else:
        known_types.append(AgentExecutor)

    if known_types and isinstance(agent, tuple(known_types)):
        return True

    # Duck-type check: runnable-style interface plus a graph-like attribute.
    return (
        hasattr(agent, 'invoke') and
        hasattr(agent, 'ainvoke') and
        (hasattr(agent, 'get_graph') or hasattr(agent, 'nodes'))
    )
80
+
81
+
82
def is_claude_sdk_native_agent(agent: Any) -> bool:
    """
    Detect if agent is a native Claude SDK agent.

    The client class is looked up under the names the SDK actually exports
    (``ClaudeSDKClient`` in ``claude_agent_sdk`` / ``claude_code_sdk``); the
    previously used ``Client`` name is kept as a fallback. If no class can
    be resolved, a duck-type check is used instead.

    Args:
        agent: Object to check

    Returns:
        True if this is a Claude SDK agent
    """
    import importlib

    client_types = []
    for module_name in ("claude_agent_sdk", "claude_code_sdk"):
        try:
            sdk_module = importlib.import_module(module_name)
        except ImportError:
            # SDK flavour not installed; try the next one.
            continue
        for class_name in ("ClaudeSDKClient", "Client"):
            candidate = getattr(sdk_module, class_name, None)
            if isinstance(candidate, type):
                client_types.append(candidate)

    if client_types and isinstance(agent, tuple(client_types)):
        return True

    # Duck-type check
    return (
        hasattr(agent, 'query') and
        hasattr(agent, 'receive_response') and
        hasattr(agent, '_model')
    )
105
+
106
+
107
def is_google_adk_native_agent(agent: Any) -> bool:
    """
    Report whether ``agent`` looks like a native Google ADK agent.

    Args:
        agent: Object to check

    Returns:
        True if this is a Google ADK agent
    """
    # Real type check, available only when google.adk can be imported.
    try:
        from google.adk.agents import Agent as GoogleAgent, LlmAgent
    except ImportError:
        adk_types = ()
    else:
        adk_types = (GoogleAgent, LlmAgent)

    if adk_types and isinstance(agent, adk_types):
        return True

    # Duck typing: the expected attributes must exist ...
    for attr in ('model', 'instruction', 'tools'):
        if not hasattr(agent, attr):
            return False

    # ... and the object's class must be defined under the google.adk namespace.
    agent_cls = type(agent)
    if not hasattr(agent_cls, '__module__'):
        return False
    return agent_cls.__module__.startswith('google.adk')
132
+
133
+
134
def is_pocketflow_native_agent(agent: Any) -> bool:
    """
    Report whether ``agent`` is a native PocketFlow agent (NativeMCPReactAgent).

    Args:
        agent: Object to check

    Returns:
        True if this is a PocketFlow NativeMCPReactAgent
    """
    # Real type check against NativeMCPReactAgent when it is importable.
    try:
        from agent.pocketflow import NativeMCPReactAgent
    except ImportError:
        NativeMCPReactAgent = None

    if NativeMCPReactAgent is not None and isinstance(agent, NativeMCPReactAgent):
        return True

    # Duck typing: each attribute must exist and hold the expected container type.
    expected = (
        ('mcp_servers', dict),
        ('tool_to_server', dict),
        ('all_tools', list),
    )
    if not all(hasattr(agent, attr) for attr, _ in expected):
        return False
    return all(
        isinstance(getattr(agent, attr, None), kind) for attr, kind in expected
    )
161
+
162
+
163
def detect_native_framework(agent: Any) -> Optional[str]:
    """
    Work out which framework a native agent belongs to.

    The per-framework detectors are tried in a fixed order and the first
    one that accepts the agent wins.

    Args:
        agent: A pre-built native SDK agent instance

    Returns:
        Framework name ("openaisdk", "langchain", "claudesdk", "googleadk", "pocketflow")
        or None if not detected
    """
    # Insertion order of the dict is the probing order.
    checks = {
        "openaisdk": is_openai_sdk_native_agent,
        "langchain": is_langchain_native_agent,
        "claudesdk": is_claude_sdk_native_agent,
        "googleadk": is_google_adk_native_agent,
        "pocketflow": is_pocketflow_native_agent,
    }

    for name, check in checks.items():
        try:
            matched = bool(check(agent))
        except Exception:
            # Best-effort detection: a failing detector counts as "no match".
            continue
        if matched:
            return name

    return None
utils/compose_utils.py ADDED
@@ -0,0 +1,135 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+ from typing import List, Optional, Union
5
+
6
+
7
def get_project_name(env_name: str) -> Optional[str]:
    """
    Look up the Docker compose project name for an environment.

    Two environment variables are consulted, in order:

    1. ``<ENV_NAME>_PROJECT_NAME`` (upper-cased, dashes turned into
       underscores), e.g. GMAIL_PROJECT_NAME, used in pooled mode.
    2. ``COMPOSE_PROJECT_NAME``, to which ``-<env_name>`` is appended.

    Args:
        env_name: Environment name (gmail, slack, atlassian, etc.)

    Returns:
        Project name if found, None otherwise
    """
    # Pool-provided, per-environment project name takes precedence.
    pooled_key = env_name.upper().replace('-', '_') + "_PROJECT_NAME"
    pooled_name = os.getenv(pooled_key, "")
    if pooled_name:
        return pooled_name

    # Otherwise derive a name from the base compose project, if one is set.
    base_name = os.getenv("COMPOSE_PROJECT_NAME", "")
    return f"{base_name}-{env_name}" if base_name else None
32
+
33
+
34
def get_project_args(env_name: str) -> List[str]:
    """
    Build the ``-p <project>`` arguments for docker compose.

    Args:
        env_name: Environment name

    Returns:
        List like ["-p", "project_name"] or empty list
    """
    name = get_project_name(env_name)
    return ["-p", name] if name else []
48
+
49
+
50
def run_compose(
    env_name: str,
    compose_file: Union[str, Path],
    args: List[str],
    cwd: Optional[Union[str, Path]] = None,
    check: bool = True,
    verbose: bool = True,
) -> subprocess.CompletedProcess:
    """
    Invoke ``docker compose`` with the project name resolved for ``env_name``.

    Args:
        env_name: Environment name (gmail, slack, etc.)
        compose_file: Path to docker-compose.yml
        args: Additional arguments for docker compose
        cwd: Working directory
        check: Raise exception on non-zero exit
        verbose: Print command before running

    Returns:
        CompletedProcess instance
    """
    # Layout: docker compose [-p <project>] -f <file> <args...>
    base = ["docker", "compose"] + get_project_args(env_name)
    command_line = base + ["-f", str(compose_file)] + args

    if verbose:
        print("$ " + " ".join(command_line))

    return subprocess.run(command_line, cwd=cwd, check=check)
79
+
80
+
81
def run_compose_exec(
    env_name: str,
    compose_file: Union[str, Path],
    service: str,
    command: List[str],
    cwd: Optional[Union[str, Path]] = None,
    check: bool = True,
    verbose: bool = True,
) -> subprocess.CompletedProcess:
    """
    Execute a command inside a docker compose service.

    Delegates to ``run_compose`` with ``exec -T <service> <command...>``.

    Args:
        env_name: Environment name (gmail, slack, etc.)
        compose_file: Path to docker-compose.yml
        service: Service name to exec into
        command: Command to run
        cwd: Working directory
        check: Raise exception on non-zero exit
        verbose: Print command before running

    Returns:
        CompletedProcess instance
    """
    exec_args = ["exec", "-T", service] + command
    return run_compose(
        env_name,
        compose_file,
        exec_args,
        cwd=cwd,
        check=check,
        verbose=verbose,
    )
107
+
108
+
109
def run_compose_cp(
    env_name: str,
    compose_file: Union[str, Path],
    src: str,
    dst: str,
    cwd: Optional[Union[str, Path]] = None,
    check: bool = True,
    verbose: bool = True,
) -> subprocess.CompletedProcess:
    """
    Copy files to or from a docker compose service.

    Delegates to ``run_compose`` with ``cp <src> <dst>``.

    Args:
        env_name: Environment name (gmail, slack, etc.)
        compose_file: Path to docker-compose.yml
        src: Source path (local or service:path)
        dst: Destination path (local or service:path)
        cwd: Working directory
        check: Raise exception on non-zero exit
        verbose: Print command before running

    Returns:
        CompletedProcess instance
    """
    copy_args = ["cp", src, dst]
    return run_compose(
        env_name,
        compose_file,
        copy_args,
        cwd=cwd,
        check=check,
        verbose=verbose,
    )
135
+
utils/config.py ADDED
@@ -0,0 +1,77 @@
1
+ import importlib.resources
2
+ import os
3
+ import sys
4
+ from pathlib import Path
5
+
6
+
7
+ def _find_project_root() -> Path:
8
+ # 1. Explicit override — set DTAP_ROOT when running from Docker / CI / installed wheel.
9
+ if env_root := os.environ.get("DTAP_ROOT"):
10
+ root = Path(env_root).resolve()
11
+ if str(root) not in sys.path:
12
+ sys.path.insert(0, str(root))
13
+ return root
14
+
15
+ # 2. Installed package: locate via the bundled dt_arena/config/env.yaml.
16
+ # importlib.resources works whether the package is pip-installed or editable.
17
+ try:
18
+ ref = importlib.resources.files("dt_arena") / "config" / "env.yaml"
19
+ with importlib.resources.as_file(ref) as p:
20
+ # p = <prefix>/dt_arena/config/env.yaml → parents[2] = <prefix>
21
+ root = p.parents[2]
22
+ if str(root) not in sys.path:
23
+ sys.path.insert(0, str(root))
24
+ return root
25
+ except Exception:
26
+ pass
27
+
28
+ # 3. Source checkout: walk up until we find pyproject.toml.
29
+ for parent in Path(__file__).resolve().parents:
30
+ if (parent / "pyproject.toml").exists():
31
+ if str(parent) not in sys.path:
32
+ sys.path.insert(0, str(parent))
33
+ return parent
34
+
35
+ # 4. Last resort.
36
+ guess = Path(__file__).resolve().parents[1]
37
+ if str(guess) not in sys.path:
38
+ sys.path.insert(0, str(guess))
39
+ return guess
40
+
41
+
42
PROJECT_ROOT = _find_project_root()

# Configuration file paths (bundled as package-data in the wheel)
_CONFIG_DIR = PROJECT_ROOT / "dt_arena" / "config"
ENV_CONFIG_PATH = _CONFIG_DIR / "env.yaml"
MCP_CONFIG_PATH = _CONFIG_DIR / "mcp.yaml"
INJECTION_MCP_CONFIG_PATH = _CONFIG_DIR / "injection_mcp.yaml"

# Bundled benchmark task lists (benchmark/<domain>/{benign,direct,indirect}.jsonl)
BENCHMARK_ROOT = PROJECT_ROOT / "benchmark"

# Evaluation script path
TASK_RUNNER_PATH = PROJECT_ROOT / "eval" / "task_runner.py"
54
+
55
+
56
def resolve_benchmark_task_list(
    domain: str | None = None,
    task_type: str | None = None,
    threat_model: str | None = None,
) -> Path:
    """Map the given filters to a bundled benchmark directory or JSONL file.

    - No domain           → BENCHMARK_ROOT
    - domain only         → BENCHMARK_ROOT/<domain>/
    - task_type="benign"  → BENCHMARK_ROOT/<domain>/benign.jsonl
    - threat_model set    → BENCHMARK_ROOT/<domain>/<direct|indirect>.jsonl

    ``task_type == "benign"`` is checked before ``threat_model``, so it wins
    when both are supplied. Any other combination falls back to the domain
    directory.
    """
    if not domain:
        return BENCHMARK_ROOT

    domain_dir = BENCHMARK_ROOT / domain

    if task_type == "benign":
        filename = "benign.jsonl"
    elif threat_model in ("direct", "indirect"):
        filename = f"{threat_model}.jsonl"
    else:
        # No recognised file-level filter: return the whole domain directory.
        return domain_dir

    return domain_dir / filename
77
+