decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,270 @@
1
+ import asyncio
2
+ import base64
3
+ from typing import Optional, List, Dict, Any
4
+ from contextlib import AsyncExitStack
5
+ from pathlib import Path
6
+
7
+ from mcp import ClientSession, StdioServerParameters
8
+ from mcp.client.stdio import stdio_client
9
+
10
+ from anthropic import Anthropic
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv() # load environment variables from .env
14
+
15
+
16
class MCPClient:
    """Interactive test client that lets Claude call tools exposed by an MCP server.

    The client launches the server script as a stdio subprocess, lists its
    tools, and forwards them to the Anthropic Messages API. Tool calls that
    Claude requests are executed against the MCP session and their results
    are fed back until Claude stops asking for tools.
    """

    # Model and response budget used for every Messages API request.
    MODEL = "claude-sonnet-4-5"
    MAX_TOKENS = 1000

    def __init__(self):
        # Initialize session and client objects
        self.session: Optional[ClientSession] = None
        self.exit_stack = AsyncExitStack()
        self.anthropic = Anthropic()

    async def connect_to_server(self, server_script_path: str, extra_args: Optional[List[str]] = None):
        """Connect to an MCP server

        Args:
            server_script_path: Path to the server script (.py or .js)
            extra_args: Optional additional arguments to pass to the server

        Raises:
            ValueError: If the script is neither a .py nor a .js file
        """
        is_python = server_script_path.endswith(".py")
        is_js = server_script_path.endswith(".js")
        if not (is_python or is_js):
            raise ValueError("Server script must be a .py or .js file")

        command = "python" if is_python else "node"
        args = [server_script_path]
        if extra_args:
            args.extend(extra_args)

        server_params = StdioServerParameters(command=command, args=args, env=None)

        # Both context managers are registered on the exit stack so that
        # cleanup() tears them down in reverse order.
        stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
        self.stdio, self.write = stdio_transport
        self.session = await self.exit_stack.enter_async_context(ClientSession(self.stdio, self.write))

        await self.session.initialize()

        # List available tools
        response = await self.session.list_tools()
        tools = response.tools
        print("\nConnected to server with tools:", [tool.name for tool in tools])

    def encode_image(self, image_path: str) -> Dict[str, Any]:
        """Encode an image file to base64 and determine its media type

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with image data and media type for Anthropic API

        Raises:
            FileNotFoundError: If the image file does not exist
            ValueError: If the file extension is not a supported image format
        """
        path = Path(image_path)
        if not path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # Determine media type based on file extension
        extension = path.suffix.lower()
        media_type_map = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
        }

        media_type = media_type_map.get(extension)
        if not media_type:
            raise ValueError(f"Unsupported image format: {extension}. Supported formats: {list(media_type_map.keys())}")

        # Read and encode the image
        with open(path, "rb") as image_file:
            image_data = base64.standard_b64encode(image_file.read()).decode("utf-8")

        return {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}}

    def convert_mcp_content_to_anthropic(self, content: Any) -> Any:
        """Convert MCP tool result content to Anthropic API format

        Args:
            content: Content from MCP tool result

        Returns:
            Converted content in Anthropic API format. Image items become
            base64 image blocks; every other item (text or unknown) and any
            non-list content is passed through unchanged.
        """
        if not isinstance(content, list):
            return content

        converted = []
        for item in content:
            # getattr keeps items without a `type` attribute on the
            # pass-through path instead of raising AttributeError.
            if getattr(item, "type", None) == "image":
                # Convert MCP image format to Anthropic format
                converted.append(
                    {"type": "image", "source": {"type": "base64", "media_type": item.mimeType, "data": item.data}}
                )
            else:
                converted.append(item)
        return converted

    def _create_message(self, messages: List[Dict[str, Any]], tools: List[Dict[str, Any]]) -> Any:
        """Send the conversation and tool definitions to Claude and return the response."""
        return self.anthropic.messages.create(
            model=self.MODEL,
            max_tokens=self.MAX_TOKENS,
            messages=messages,
            tools=tools,
        )

    async def process_query(self, query: str, image_paths: Optional[List[str]] = None) -> str:
        """Process a query using Claude and available tools

        Args:
            query: Text query to process
            image_paths: Optional list of image file paths to include in the query

        Returns:
            All text blocks Claude produced across the tool-use loop, joined by newlines

        Raises:
            RuntimeError: If connect_to_server() has not established a session yet
        """
        if self.session is None:
            raise RuntimeError("Not connected to an MCP server; call connect_to_server() first")

        # Build message content with text and images
        content: List[Dict[str, Any]] = []

        # Add images first if provided; a failed image is reported and skipped
        if image_paths:
            for image_path in image_paths:
                try:
                    content.append(self.encode_image(image_path))
                    print(f"Loaded image: {image_path}")
                except Exception as e:
                    print(f"Warning: Failed to load image {image_path}: {e}")

        # Add text query
        content.append({"type": "text", "text": query})

        messages = [{"role": "user", "content": content}]

        tool_listing = await self.session.list_tools()
        available_tools = [
            {
                "name": tool.name,
                "description": tool.description,
                "input_schema": tool.inputSchema,
            }
            for tool in tool_listing.tools
        ]

        # Initial Claude API call
        response = self._create_message(messages, available_tools)

        final_text = []

        # Agentic loop: continue while there are tool calls
        while response.stop_reason == "tool_use":
            assistant_content = []
            tool_results = []

            for content_block in response.content:
                if content_block.type == "text":
                    final_text.append(content_block.text)
                    assistant_content.append({"type": "text", "text": content_block.text})
                elif content_block.type == "tool_use":
                    tool_name = content_block.name
                    tool_args = content_block.input
                    tool_use_id = content_block.id

                    assistant_content.append(
                        {"type": "tool_use", "id": tool_use_id, "name": tool_name, "input": tool_args}
                    )

                    # Execute tool call
                    print(f"[Calling tool {tool_name} with args {tool_args}]")
                    result = await self.session.call_tool(tool_name, tool_args)

                    # Convert MCP content format to Anthropic format
                    converted_content = self.convert_mcp_content_to_anthropic(result.content)

                    tool_results.append(
                        {"type": "tool_result", "tool_use_id": tool_use_id, "content": converted_content}
                    )

            # Replay the assistant turn (text + tool_use), then answer it
            # with a user turn carrying the tool results.
            messages.append({"role": "assistant", "content": assistant_content})
            messages.append({"role": "user", "content": tool_results})

            # Get next response from Claude
            response = self._create_message(messages, available_tools)

        # Process final response (no more tool calls)
        for content_block in response.content:
            if content_block.type == "text":
                final_text.append(content_block.text)

        return "\n".join(final_text)

    async def chat_loop(self):
        """Run an interactive chat loop

        Reads queries from stdin until the user types 'quit' or stdin is
        closed. Errors raised while handling a query are printed and the
        loop continues.
        """
        print("\nMCP Client Started!")
        print("Type your queries or 'quit' to exit.")
        print("To include images, use: your query --image path/to/image.jpg --image path/to/another.png")

        while True:
            try:
                query = input("\nQuery: ").strip()
                if query.lower() == "quit":
                    break

                # Parse image paths from query
                image_paths = []
                parts = query.split("--image")

                if len(parts) > 1:
                    # First part is the actual query
                    query = parts[0].strip()

                    # Remaining parts are image paths; only the first
                    # whitespace-delimited token after each flag is used.
                    for part in parts[1:]:
                        tokens = part.split()
                        if tokens:
                            image_paths.append(tokens[0])

                response = await self.process_query(query, image_paths if image_paths else None)
                print("\n" + response)

            except EOFError:
                # stdin is closed: input() would raise again on every
                # iteration, so exit instead of looping forever.
                break
            except Exception as e:
                print(f"\nError: {e}")

    async def cleanup(self):
        """Clean up resources"""
        await self.exit_stack.aclose()
249
+
250
+
251
async def main():
    """Command-line entry point: connect to the given MCP server and chat.

    Reads the server script path (and any extra server arguments) from
    sys.argv, runs the interactive chat loop, and always releases the
    client's resources afterwards. Exits with status 1 when no server
    script is given.
    """
    # Imported locally so main() also works when this module is imported;
    # the file otherwise only imports `sys` under the __main__ guard.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python client_test.py <path_to_server_script> [additional_args...]")
        sys.exit(1)

    server_script = sys.argv[1]
    # An empty slice means no extra arguments were supplied.
    extra_args = sys.argv[2:] or None

    client = MCPClient()
    try:
        await client.connect_to_server(server_script, extra_args)
        await client.chat_loop()
    finally:
        await client.cleanup()
265
+
266
+
267
if __name__ == "__main__":
    # Script entry point: import `sys` at module scope (main() uses
    # sys.argv and sys.exit) and run the async main() to completion.
    import sys

    asyncio.run(main())
@@ -0,0 +1,285 @@
1
+ import base64
2
+ import os
3
+ from textwrap import dedent
4
+ from typing import Literal, Optional
5
+
6
+ import click
7
+ import requests
8
+ from fastmcp import FastMCP
9
+ from fastmcp.utilities.types import Image
10
+
11
+
12
# Base URL of the FastAPI backend; taken from MACOS_API_URL when set.
# main() may overwrite this at startup from the --api-url option.
API_BASE_URL = os.getenv("MACOS_API_URL", "http://localhost:8005")

# Server-level instructions handed to FastMCP.
instructions = dedent(
    """
    macOS MCP client provides tools to interact with macOS desktop through a FastAPI backend service.
    All operations are executed via HTTP requests to the backend server.
    """
)

# The MCP server instance on which every tool below is registered.
mcp = FastMCP(name="macos-mcp-client", instructions=instructions)
20
+
21
+
22
def make_api_call(endpoint: str, data: Optional[dict] = None, timeout: Optional[float] = None) -> dict:
    """POST a JSON payload to the FastAPI backend and return the decoded reply.

    Args:
        endpoint: Path appended to ``API_BASE_URL`` (e.g. ``"/shell"``).
        data: JSON body to send; an empty object is sent when omitted.
        timeout: Optional timeout in seconds passed to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The decoded JSON response. On any transport error, HTTP error status
        or undecodable body, a dict of the form
        ``{"status": "error", "result": "API call failed: ..."}`` is returned
        instead of raising.
    """
    # API_BASE_URL is read at call time because main() may override it.
    url = f"{API_BASE_URL}{endpoint}"
    try:
        response = requests.post(url, json=data or {}, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"status": "error", "result": f"API call failed: {str(e)}"}
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) when the body is not valid JSON.
        return {"status": "error", "result": f"API call failed: invalid JSON response: {str(e)}"}
30
+
31
+
32
+ def _extract_text(result: dict) -> str:
33
+ """Extract text from API response (handles both list and string formats)."""
34
+ data = result.get("result", result)
35
+ if isinstance(data, list) and len(data) > 0:
36
+ return data[0].get("text", str(data))
37
+ return str(data)
38
+
39
+
40
@mcp.tool(
    name="launch",
    description='Launch an application on macOS by name, path, or bundle ID (e.g., "Terminal", "Safari", "Finder").',
)
def launch_tool(name: str) -> str:
    # Ask the backend to open the application, then append a reminder that
    # the launch is asynchronous and should be verified with a screenshot.
    readiness_note = "\nNote: The application launch has been triggered but may not be fully loaded yet. Use screenshot to verify the app is ready before interacting with it."
    api_response = make_api_call("/remote_macos_open_application", {"identifier": name})
    return _extract_text(api_response) + readiness_note
51
+
52
+
53
@mcp.tool(
    name="shell",
    description="Execute shell commands on the macOS VM via SSH and return the output.",
)
def shell_tool(command: str) -> str:
    # Send the command to the backend's /shell endpoint and format the reply
    # as "Response: <stdout[+stderr]>" followed by the exit code.
    reply = make_api_call("/shell", {"command": command})

    # Error replies carry their message in "result".
    if reply.get("status") == "error":
        return reply.get("result", str(reply))

    combined = reply.get("stdout", "")
    err_text = reply.get("stderr", "")
    code = reply.get("exit_code", 0)
    if err_text:
        # stderr is appended after stdout under its own label.
        combined += f"\nSTDERR: {err_text}"
    return f"Response: {combined}\nStatus Code: {code}"
68
+
69
+
70
@mcp.tool(
    name="screenshot",
    description="Capture a screenshot of the macOS desktop. Returns visual screenshot image and text status.",
)
def state_tool(use_vision: bool = False):
    # Fetch the current screen from the backend and convert the reply into a
    # list of text strings and Image objects.
    # NOTE(review): use_vision is accepted but not used by this function.
    api_result = make_api_call("/remote_macos_get_screen")

    if api_result.get("status") == "error":
        return api_result.get("result", str(api_result))

    payload = api_result.get("result", [])
    if not isinstance(payload, list):
        # Anything that is not a list of content items is returned as text.
        return str(payload)

    parts = []
    for entry in payload:
        if entry.get("type") == "text":
            parts.append(entry.get("text", ""))
        elif entry.get("type") == "image":
            # Image items carry base64 data plus a MIME type; the subtype
            # (text after the last "/") is used as the image format.
            raw_bytes = base64.b64decode(entry.get("data", ""))
            mime = entry.get("mimeType", "image/png")
            fmt = mime.split("/")[-1] if "/" in mime else "png"
            parts.append(Image(data=raw_bytes, format=fmt))
    return parts
94
+
95
+
96
@mcp.tool(
    name="click",
    description="Click on UI elements at specific coordinates. Supports left/right/middle mouse buttons and single/double clicks. Use coordinates from screenshot output.",
)
def click_tool(
    loc: list[int],
    button: Literal["left", "right", "middle"] = "left",
    clicks: int = 1,
) -> str:
    # Translate the button name to the backend's numeric code (unknown names
    # fall back to the left button) and send a single or double click.
    button_code = {"left": 1, "middle": 2, "right": 3}.get(button, 1)

    # Two or more clicks are all sent as one double click.
    endpoint = "/remote_macos_mouse_double_click" if clicks >= 2 else "/remote_macos_mouse_click"

    payload = {"x": loc[0], "y": loc[1], "button": button_code}
    return _extract_text(make_api_call(endpoint, payload))
122
+
123
+
124
@mcp.tool(
    name="key",
    description='Press keys, key combinations, or type text into the currently focused element. Use "+" to combine keys (e.g., "cmd+c" for copy, "cmd+tab" for app switching). Single special keys: "enter", "escape", "tab", "space", "backspace", "delete", arrow keys, function keys. For plain text (e.g., a filename or command), just pass the text string directly.',
)
def key_tool(key: str) -> str:
    # Classify the input as a key combination, a named special key, or plain
    # text, and send it to the backend under the matching payload field.
    if "+" in key:
        # NOTE(review): any input containing "+" is sent as a combination,
        # including plain text such as "1+1" — confirm this is intended.
        payload = {"key_combination": key}
    else:
        named_keys = {
            "enter",
            "return",
            "escape",
            "tab",
            "space",
            "backspace",
            "delete",
            "up",
            "down",
            "left",
            "right",
            "home",
            "end",
            "pageup",
            "pagedown",
        }
        # Function keys f1 through f12.
        named_keys.update(f"f{n}" for n in range(1, 13))

        # Matching is case-insensitive, but the key is forwarded as given.
        field = "special_key" if key.lower() in named_keys else "text"
        payload = {field: key}

    return _extract_text(make_api_call("/remote_macos_send_keys", payload))
170
+
171
+
172
@mcp.tool(
    name="type",
    description="Type text into input fields, text areas, or focused elements. Optionally provide loc coordinates to click and focus a target element first. If loc is omitted, types into the currently focused element. Set clear=True to replace existing text.",
)
def type_tool(text: str, loc: Optional[list[int]] = None, clear: bool = False, press_enter: bool = False) -> str:
    # Optionally focus an element, optionally select its contents, type the
    # text, and optionally press Enter. Returns the backend's reply for the
    # typing step, or the error text of the first step that failed.

    def _failed(reply) -> bool:
        # Same error convention the other tools in this file check for.
        return isinstance(reply, dict) and reply.get("status") == "error"

    # Click to focus the element only if loc is provided.
    if loc is not None:
        focus = make_api_call("/remote_macos_mouse_click", {"x": loc[0], "y": loc[1], "button": 1})
        if _failed(focus):
            # Do not type into whatever happens to be focused instead.
            return _extract_text(focus)

    if clear:
        # Select-all so the typed text replaces the existing content.
        selected = make_api_call("/remote_macos_send_keys", {"key_combination": "cmd+a"})
        if _failed(selected):
            return _extract_text(selected)

    # Type the text.
    result = make_api_call("/remote_macos_send_keys", {"text": text})

    if press_enter and not _failed(result):
        entered = make_api_call("/remote_macos_send_keys", {"special_key": "enter"})
        if _failed(entered):
            return _extract_text(entered)

    return _extract_text(result)
187
+
188
+
189
@mcp.tool(
    name="scroll",
    description="Scroll at specific coordinates. Use wheel_times to control scroll amount.",
)
def scroll_tool(
    loc: Optional[list[int]] = None,
    direction: Literal["up", "down"] = "down",
    wheel_times: int = 1,
) -> str:
    # Send `wheel_times` scroll requests at the given point and return the
    # text of the last reply (or of the first error reply).

    # With no iterations there would be no reply to report; previously this
    # case failed with an unbound-variable error.
    if wheel_times < 1:
        return "No scroll performed: wheel_times must be at least 1."

    # Default point when loc is omitted or empty.
    # NOTE(review): presumably the centre of a 1366x768 screen — TODO confirm.
    x = loc[0] if loc else 683
    y = loc[1] if loc else 384

    # Repeat scroll for wheel_times.
    for _ in range(wheel_times):
        result = make_api_call(
            "/remote_macos_mouse_scroll",
            {
                "x": x,
                "y": y,
                "direction": direction,
            },
        )
        if isinstance(result, dict) and result.get("status") == "error":
            # Stop at the first failure so the error is what gets reported.
            break
    return _extract_text(result)
211
+
212
+
213
@mcp.tool(
    name="drag",
    description="Drag and drop from source to destination coordinates.",
)
def drag_tool(from_loc: list[int], to_loc: list[int]) -> str:
    # Build the start/end coordinate payload and ask the backend to perform
    # the drag-and-drop; the reply text is returned.
    payload = {
        "start_x": from_loc[0],
        "start_y": from_loc[1],
        "end_x": to_loc[0],
        "end_y": to_loc[1],
    }
    reply = make_api_call("/remote_macos_mouse_drag_n_drop", payload)
    return _extract_text(reply)
228
+
229
+
230
@mcp.tool(
    name="move",
    description="Move mouse cursor to specific coordinates without clicking.",
)
def move_tool(to_loc: list[int]) -> str:
    # Forward the target coordinates to the backend's mouse-move endpoint.
    target = {"x": to_loc[0], "y": to_loc[1]}
    reply = make_api_call("/remote_macos_mouse_move", target)
    return _extract_text(reply)
243
+
244
+
245
@mcp.tool(
    name="wait",
    description="Pause execution for specified duration in seconds.",
)
def wait_tool(duration: int) -> str:
    # Block for `duration` seconds, then report how long was waited.
    from time import sleep

    sleep(duration)
    message = f"Waited for {duration} seconds"
    return message
254
+
255
+
256
@click.command()
@click.option(
    "--transport",
    help="The transport layer used by the MCP server.",
    type=click.Choice(["stdio", "sse", "streamable-http"]),
    default="stdio",
)
@click.option(
    "--host",
    help="Host to bind the SSE/Streamable HTTP server.",
    default="localhost",
    type=str,
    show_default=True,
)
@click.option(
    "--port",
    help="Port to bind the SSE/Streamable HTTP server.",
    default=8002,
    type=int,
    show_default=True,
)
@click.option(
    "--api-url",
    help="URL of the FastAPI backend server. If not provided, uses MACOS_API_URL env var.",
    default=None,
    type=str,
)
def main(transport, host, port, api_url):
    """Start the MCP server on the selected transport."""
    global API_BASE_URL
    # An explicit --api-url wins over the value taken from the environment.
    if api_url:
        API_BASE_URL = api_url

    # Host and port only apply to the network transports.
    if transport != "stdio":
        mcp.run(transport=transport, host=host, port=port)
        return
    mcp.run()


if __name__ == "__main__":
    main()