decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374)
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,318 @@
1
+ """
2
+ WhatsApp MCP Server - thin client that proxies to the WhatsApp ENV API.
3
+ Independent environment with its own PostgreSQL + FastAPI backend.
4
+ """
5
+ import os
6
+ import sys
7
+ import asyncio
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from fastmcp import FastMCP
11
+ import httpx
12
+
13
# Base URL of the WhatsApp ENV API backend; override via WHATSAPP_API_URL.
API_URL = os.environ.get("WHATSAPP_API_URL", "http://127.0.0.1:8039")
# Token added as "access_token" to every backend call when non-empty;
# override via WHATSAPP_USER_ACCESS_TOKEN.
USER_ACCESS_TOKEN = os.environ.get("WHATSAPP_USER_ACCESS_TOKEN", "whatsapp_token_tom_dev_123")
15
+
16
+
17
async def _api_call(name: str, arguments: Dict[str, Any]) -> Any:
    """Call the WhatsApp ENV API with retry for cold-start tolerance.

    Posts ``{"name": name, "arguments": ...}`` to ``{API_URL}/tools/call``.
    The caller's ``arguments`` dict is copied before the access token is
    added, so it is never mutated.

    Args:
        name: Name of the backend tool to invoke.
        arguments: Arguments for that tool; a falsy value is treated as ``{}``.

    Returns:
        The ``"result"`` entry of the JSON response (``None`` if absent).

    Raises:
        RuntimeError: Immediately when the backend answers with an error
            status below 500, or after every attempt has failed.
    """
    max_attempts = 10
    last_exc: Optional[Exception] = None
    for attempt in range(1, max_attempts + 1):
        try:
            async with httpx.AsyncClient(timeout=20) as client:
                args = dict(arguments or {})
                if USER_ACCESS_TOKEN:
                    args["access_token"] = USER_ACCESS_TOKEN
                resp = await client.post(f"{API_URL}/tools/call", json={"name": name, "arguments": args})
                resp.raise_for_status()
                data = resp.json()
                return data.get("result")
        except httpx.HTTPStatusError as e:
            # Statuses below 500 are not retried: fail fast and keep the
            # original exception as the explicit cause.
            if e.response.status_code < 500:
                raise RuntimeError(f"ENV API call failed: {e}") from e
            last_exc = e
        except Exception as e:
            # Anything else (connection refused, timeout, bad JSON, ...) is
            # treated as transient and retried.
            last_exc = e
        # Back off linearly (capped at 3 s), but do not sleep after the final
        # attempt: there is nothing left to wait for.
        if attempt < max_attempts:
            await asyncio.sleep(min(0.5 * attempt, 3.0))
    raise RuntimeError(f"ENV API call failed after retries: {last_exc}") from last_exc
39
+
40
+
41
+ mcp = FastMCP("WhatsApp MCP Server")
42
+
43
+
44
+ # ═════════════════════════════════════════════════════════════════════════════
45
+ # Contact Lookup
46
+ # ═════════════════════════════════════════════════════════════════════════════
47
+
48
@mcp.tool()
async def list_contacts(search: Optional[str] = None, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """List contacts from your WhatsApp address book.

    Args:
        search: Search by name, email, phone, or company
        limit: Max results (default 100)
        offset: Pagination offset
    """
    # Arguments are forwarded as-is to the backend "list_contacts" tool.
    payload: Dict[str, Any] = {
        "search": search,
        "limit": limit,
        "offset": offset,
    }
    return await _api_call("list_contacts", payload)
58
+
59
+
60
@mcp.tool()
async def get_contact(contact_id: str) -> Dict[str, Any]:
    """Get full details for a contact.

    Args:
        contact_id: The contact identifier (REQUIRED)
    """
    # Delegates to the backend "get_contact" tool.
    payload: Dict[str, Any] = {"contact_id": contact_id}
    return await _api_call("get_contact", payload)
68
+
69
+
70
@mcp.tool()
async def create_contact(
    first_name: str,
    last_name: Optional[str] = None,
    email: Optional[str] = None,
    phone: Optional[str] = None,
    company: Optional[str] = None,
    job_title: Optional[str] = None,
    address: Optional[str] = None,
    notes: Optional[str] = None,
    favorite: bool = False,
) -> Dict[str, Any]:
    """Create a new contact in WhatsApp.

    Args:
        first_name: First name (REQUIRED)
        last_name: Last name
        email: Email address
        phone: Phone number
        company: Company name
        job_title: Job title
        address: Mailing address
        notes: Free-form notes
        favorite: Mark as favorite
    """
    # Every field is forwarded, including those left at None.
    payload: Dict[str, Any] = {
        "first_name": first_name,
        "last_name": last_name,
        "email": email,
        "phone": phone,
        "company": company,
        "job_title": job_title,
        "address": address,
        "notes": notes,
        "favorite": favorite,
    }
    return await _api_call("create_contact", payload)
95
+
96
+
97
@mcp.tool()
async def search_contacts(query: str, limit: int = 20) -> Dict[str, Any]:
    """Search contacts by name, email, phone, or company.

    Args:
        query: Search term (REQUIRED)
        limit: Max results (default 20)
    """
    # Delegates to the backend "search_contacts" tool.
    payload: Dict[str, Any] = {"query": query, "limit": limit}
    return await _api_call("search_contacts", payload)
106
+
107
+
108
+ # ═════════════════════════════════════════════════════════════════════════════
109
+ # WhatsApp Messaging
110
+ # ═════════════════════════════════════════════════════════════════════════════
111
+
112
@mcp.tool()
async def send_whatsapp_message(
    phone_number: str,
    body: str,
    contact_id: Optional[str] = None,
    media_url: Optional[str] = None,
) -> Dict[str, Any]:
    """Send a WhatsApp message to a phone number.

    Args:
        phone_number: Recipient phone number (REQUIRED)
        body: Message text (REQUIRED)
        contact_id: Associated contact ID
        media_url: URL of photo/video/document attachment
    """
    # The message type is derived from whether a (truthy) media URL was given.
    if media_url:
        message_type = "media"
    else:
        message_type = "text"
    payload: Dict[str, Any] = {
        "phone_number": phone_number,
        "body": body,
        "contact_id": contact_id,
        "media_url": media_url,
        "message_type": message_type,
    }
    return await _api_call("send_message", payload)
128
+
129
+
130
@mcp.tool()
async def list_whatsapp_chats(limit: int = 50, offset: int = 0, unread_only: bool = False) -> Dict[str, Any]:
    """List WhatsApp conversation threads (chats).

    Args:
        limit: Max results
        offset: Pagination offset
        unread_only: Only return chats with unread messages
    """
    # The backend tool is named "list_message_threads".
    payload: Dict[str, Any] = {
        "limit": limit,
        "offset": offset,
        "unread_only": unread_only,
    }
    return await _api_call("list_message_threads", payload)
141
+
142
+
143
@mcp.tool()
async def get_whatsapp_chat(phone_number: str, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """Get the full WhatsApp conversation with a phone number. Marks incoming as read.

    Args:
        phone_number: Phone number for the conversation (REQUIRED)
        limit: Max messages
        offset: Pagination offset
    """
    # The backend tool is named "get_conversation".
    payload: Dict[str, Any] = {
        "phone_number": phone_number,
        "limit": limit,
        "offset": offset,
    }
    return await _api_call("get_conversation", payload)
153
+
154
+
155
@mcp.tool()
async def get_whatsapp_message(message_id: str) -> Dict[str, Any]:
    """Get a single WhatsApp message by ID.

    Args:
        message_id: Message identifier (REQUIRED)
    """
    # The backend tool is named "get_message".
    payload: Dict[str, Any] = {"message_id": message_id}
    return await _api_call("get_message", payload)
163
+
164
+
165
@mcp.tool()
async def reply_to_whatsapp_message(
    message_id: str,
    body: str,
    media_url: Optional[str] = None,
) -> Dict[str, Any]:
    """Reply to a WhatsApp message.

    Args:
        message_id: Original message ID to reply to (REQUIRED)
        body: Reply text (REQUIRED)
        media_url: Optional media attachment
    """
    # The backend tool is named "reply_to_message".
    payload: Dict[str, Any] = {
        "message_id": message_id,
        "body": body,
        "media_url": media_url,
    }
    return await _api_call("reply_to_message", payload)
176
+
177
+
178
@mcp.tool()
async def forward_whatsapp_message(
    message_id: str,
    to_phone_number: str,
    additional_text: Optional[str] = None,
) -> Dict[str, Any]:
    """Forward a WhatsApp message to another phone number.

    Args:
        message_id: Message to forward (REQUIRED)
        to_phone_number: Recipient phone number (REQUIRED)
        additional_text: Optional text to add before forwarded content
    """
    # The backend tool is named "forward_message".
    payload: Dict[str, Any] = {
        "message_id": message_id,
        "to_phone_number": to_phone_number,
        "additional_text": additional_text,
    }
    return await _api_call("forward_message", payload)
191
+
192
+
193
@mcp.tool()
async def delete_whatsapp_message(message_id: str) -> Dict[str, Any]:
    """Delete a WhatsApp message by ID.

    Args:
        message_id: Message identifier (REQUIRED)
    """
    # The backend tool is named "delete_message".
    payload: Dict[str, Any] = {"message_id": message_id}
    return await _api_call("delete_message", payload)
201
+
202
+
203
@mcp.tool()
async def delete_whatsapp_chat(phone_number: str) -> Dict[str, Any]:
    """Delete an entire WhatsApp conversation with a phone number.

    Args:
        phone_number: Phone number of the conversation to delete (REQUIRED)
    """
    # The backend tool is named "delete_conversation".
    payload: Dict[str, Any] = {"phone_number": phone_number}
    return await _api_call("delete_conversation", payload)
211
+
212
+
213
@mcp.tool()
async def search_whatsapp_messages(
    query: str,
    phone_number: Optional[str] = None,
    direction: Optional[str] = None,
    limit: int = 50,
) -> Dict[str, Any]:
    """Search WhatsApp messages by text content.

    Args:
        query: Search term (REQUIRED)
        phone_number: Limit search to a specific conversation
        direction: Filter by 'incoming' or 'outgoing'
        limit: Max results
    """
    # The backend tool is named "search_messages"; unset filters go through as None.
    payload: Dict[str, Any] = {
        "query": query,
        "phone_number": phone_number,
        "direction": direction,
        "limit": limit,
    }
    return await _api_call("search_messages", payload)
227
+
228
+
229
@mcp.tool()
async def get_unread_whatsapp_messages(limit: int = 50) -> Dict[str, Any]:
    """Get all unread incoming WhatsApp messages.

    Args:
        limit: Max results
    """
    # The backend tool is named "get_unread_messages".
    payload: Dict[str, Any] = {"limit": limit}
    return await _api_call("get_unread_messages", payload)
237
+
238
+
239
@mcp.tool()
async def mark_whatsapp_message_read(message_id: str) -> Dict[str, Any]:
    """Mark a WhatsApp message as read.

    Args:
        message_id: Message identifier (REQUIRED)
    """
    # The backend tool is named "mark_message_read".
    payload: Dict[str, Any] = {"message_id": message_id}
    return await _api_call("mark_message_read", payload)
247
+
248
+
249
+ # ═════════════════════════════════════════════════════════════════════════════
250
+ # WhatsApp Phone Calls
251
+ # ═════════════════════════════════════════════════════════════════════════════
252
+
253
@mcp.tool()
async def make_whatsapp_call(
    phone_number: str,
    contact_id: Optional[str] = None,
    notes: Optional[str] = None,
) -> Dict[str, Any]:
    """Initiate a WhatsApp voice/video call.

    Args:
        phone_number: Phone number to call (REQUIRED)
        contact_id: Associated contact ID
        notes: Optional notes about the call
    """
    # The backend tool is named "make_call".
    payload: Dict[str, Any] = {
        "phone_number": phone_number,
        "contact_id": contact_id,
        "notes": notes,
    }
    return await _api_call("make_call", payload)
266
+
267
+
268
@mcp.tool()
async def list_whatsapp_calls(
    phone_number: Optional[str] = None,
    direction: Optional[str] = None,
    limit: int = 50,
    offset: int = 0,
) -> Dict[str, Any]:
    """List WhatsApp call history.

    Args:
        phone_number: Filter by phone number
        direction: Filter by 'incoming', 'outgoing', or 'missed'
        limit: Max results (default 50)
        offset: Pagination offset
    """
    # The backend tool is named "list_calls"; unset filters go through as None.
    payload: Dict[str, Any] = {
        "phone_number": phone_number,
        "direction": direction,
        "limit": limit,
        "offset": offset,
    }
    return await _api_call("list_calls", payload)
283
+
284
+
285
@mcp.tool()
async def get_whatsapp_call(call_id: str) -> Dict[str, Any]:
    """Get details of a specific WhatsApp call.

    Args:
        call_id: Call identifier (REQUIRED)
    """
    # The backend tool is named "get_call".
    payload: Dict[str, Any] = {"call_id": call_id}
    return await _api_call("get_call", payload)
293
+
294
+
295
@mcp.tool()
async def delete_whatsapp_call(call_id: str) -> Dict[str, Any]:
    """Delete a WhatsApp call record from history.

    Args:
        call_id: Call identifier (REQUIRED)
    """
    # The backend tool is named "delete_call".
    payload: Dict[str, Any] = {"call_id": call_id}
    return await _api_call("delete_call", payload)
303
+
304
+
305
+ # ═════════════════════════════════════════════════════════════════════════════
306
+ # Entrypoint
307
+ # ═════════════════════════════════════════════════════════════════════════════
308
+
309
def main() -> None:
    """Run the MCP server over the "http" transport.

    The bind host is read from WHATSAPP_MCP_HOST (default "localhost").
    The port comes from PORT when that variable is set to a non-blank value,
    otherwise from WHATSAPP_MCP_PORT (default "8865").
    """
    print("Starting WhatsApp MCP Server (proxying to ENV API)...", file=sys.stderr)
    bind_host = os.getenv("WHATSAPP_MCP_HOST", "localhost")

    # A blank or whitespace-only PORT counts as unset.
    raw_port = os.getenv("PORT", "").strip()
    if not raw_port:
        raw_port = os.getenv("WHATSAPP_MCP_PORT", "8865")
    bind_port = int(raw_port)

    mcp.run(transport="http", host=bind_host, port=bind_port)
315
+
316
+
317
+ if __name__ == "__main__":
318
+ main()
@@ -0,0 +1,270 @@
1
+ import asyncio
2
+ import base64
3
+ from typing import Optional, List, Dict, Any
4
+ from contextlib import AsyncExitStack
5
+ from pathlib import Path
6
+
7
+ from mcp import ClientSession, StdioServerParameters
8
+ from mcp.client.stdio import stdio_client
9
+
10
+ from anthropic import Anthropic
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv() # load environment variables from .env
14
+
15
+
16
class MCPClient:
    """Interactive chat client that lets Claude call tools served by an MCP server.

    The server script is launched as a subprocess speaking MCP over stdio.
    Its tools are advertised to Claude on every query, and any tool calls
    Claude makes are executed against the server until Claude stops asking
    for tools.
    """

    # Model and response budget used for every Claude request.
    _MODEL = "claude-sonnet-4-5"
    _MAX_TOKENS = 1000

    # File extensions accepted by encode_image and the media type sent for each.
    _MEDIA_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    def __init__(self):
        # The session stays None until connect_to_server() succeeds.
        self.session: Optional[ClientSession] = None
        # Owns the stdio transport and the session so cleanup() can close both.
        self.exit_stack = AsyncExitStack()
        self.anthropic = Anthropic()

    async def connect_to_server(self, server_script_path: str, extra_args: Optional[List[str]] = None):
        """Connect to an MCP server

        Args:
            server_script_path: Path to the server script (.py or .js)
            extra_args: Optional additional arguments to pass to the server

        Raises:
            ValueError: If the script is neither a .py nor a .js file.
        """
        is_python = server_script_path.endswith(".py")
        is_js = server_script_path.endswith(".js")
        if not (is_python or is_js):
            raise ValueError("Server script must be a .py or .js file")

        command = "python" if is_python else "node"
        args = [server_script_path]
        if extra_args:
            args.extend(extra_args)

        server_params = StdioServerParameters(command=command, args=args, env=None)

        # Both contexts are registered on the exit stack so that cleanup()
        # tears them down in reverse order.
        stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params))
        self.stdio, self.write = stdio_transport
        self.session = await self.exit_stack.enter_async_context(ClientSession(self.stdio, self.write))

        await self.session.initialize()

        # List available tools
        response = await self.session.list_tools()
        tools = response.tools
        print("\nConnected to server with tools:", [tool.name for tool in tools])

    def encode_image(self, image_path: str) -> Dict[str, Any]:
        """Encode an image file to base64 and determine its media type

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with image data and media type for Anthropic API

        Raises:
            FileNotFoundError: If nothing exists at image_path.
            ValueError: If the file extension is not a supported image format.
        """
        path = Path(image_path)
        if not path.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # The media type is chosen from the file extension alone
        # (case-insensitive); the file contents are not inspected.
        extension = path.suffix.lower()
        media_type = self._MEDIA_TYPES.get(extension)
        if not media_type:
            raise ValueError(
                f"Unsupported image format: {extension}. Supported formats: {list(self._MEDIA_TYPES.keys())}"
            )

        with open(path, "rb") as image_file:
            image_data = base64.standard_b64encode(image_file.read()).decode("utf-8")

        return {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}}

    def convert_mcp_content_to_anthropic(self, content: Any) -> Any:
        """Convert MCP tool result content to Anthropic API format

        Args:
            content: Content from MCP tool result

        Returns:
            For a list: a new list in which image and text items are plain
            Anthropic content-block dicts and any other item is passed through
            unchanged. Non-list content is returned as is.
        """
        if not isinstance(content, list):
            return content

        converted = []
        for item in content:
            if item.type == "image":
                # MCP carries mimeType/data at the top level; Anthropic nests
                # them under a base64 "source".
                converted.append(
                    {"type": "image", "source": {"type": "base64", "media_type": item.mimeType, "data": item.data}}
                )
            elif item.type == "text":
                # Emit a plain dict so the request payload does not depend on
                # how the MCP object happens to serialize.
                converted.append({"type": "text", "text": item.text})
            else:
                # Unknown format, keep as is
                converted.append(item)
        return converted

    def _create_message(self, messages: List[Dict[str, Any]], tools: List[Dict[str, Any]]) -> Any:
        """Send the conversation so far to Claude and return its response."""
        # NOTE: this call is made without await, so the event loop is blocked
        # until the response arrives.
        return self.anthropic.messages.create(
            model=self._MODEL,
            max_tokens=self._MAX_TOKENS,
            messages=messages,
            tools=tools,
        )

    async def process_query(self, query: str, image_paths: Optional[List[str]] = None) -> str:
        """Process a query using Claude and available tools

        Args:
            query: Text query to process
            image_paths: Optional list of image file paths to include in the query

        Returns:
            All text Claude produced, across every turn, joined by newlines.

        Raises:
            RuntimeError: If called before connect_to_server().
        """
        if self.session is None:
            raise RuntimeError("Not connected to an MCP server; call connect_to_server() first")

        # Images go first, then the text query.
        content: List[Dict[str, Any]] = []
        for image_path in image_paths or []:
            try:
                image_content = self.encode_image(image_path)
            except (OSError, ValueError) as e:
                # Best effort: an unreadable image is reported and skipped
                # rather than failing the whole query.
                print(f"Warning: Failed to load image {image_path}: {str(e)}")
            else:
                content.append(image_content)
                print(f"Loaded image: {image_path}")
        content.append({"type": "text", "text": query})

        messages = [{"role": "user", "content": content}]

        tool_listing = await self.session.list_tools()
        available_tools = [
            {
                "name": tool.name,
                "description": tool.description,
                "input_schema": tool.inputSchema,
            }
            for tool in tool_listing.tools
        ]

        response = self._create_message(messages, available_tools)
        final_text = []

        # Agentic loop: continue while there are tool calls
        while response.stop_reason == "tool_use":
            assistant_content = []
            tool_results = []

            for content_block in response.content:
                if content_block.type == "text":
                    final_text.append(content_block.text)
                    assistant_content.append({"type": "text", "text": content_block.text})
                elif content_block.type == "tool_use":
                    tool_name = content_block.name
                    tool_args = content_block.input
                    tool_use_id = content_block.id

                    assistant_content.append(
                        {"type": "tool_use", "id": tool_use_id, "name": tool_name, "input": tool_args}
                    )

                    print(f"[Calling tool {tool_name} with args {tool_args}]")
                    result = await self.session.call_tool(tool_name, tool_args)

                    tool_result = {
                        "type": "tool_result",
                        "tool_use_id": tool_use_id,
                        "content": self.convert_mcp_content_to_anthropic(result.content),
                    }
                    # Pass the server's failure flag on so Claude can tell a
                    # failed call from a successful one.
                    if getattr(result, "isError", False):
                        tool_result["is_error"] = True
                    tool_results.append(tool_result)

            # The assistant turn (text + tool_use) must precede the user turn
            # that carries the matching tool results.
            messages.append({"role": "assistant", "content": assistant_content})
            messages.append({"role": "user", "content": tool_results})

            response = self._create_message(messages, available_tools)

        # Process final response (no more tool calls)
        for content_block in response.content:
            if content_block.type == "text":
                final_text.append(content_block.text)

        return "\n".join(final_text)

    @staticmethod
    def _split_query_and_images(raw: str) -> tuple:
        """Split 'text --image a.png --image b.jpg' into (text, [paths])."""
        text, *image_parts = raw.split("--image")
        image_paths = []
        for part in image_parts:
            tokens = part.split()
            # Only the first token after each --image is taken as the path;
            # anything after it in that segment is ignored.
            if tokens:
                image_paths.append(tokens[0])
        return text.strip(), image_paths

    async def chat_loop(self):
        """Run an interactive chat loop

        Reads queries from stdin until the user types 'quit' or stdin is
        closed. Errors raised while handling one query are printed and the
        loop continues.
        """
        print("\nMCP Client Started!")
        print("Type your queries or 'quit' to exit.")
        print("To include images, use: your query --image path/to/image.jpg --image path/to/another.png")

        while True:
            try:
                query = input("\nQuery: ").strip()
                if query.lower() == "quit":
                    break

                query, image_paths = self._split_query_and_images(query)
                if not query and not image_paths:
                    # Nothing was typed; prompt again instead of sending an
                    # empty message to the API.
                    continue

                response = await self.process_query(query, image_paths if image_paths else None)
                print("\n" + response)

            except EOFError:
                # stdin is closed: input() would raise again on every
                # iteration, so leave the loop instead of reporting forever.
                print()
                break
            except Exception as e:
                print(f"\nError: {str(e)}")

    async def cleanup(self):
        """Clean up resources"""
        await self.exit_stack.aclose()
249
+
250
+
251
async def main():
    """Connect to the MCP server named on the command line and run the chat loop.

    Expects sys.argv[1] to be the server script path; any further arguments
    are forwarded to the server. Exits with status 1 after printing a usage
    line when no script path is given. The client is always cleaned up, even
    if connecting or chatting fails.
    """
    # Imported locally so main() also works when this module is imported and
    # main() is called from elsewhere, not only when run as a script.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python client_test.py <path_to_server_script> [additional_args...]")
        sys.exit(1)

    server_script = sys.argv[1]
    # An empty tail becomes None, which means "no extra arguments".
    extra_args = sys.argv[2:] or None

    client = MCPClient()
    try:
        await client.connect_to_server(server_script, extra_args)
        await client.chat_loop()
    finally:
        await client.cleanup()


if __name__ == "__main__":
    asyncio.run(main())