decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374)
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,92 @@
1
+ import asyncio
2
+ import threading
3
+ from typing import Any, Coroutine, TypeVar
4
+
5
+ T = TypeVar("T")
6
+
7
+
8
class AsyncHelper:
    """
    Helper class to run async operations in a sync context.

    This is needed because PocketFlow nodes are synchronous, but MCP client
    operations are asynchronous. We run async operations in a separate thread
    with its own event loop to avoid "event loop already running" errors.

    The helper can be driven explicitly with start()/stop() or used as a
    context manager, which calls them on entry and exit.
    """

    def __init__(self):
        """Initialize the async helper (no loop or thread is created yet)."""
        self._loop = None
        self._thread = None
        self._started = False

    def start(self):
        """
        Start the background event loop in a separate thread.

        Calling start() on an already started helper is a no-op.
        """
        if self._started:
            return

        def run_loop(loop):
            """Run the event loop in the thread."""
            asyncio.set_event_loop(loop)
            loop.run_forever()

        self._loop = asyncio.new_event_loop()
        # Daemon thread: a loop that is never stopped must not keep the
        # interpreter alive at exit.
        self._thread = threading.Thread(target=run_loop, args=(self._loop,), daemon=True)
        self._thread.start()
        self._started = True

    def stop(self):
        """
        Stop the background event loop.

        Asks the loop to stop, waits up to 5 seconds for the thread to exit,
        and closes the loop. Calling stop() on a helper that is not started
        is a no-op. The helper state is always reset, so start() can be
        called again afterwards.
        """
        if not self._started:
            return

        loop, thread = self._loop, self._thread
        try:
            if loop is not None and not loop.is_closed():
                if thread is not None and thread.is_alive():
                    loop.call_soon_threadsafe(loop.stop)
                    thread.join(timeout=5.0)
                # loop.close() raises RuntimeError on a loop that is still
                # running, so only close it once the thread has really exited.
                # Otherwise the daemon thread is left to die with the process.
                if thread is None or not thread.is_alive():
                    loop.close()
        finally:
            self._loop = None
            self._thread = None
            self._started = False

    def run_async(self, coro: "Coroutine[Any, Any, T]", timeout: float = 300.0) -> "T":
        """
        Run an async coroutine and return its result synchronously.

        The coroutine is scheduled on the background loop and the calling
        thread blocks until it finishes or the timeout elapses.

        Args:
            coro: The coroutine to run
            timeout: Timeout in seconds (default: 300s / 5 minutes)

        Returns:
            The result of the coroutine

        Raises:
            RuntimeError: If the helper is not started or loop is not running
            TimeoutError: If waiting for the result exceeds ``timeout`` (the
                scheduled coroutine is cancelled in that case)
            concurrent.futures.CancelledError: If the future is cancelled
        """
        # Local import keeps this block independent of the module import list.
        import concurrent.futures

        if not self._started:
            raise RuntimeError("AsyncHelper not started. Call start() first.")

        # Check if the loop and thread are still alive
        if self._loop is None or self._loop.is_closed():
            raise RuntimeError("AsyncHelper event loop is closed.")
        if self._thread is None or not self._thread.is_alive():
            raise RuntimeError("AsyncHelper thread is not running.")

        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        try:
            return future.result(timeout=timeout)
        # Before Python 3.11 concurrent.futures.TimeoutError is NOT the
        # builtin TimeoutError, so the builtin name alone would miss it.
        except concurrent.futures.TimeoutError as exc:
            # If the future finished with this very exception, the coroutine
            # raised it itself: propagate it instead of mislabelling it as a
            # wait timeout.
            if future.done() and not future.cancelled() and future.exception() is exc:
                raise
            future.cancel()
            raise TimeoutError(f"Async operation timed out after {timeout}s") from None

    def __enter__(self):
        """Context manager entry: start the background loop and return self."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: stop the loop; exceptions are not suppressed."""
        self.stop()
        return False
@@ -0,0 +1,279 @@
1
+ import asyncio
2
+ from typing import Dict, Any, List, Optional, Tuple
3
+ from pocketflow import Flow
4
+ from .nodes import DecideActionNode, ExecuteToolNode, FinalAnswerNode
5
+ from .async_helper import AsyncHelper
6
+
7
+
8
class MCPReactAgent:
    """
    A ReAct (Reasoning and Acting) agent that uses MCP tools through FastMCP.

    This agent follows the ReAct pattern:
    1. Think: Reason about the current state and decide what to do
    2. Act: Execute a tool through MCP
    3. Observe: Process the tool's result
    4. Repeat until the task is complete

    The agent is built using PocketFlow for workflow orchestration and
    FastMCP for MCP server communication.

    Supports multiple MCP servers - tools from all servers are combined.
    """

    def __init__(
        self,
        system_prompt: str,
        mcp_server_url: Optional[str] = None,
        mcp_server_urls: Optional[List[str]] = None,
        model: Optional[str] = None,
        max_iterations: int = 50,
        timeout: float = 30.0,
    ):
        """
        Initialize the MCP ReAct Agent.

        Args:
            system_prompt: System prompt that defines the agent's role and behavior
            mcp_server_url: URL of a single MCP server (for backward compatibility)
            mcp_server_urls: List of MCP server URLs (preferred for multiple servers);
                takes precedence over mcp_server_url when both are given
            model: Optional LLM model name to use
            max_iterations: Maximum number of reasoning iterations
            timeout: Timeout for MCP operations in seconds
        """
        self.system_prompt = system_prompt

        # Support both single URL (backward compat) and multiple URLs
        if mcp_server_urls:
            self.mcp_server_urls = mcp_server_urls
        elif mcp_server_url:
            self.mcp_server_urls = [mcp_server_url]
        else:
            self.mcp_server_urls = []

        # Keep single URL for backward compatibility
        self.mcp_server_url = self.mcp_server_urls[0] if self.mcp_server_urls else None

        self.model = model
        self.max_iterations = max_iterations
        self.timeout = timeout

        # MCP clients (will be initialized when needed) - one per server
        self._mcp_clients: Dict[str, Any] = {}  # url -> client
        self._tool_to_client: Dict[str, Any] = {}  # tool_name -> client (for routing)
        # Backward-compat handle to the first connected client. It must exist
        # before any connection attempt because run_async() reads it even when
        # no server is configured or reachable.
        self._mcp_client: Optional[Any] = None

        # Build the ReAct flow
        self._flow = self._build_flow()

    def _build_flow(self) -> "Flow":
        """Build the ReAct workflow using PocketFlow."""
        # Create nodes
        decide_node = DecideActionNode()
        execute_tool_node = ExecuteToolNode()
        final_answer_node = FinalAnswerNode()

        # Connect nodes based on actions
        # From decide_node:
        #   - "use_tool" -> execute_tool_node
        #   - "answer" -> final_answer_node
        #   - "retry" -> back to decide_node (for parse error recovery)
        decide_node - "use_tool" >> execute_tool_node
        decide_node - "answer" >> final_answer_node
        decide_node - "retry" >> decide_node

        # From execute_tool_node:
        #   - "decide" -> back to decide_node (for next iteration)
        execute_tool_node - "decide" >> decide_node

        # final_answer_node doesn't connect to anything (ends the flow)

        # Create and return the flow
        return Flow(start=decide_node)

    async def _initialize_mcp_client(self):
        """
        Initialize MCP client connections for all configured servers.

        A server that fails to connect is reported with a warning and skipped,
        so the remaining servers are still tried.

        Returns:
            The mapping of server URL -> connected client.
        """
        from fastmcp import Client

        for url in self.mcp_server_urls:
            try:
                # Create client for this server
                client = Client(url, timeout=self.timeout)

                # Connect to the server
                await client.__aenter__()

                # Store the client
                self._mcp_clients[url] = client
                print(f"[MCP] Connected to server: {url}")

            except Exception as e:
                print(f"[MCP] Warning: Failed to connect to {url}: {e}")

        # For backward compatibility, expose the first client that actually
        # connected (in configured order). Looking up only the first URL would
        # yield None whenever that particular server failed.
        self._mcp_client = next(
            (self._mcp_clients[url] for url in self.mcp_server_urls if url in self._mcp_clients),
            None,
        )

        return self._mcp_clients

    async def _close_mcp_client(self):
        """
        Close all MCP client connections.

        Errors while closing one client are reported and do not prevent the
        others from being closed. All client bookkeeping is reset afterwards.
        """
        for url, client in self._mcp_clients.items():
            try:
                await client.__aexit__(None, None, None)
                print(f"[MCP] Disconnected from server: {url}")
            except Exception as e:
                print(f"[MCP] Warning: Error closing connection to {url}: {e}")

        self._mcp_clients = {}
        self._mcp_client = None
        self._tool_to_client = {}

    async def _get_available_tools(self) -> List[Dict[str, Any]]:
        """
        Get the list of available tools from all MCP servers.

        Also fills the tool-name -> client routing map. If two servers expose
        a tool with the same name, the routing entry of the later server
        overwrites the earlier one, while both tools stay in the returned list.

        Returns:
            One dict per tool with "name", "description", "inputSchema" and
            "_mcp_server" keys. Servers whose listing fails are skipped with
            a warning.
        """
        all_tools = []

        for url, client in self._mcp_clients.items():
            try:
                tools_response = await client.list_tools()

                # Convert to our format and track which client owns each tool
                for tool in tools_response:
                    tool_name = tool.name
                    # Store mapping from tool name to client for routing
                    self._tool_to_client[tool_name] = client

                    all_tools.append({
                        "name": tool_name,
                        "description": tool.description or "",
                        "inputSchema": tool.inputSchema or {},
                        "_mcp_server": url,  # Track source server (internal use)
                    })

                print(f"[MCP] Loaded {len(tools_response)} tools from {url}")

            except Exception as e:
                print(f"Warning: Failed to get tools from MCP server {url}: {e}")

        return all_tools

    def get_client_for_tool(self, tool_name: str) -> Optional[Any]:
        """Get the MCP client that provides a specific tool, or None if unknown."""
        return self._tool_to_client.get(tool_name)

    async def run_async(
        self, user_query: str
    ) -> Tuple[str, List[Dict[str, Any]]]:
        """
        Run the agent asynchronously with the given query.

        Args:
            user_query: The user's input query/task

        Returns:
            A tuple of (final_answer, trajectory)
            - final_answer: The agent's final response, or "No answer provided"
              when the flow finished without setting one
            - trajectory: List of all reasoning steps, actions, and observations
        """
        async_helper = AsyncHelper()

        # Everything that acquires resources happens inside the try block so
        # that connections opened before a failure are still closed.
        try:
            # Initialize MCP client
            await self._initialize_mcp_client()

            # Start the async helper used by the synchronous flow nodes
            async_helper.start()

            # Get available tools
            available_tools = await self._get_available_tools()

            # Initialize shared store
            shared = {
                "system_prompt": self.system_prompt,
                "user_query": user_query,
                "trajectory": [],
                "message_history": [],  # Initialize message history for conversational context
                "available_tools": available_tools,
                "max_iterations": self.max_iterations,
                "model": self.model,
                "mcp_client": self._mcp_client,  # Pass the first MCP client (backward compat)
                "mcp_clients": self._mcp_clients,  # Pass all MCP clients
                "tool_to_client": self._tool_to_client,  # Pass tool->client routing map
                "async_helper": async_helper,  # Pass the async helper
                "final_answer": None,
                "current_decision": None,
            }

            # Run the flow using standard PocketFlow Flow.run()
            # NOTE(review): this is a plain synchronous call inside a coroutine,
            # so it presumably blocks the calling event loop until the flow
            # finishes - confirm this is acceptable for the nodes' MCP usage.
            self._flow.run(shared)

            # Extract results. "final_answer" is always present in the store
            # (seeded with None above), so a dict.get() default would never
            # apply; test for None explicitly.
            final_answer = shared.get("final_answer")
            if final_answer is None:
                final_answer = "No answer provided"
            trajectory = shared.get("trajectory", [])

            return final_answer, trajectory

        finally:
            # Clean up; the MCP connections are closed even if stopping the
            # helper fails. stop() is a no-op when the helper never started.
            try:
                async_helper.stop()
            finally:
                await self._close_mcp_client()

    def run(
        self, user_query: str
    ) -> Tuple[str, List[Dict[str, Any]]]:
        """
        Run the agent synchronously with the given query.

        Args:
            user_query: The user's input query/task

        Returns:
            A tuple of (final_answer, trajectory)
            - final_answer: The agent's final response
            - trajectory: List of all reasoning steps, actions, and observations
        """
        # Run the async version
        return asyncio.run(self.run_async(user_query))

    def format_trajectory(self, trajectory: List[Dict[str, Any]]) -> str:
        """
        Format the trajectory into a human-readable string.

        Entries are rendered by their "type": "thought" opens a new numbered
        iteration, "action" and "observation" add one line each, and
        "final_answer" adds a closing section. Other types are ignored.

        Args:
            trajectory: The trajectory list from run()

        Returns:
            A formatted string representation of the trajectory
        """
        lines = []
        thought_count = 0

        for entry in trajectory:
            entry_type = entry.get("type", "unknown")

            if entry_type == "thought":
                thought_count += 1
                lines.append(f"\n{'='*60}")
                lines.append(f"ITERATION {thought_count}")
                lines.append(f"{'='*60}")
                lines.append(f"Thought: {entry.get('content', '')}")

            elif entry_type == "action":
                tool_name = entry.get("tool_name", "unknown")
                tool_args = entry.get("tool_arguments", {})
                lines.append(f"Action: {tool_name}({tool_args})")

            elif entry_type == "observation":
                lines.append(f"Observation: {entry.get('content', '')}")

            elif entry_type == "final_answer":
                lines.append(f"\n{'='*60}")
                lines.append("FINAL ANSWER")
                lines.append(f"{'='*60}")
                lines.append(entry.get("content", ""))

        return "\n".join(lines)
@@ -0,0 +1,74 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Dict, List, Any, Optional
3
+ from pocketflow import Flow
4
+ from agent.pocketflow.src.agent import MCPServerInfo
5
+
6
+
7
@dataclass
class NativeMCPReactAgent:
    """
    User-customizable PocketFlow Native Agent.

    Holds the MCP server configurations and an optional custom flow, and
    derives a tool -> server routing map plus a flat tool list from them.
    """

    # Required: MCP server configurations
    mcp_servers: Dict[str, 'MCPServerInfo']
    # Optional: Custom PocketFlow Flow (None = use default ReAct Flow)
    flow: Optional['Flow'] = None

    # Auto-computed attributes (populated in __post_init__); any values passed
    # to the constructor for these two fields are discarded and rebuilt.
    tool_to_server: Dict[str, str] = field(default_factory=dict)
    all_tools: List[Dict[str, Any]] = field(default_factory=list)

    def __post_init__(self):
        """
        Build tool_to_server mapping and all_tools list from mcp_servers.

        This method is automatically called after dataclass initialization.
        It iterates through all MCP servers and their tools to build:
        - tool_to_server: Maps each tool name to its server name
        - all_tools: Flat list of all tools from all servers

        Tools may be dicts or objects with attributes; tools without a name
        are skipped.

        Raises:
            ValueError: If duplicate tool names are found across servers.
        """
        # Reset computed fields to ensure clean state
        self.tool_to_server = {}
        self.all_tools = []

        for server_name, server_info in self.mcp_servers.items():
            for tool in server_info.tools:
                is_mapping = isinstance(tool, dict)

                # Get tool name (support both dict and object-like access)
                tool_name = tool.get("name") if is_mapping else getattr(tool, "name", None)

                if not tool_name:
                    continue

                # Check for duplicate tool names
                if tool_name in self.tool_to_server:
                    existing_server = self.tool_to_server[tool_name]
                    raise ValueError(
                        f"Duplicate tool name '{tool_name}' found in servers "
                        f"'{existing_server}' and '{server_name}'"
                    )

                # Map tool to server
                self.tool_to_server[tool_name] = server_name

                # Add to all_tools list
                if is_mapping:
                    # Copy so the caller's dict is not mutated, then make sure
                    # the server field is set
                    tool_copy = dict(tool)
                    tool_copy["server"] = server_name
                    self.all_tools.append(tool_copy)
                else:
                    # Convert object to dict. Attributes that are present but
                    # set to None are normalized to empty values so consumers
                    # always get a string description and a dict schema.
                    self.all_tools.append({
                        "name": tool_name,
                        "server": server_name,
                        "description": getattr(tool, "description", None) or "",
                        "inputSchema": getattr(tool, "inputSchema", None) or {},
                    })

    def has_custom_flow(self) -> bool:
        """Check if a custom flow is defined."""
        return self.flow is not None