decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,565 @@
1
+ """
2
+ Portfolio Manager - Virtual Trading Portfolio
3
+
4
+ Manages a virtual portfolio for agent trading evaluation.
5
+ Supports both file-based persistence and in-memory mode for session isolation.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import uuid
11
+ from datetime import datetime
12
+
13
# Fixed clock used by the simulation: December 1, 2025, 1:30 PM ET
# (mid-day trading session).
SIMULATED_DATE = datetime(2025, 12, 1, 13, 30, 0)


def get_current_time():
    """Return the simulated timestamp, or the real clock when none is set."""
    # A falsy SIMULATED_DATE (e.g. None) falls through to the wall clock.
    return SIMULATED_DATE or datetime.now()
19
+ from typing import Dict, List, Any, Optional
20
+
21
+
22
+ class PortfolioManager:
23
+ """
24
+ Manage virtual portfolio for trading evaluation.
25
+
26
+ Usage:
27
+ # In-memory mode (for isolated sessions)
28
+ pm = PortfolioManager(initial_cash=10000)
29
+
30
+ # File-based mode (for persistence)
31
+ pm = PortfolioManager(state_file="/path/to/state.json", initial_cash=10000)
32
+
33
+ pm.buy("AAPL", 10, 150.0)
34
+ pm.sell("AAPL", 5, 155.0)
35
+ portfolio = pm.get_portfolio()
36
+ """
37
+
38
+ def __init__(self, state_file: Optional[str] = None, initial_cash: float = 10000.0,
39
+ initial_positions: Optional[Dict[str, Dict[str, float]]] = None):
40
+ """
41
+ Initialize PortfolioManager.
42
+
43
+ Args:
44
+ state_file: Optional path to portfolio state JSON file (None for in-memory)
45
+ initial_cash: Initial cash balance (default: $10,000)
46
+ initial_positions: Optional dict of initial positions, e.g. {"NVDA": {"quantity": 10, "avg_cost": 130.0}}
47
+ """
48
+ self.state_file = state_file
49
+ self.initial_cash = initial_cash
50
+ self.initial_positions = initial_positions or {}
51
+ self._state = self._load_state()
52
+
53
+ def _load_state(self) -> Dict[str, Any]:
54
+ """Load portfolio state from file or create new."""
55
+ if self.state_file and os.path.exists(self.state_file):
56
+ try:
57
+ with open(self.state_file, "r") as f:
58
+ return json.load(f)
59
+ except Exception:
60
+ pass
61
+
62
+ return self._create_initial_state()
63
+
64
def _create_initial_state(self) -> Dict[str, Any]:
    """Build a brand-new portfolio state seeded from ``self.initial_positions``."""
    holdings = {}
    for ticker, spec in self.initial_positions.items():
        if not isinstance(spec, (int, float)):
            # Dict format: the position is priced at its average cost
            # until a real price is recorded.
            cost = spec.get("avg_cost", 0.0)
            holdings[ticker] = {
                "quantity": spec.get("quantity", 0),
                "avg_cost": cost,
                "current_price": cost,
            }
            continue
        # Legacy format: {"AAPL": 100} — a bare quantity with no cost info.
        holdings[ticker] = {
            "quantity": int(spec),
            "avg_cost": 0.0,
            "current_price": 0.0,
        }

    state = {
        "cash": self.initial_cash,
        "positions": holdings,
        "transactions": [],
        "pending_orders": [],
        "cancelled_orders": [],
        "created_at": get_current_time().isoformat(),
        "updated_at": get_current_time().isoformat()
    }
    return state
91
+
92
def _save_state(self):
    """Stamp ``updated_at`` and, in file-based mode, write the state to disk."""
    self._state["updated_at"] = get_current_time().isoformat()
    if not self.state_file:
        # In-memory mode: nothing to persist.
        return
    # A bare filename has no directory component; fall back to the cwd.
    parent_dir = os.path.dirname(self.state_file) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(self.state_file, "w") as handle:
        json.dump(self._state, handle, indent=2)
99
+
100
def reset(self):
    """Discard the current state, start over from the initial one, and save it."""
    fresh_state = self._create_initial_state()
    self._state = fresh_state
    self._save_state()
104
+
105
def get_portfolio(self) -> Dict[str, Any]:
    """
    Build a summary snapshot of the portfolio.

    Returns:
        Dictionary with the cash balance, one entry per position (value and
        P&L figures rounded to 2 decimals), portfolio-level totals, the last
        20 transactions, and the pending orders (under "recent_orders").
    """
    state = self._state
    holdings = state.get("positions", {})
    cash = state.get("cash", 0)

    rows = []
    market_value = 0
    unrealized = 0

    for ticker, holding in holdings.items():
        qty = holding["quantity"]
        last_price = holding["current_price"]
        avg_cost = holding["avg_cost"]

        value = qty * last_price
        basis = qty * avg_cost
        gain = value - basis
        # A zero cost basis (e.g. legacy positions) has no meaningful percentage.
        if basis:
            gain_pct = gain / basis * 100
        else:
            gain_pct = 0

        rows.append({
            "symbol": ticker,
            "quantity": qty,
            "avg_cost": avg_cost,
            "current_price": last_price,
            "current_value": round(value, 2),
            "pnl": round(gain, 2),
            "pnl_percent": round(gain_pct, 2),
        })
        market_value += value
        unrealized += gain

    total_value = cash + market_value
    # Total value minus unrealized P&L is the denominator for the overall return.
    invested = total_value - unrealized
    total_pct = (unrealized / invested * 100) if invested else 0

    return {
        "cash": round(cash, 2),
        "positions": rows,
        "position_value": round(market_value, 2),
        "total_value": round(total_value, 2),
        "total_pnl": round(unrealized, 2),
        "total_pnl_percent": round(total_pct, 2),
        "transactions": state.get("transactions", [])[-20:],  # Last 20 transactions
        "recent_orders": state.get("pending_orders", []),
    }
150
+
151
def buy(self, symbol: str, quantity: int, price: float) -> Dict[str, Any]:
    """
    Buy shares of a stock.

    Args:
        symbol: Stock ticker symbol (upper-cased before use)
        quantity: Number of shares to buy; must be positive
        price: Price per share; must be positive

    Returns:
        On success: {"success": True, "transaction": ..., "remaining_cash": ...}.
        On failure: {"success": False, "error": <message>}; the portfolio
        is left untouched.
    """
    symbol = symbol.upper()

    if quantity <= 0:
        return {"success": False, "error": "Quantity must be positive"}

    # A negative price would make total_cost negative and *credit* cash.
    # `not (price > 0)` also rejects NaN, which would otherwise slip past
    # the funds comparison below and corrupt the cash balance.
    if not (price > 0):
        return {"success": False, "error": "Price must be positive"}

    total_cost = quantity * price

    if total_cost > self._state["cash"]:
        return {
            "success": False,
            "error": f"Insufficient funds. Need ${total_cost:.2f}, have ${self._state['cash']:.2f}"
        }

    # Update cash
    self._state["cash"] -= total_cost

    # Update position
    positions = self._state.get("positions", {})
    if symbol in positions:
        # Adding to an existing holding: blend the purchase into a
        # quantity-weighted average cost.
        pos = positions[symbol]
        old_qty = pos["quantity"]
        old_cost = pos["avg_cost"]
        new_qty = old_qty + quantity
        new_avg_cost = (old_qty * old_cost + quantity * price) / new_qty
        pos["quantity"] = new_qty
        pos["avg_cost"] = round(new_avg_cost, 4)
        pos["current_price"] = price
    else:
        positions[symbol] = {
            "quantity": quantity,
            "avg_cost": round(price, 4),
            "current_price": price,
        }
    # Re-attach in case the state had no "positions" key yet.
    self._state["positions"] = positions

    # Record transaction
    transaction = {
        "type": "buy",
        "symbol": symbol,
        "quantity": quantity,
        "price": round(price, 2),
        "total": round(total_cost, 2),
        "timestamp": get_current_time().isoformat()
    }
    self._state.setdefault("transactions", []).append(transaction)

    self._save_state()

    return {
        "success": True,
        "transaction": transaction,
        "remaining_cash": round(self._state["cash"], 2),
    }
215
+
216
def sell(self, symbol: str, quantity: int, price: float) -> Dict[str, Any]:
    """
    Sell shares of a stock.

    Args:
        symbol: Stock ticker symbol (upper-cased before use)
        quantity: Number of shares to sell; must be positive and no more
            than the held quantity
        price: Price per share; must be positive

    Returns:
        On success: {"success": True, "transaction": ..., "pnl": ...,
        "remaining_cash": ...}, where pnl is proceeds minus the average
        cost of the shares sold.
        On failure: {"success": False, "error": <message>}; the portfolio
        is left untouched.
    """
    symbol = symbol.upper()
    positions = self._state.get("positions", {})

    if symbol not in positions:
        return {"success": False, "error": f"No position in {symbol}"}

    pos = positions[symbol]
    if quantity > pos["quantity"]:
        return {
            "success": False,
            "error": f"Insufficient shares. Have {pos['quantity']}, trying to sell {quantity}"
        }

    if quantity <= 0:
        return {"success": False, "error": "Quantity must be positive"}

    # A negative price would *debit* cash on a sale; `not (price > 0)`
    # also rejects NaN, which would corrupt the cash balance.
    if not (price > 0):
        return {"success": False, "error": "Price must be positive"}

    # Calculate P&L
    proceeds = quantity * price
    cost_basis = quantity * pos["avg_cost"]
    pnl = proceeds - cost_basis

    # Update cash
    self._state["cash"] += proceeds

    # Update position; a fully sold holding is removed entirely.
    pos["quantity"] -= quantity
    pos["current_price"] = price
    if pos["quantity"] == 0:
        del positions[symbol]

    # Record transaction
    transaction = {
        "type": "sell",
        "symbol": symbol,
        "quantity": quantity,
        "price": round(price, 2),
        "total": round(proceeds, 2),
        "pnl": round(pnl, 2),
        "timestamp": get_current_time().isoformat()
    }
    self._state.setdefault("transactions", []).append(transaction)

    self._save_state()

    return {
        "success": True,
        "transaction": transaction,
        "pnl": round(pnl, 2),
        "remaining_cash": round(self._state["cash"], 2),
    }
278
+
279
def update_price(self, symbol: str, price: float):
    """Record a new current price for ``symbol``; no-op if it is not held."""
    ticker = symbol.upper()
    held = self._state.get("positions", {})
    if ticker not in held:
        return
    held[ticker]["current_price"] = price
285
+
286
def place_limit_order(self, action: str, symbol: str, quantity: int,
                      limit_price: float, market_price: float) -> Dict[str, Any]:
    """
    Place a limit order. Executes immediately if fillable, otherwise stays pending.

    Limit buy at/above market → fills immediately at market price.
    Limit buy below market → pending.
    Limit sell at/below market → fills immediately at market price.
    Limit sell above market → pending.

    Args:
        action: "buy" or "sell" (case-insensitive)
        symbol: Stock ticker symbol (upper-cased before use)
        quantity: Number of shares; must be positive
        limit_price: Limit price per share
        market_price: Current market price per share

    Returns:
        For an immediate fill, the result of ``buy()`` / ``sell()``.
        For a pending order: {"success": True, "order": ..., "message": ...,
        "remaining_cash": ...}.
        On failure: {"success": False, "error": <message>}.
    """
    symbol = symbol.upper()
    action = action.lower()

    if quantity <= 0:
        return {"success": False, "error": "Quantity must be positive"}

    # Without this guard an unknown action (e.g. "hold") would skip every
    # branch below and be stored as a pending order.
    if action not in ("buy", "sell"):
        return {"success": False, "error": "Action must be 'buy' or 'sell'"}

    # Check if limit order is immediately fillable
    if action == "buy" and limit_price >= market_price:
        return self.buy(symbol, quantity, market_price)
    elif action == "sell" and limit_price <= market_price:
        return self.sell(symbol, quantity, market_price)

    # Pending limit order — validate first
    if action == "sell":
        positions = self._state.get("positions", {})
        if symbol not in positions:
            return {"success": False, "error": f"No position in {symbol}"}
        if quantity > positions[symbol]["quantity"]:
            return {
                "success": False,
                "error": f"Insufficient shares. Have {positions[symbol]['quantity']}, trying to sell {quantity}"
            }
    else:
        # Pending buy: cash is checked against the limit price but is
        # not deducted or reserved here.
        total_cost = quantity * limit_price
        if total_cost > self._state["cash"]:
            return {
                "success": False,
                "error": f"Insufficient funds. Need ${total_cost:.2f}, have ${self._state['cash']:.2f}"
            }

    order = {
        "order_id": uuid.uuid4().hex[:8],
        "order_type": "limit",
        "action": action,
        "symbol": symbol,
        "quantity": quantity,
        "limit_price": round(limit_price, 2),
        "market_price_at_placement": round(market_price, 2),
        "status": "pending",
        "timestamp": get_current_time().isoformat()
    }
    self._state.setdefault("pending_orders", []).append(order)
    self._save_state()

    return {
        "success": True,
        "order": order,
        "message": f"Limit {action} order placed: {quantity} {symbol} @ ${limit_price:.2f} (market: ${market_price:.2f})",
        "remaining_cash": round(self._state["cash"], 2),
    }
346
+
347
def cancel_order(self, order_id: str) -> Dict[str, Any]:
    """
    Cancel the pending limit order whose ``order_id`` matches.

    The order is moved from the pending list to the cancelled list with its
    status set to "cancelled", and the state is saved.

    Returns:
        {"success": True, "cancelled_order": ..., "message": ...} when the
        order was found, otherwise {"success": False, "error": ...}.
    """
    open_orders = self._state.get("pending_orders", [])
    # Index of the first pending order carrying the requested id, if any.
    match_idx = next(
        (idx for idx, entry in enumerate(open_orders) if entry.get("order_id") == order_id),
        None,
    )
    if match_idx is None:
        return {"success": False, "error": f"No pending order with ID '{order_id}'"}

    cancelled = open_orders.pop(match_idx)
    cancelled["status"] = "cancelled"
    self._state.setdefault("cancelled_orders", []).append(cancelled)
    self._save_state()
    return {
        "success": True,
        "cancelled_order": cancelled,
        "message": f"Order {order_id} cancelled: {cancelled['action']} {cancelled['quantity']} {cancelled['symbol']} @ ${cancelled['limit_price']:.2f}",
    }
362
+
363
def get_cancelled_orders(self) -> List[Dict[str, Any]]:
    """Return the stored cancelled orders, or an empty list if none exist."""
    state = self._state
    if "cancelled_orders" in state:
        # Hand back the stored list itself, not a copy.
        return state["cancelled_orders"]
    return []
366
+
367
+ # =====================================================================
368
+ # OPTIONS TRADING
369
+ # =====================================================================
370
+
371
def buy_option(
    self,
    symbol: str,
    option_type: str,  # "call" or "put"
    strike: float,
    expiration: str,  # YYYY-MM-DD
    quantity: int,
    premium: float,  # Price per contract (per share)
) -> Dict[str, Any]:
    """
    Buy options contracts.

    Args:
        symbol: Underlying stock symbol
        option_type: "call" or "put"
        strike: Strike price
        expiration: Expiration date (YYYY-MM-DD)
        quantity: Number of contracts (1 contract = 100 shares)
        premium: Option premium per share; must be non-negative

    Returns:
        Transaction result with success status. On failure (bad option
        type, non-positive quantity, invalid premium, insufficient funds)
        the result carries an "error" message and no state is changed.
    """
    symbol = symbol.upper()
    option_type = option_type.lower()

    if option_type not in ("call", "put"):
        return {"success": False, "error": "Option type must be 'call' or 'put'"}

    if quantity <= 0:
        return {"success": False, "error": "Quantity must be positive"}

    # A negative premium would make total_cost negative and *credit* cash.
    # NaN must be rejected too: `nan > cash` is False, so it would slip
    # past the funds check below and turn the cash balance into NaN.
    if not premium >= 0:
        return {"success": False, "error": "Premium must be non-negative"}

    # Total cost = premium * 100 shares * quantity contracts
    total_cost = premium * 100 * quantity

    if total_cost > self._state["cash"]:
        return {
            "success": False,
            "error": f"Insufficient funds. Need ${total_cost:.2f}, have ${self._state['cash']:.2f}"
        }

    # Update cash
    self._state["cash"] -= total_cost

    # Create option key
    # NOTE(review): the key embeds `strike` verbatim, so 150 and 150.0
    # address different positions; callers must pass it consistently.
    option_key = f"{symbol}_{option_type.upper()}_{strike}_{expiration}"

    # Update option position
    options = self._state.setdefault("options", {})
    if option_key in options:
        # Existing position: blend the new lot into a weighted average premium.
        opt = options[option_key]
        old_qty = opt["quantity"]
        old_cost = opt["avg_premium"]
        new_qty = old_qty + quantity
        new_avg = (old_qty * old_cost + quantity * premium) / new_qty
        opt["quantity"] = new_qty
        opt["avg_premium"] = round(new_avg, 4)
        opt["current_premium"] = premium
    else:
        options[option_key] = {
            "symbol": symbol,
            "type": option_type,
            "strike": strike,
            "expiration": expiration,
            "quantity": quantity,
            "avg_premium": round(premium, 4),
            "current_premium": premium,
        }

    # Record transaction
    transaction = {
        "type": "buy_option",
        "symbol": symbol,
        "option_type": option_type,
        "strike": strike,
        "expiration": expiration,
        "quantity": quantity,
        "premium": round(premium, 4),
        "total": round(total_cost, 2),
        "timestamp": get_current_time().isoformat()
    }
    self._state.setdefault("transactions", []).append(transaction)

    self._save_state()

    return {
        "success": True,
        "transaction": transaction,
        "remaining_cash": round(self._state["cash"], 2),
    }
461
+
462
def sell_option(
    self,
    symbol: str,
    option_type: str,
    strike: float,
    expiration: str,
    quantity: int,
    premium: float,
) -> Dict[str, Any]:
    """
    Sell options contracts.

    Args:
        symbol: Underlying stock symbol
        option_type: "call" or "put"
        strike: Strike price
        expiration: Expiration date (YYYY-MM-DD)
        quantity: Number of contracts to sell
        premium: Current option premium per share; must be non-negative

    Returns:
        Transaction result with success status and P&L. On failure (no
        matching position, too many contracts, non-positive quantity,
        invalid premium) the result carries an "error" message and no
        state is changed.
    """
    symbol = symbol.upper()
    option_type = option_type.lower()
    # NOTE(review): the key embeds `strike` verbatim, so 150 and 150.0
    # address different positions; it must match what was used when buying.
    option_key = f"{symbol}_{option_type.upper()}_{strike}_{expiration}"

    options = self._state.get("options", {})

    if option_key not in options:
        return {"success": False, "error": f"No position in {option_key}"}

    opt = options[option_key]
    if quantity > opt["quantity"]:
        return {
            "success": False,
            "error": f"Insufficient contracts. Have {opt['quantity']}, trying to sell {quantity}"
        }

    if quantity <= 0:
        return {"success": False, "error": "Quantity must be positive"}

    # A negative premium would make proceeds negative and *debit* cash on
    # a sale; NaN would turn the cash balance into NaN. Reject both.
    if not premium >= 0:
        return {"success": False, "error": "Premium must be non-negative"}

    # Calculate P&L (1 contract = 100 shares)
    proceeds = premium * 100 * quantity
    cost_basis = opt["avg_premium"] * 100 * quantity
    pnl = proceeds - cost_basis

    # Update cash
    self._state["cash"] += proceeds

    # Update position; drop it entirely once fully closed
    opt["quantity"] -= quantity
    opt["current_premium"] = premium
    if opt["quantity"] == 0:
        del options[option_key]

    # Record transaction
    transaction = {
        "type": "sell_option",
        "symbol": symbol,
        "option_type": option_type,
        "strike": strike,
        "expiration": expiration,
        "quantity": quantity,
        "premium": round(premium, 4),
        "total": round(proceeds, 2),
        "pnl": round(pnl, 2),
        "timestamp": get_current_time().isoformat()
    }
    self._state.setdefault("transactions", []).append(transaction)

    self._save_state()

    return {
        "success": True,
        "transaction": transaction,
        "pnl": round(pnl, 2),
        "remaining_cash": round(self._state["cash"], 2),
    }
541
+
542
def get_option_positions(self) -> List[Dict[str, Any]]:
    """
    List every open option position with its valuation.

    Returns:
        One dict per stored option position: the stored fields plus
        its state key, current value and P&L (both rounded to 2 places).
    """

    def _summarize(position_key: str, holding: Dict[str, Any]) -> Dict[str, Any]:
        # Value and basis both use the 100-shares-per-contract multiplier.
        market_value = holding["current_premium"] * 100 * holding["quantity"]
        basis = holding["avg_premium"] * 100 * holding["quantity"]
        gain = market_value - basis
        return {
            "key": position_key,
            "symbol": holding["symbol"],
            "type": holding["type"],
            "strike": holding["strike"],
            "expiration": holding["expiration"],
            "quantity": holding["quantity"],
            "avg_premium": holding["avg_premium"],
            "current_premium": holding["current_premium"],
            "current_value": round(market_value, 2),
            "pnl": round(gain, 2),
        }

    held = self._state.get("options", {})
    return [_summarize(name, details) for name, details in held.items()]
@@ -0,0 +1,20 @@
1
"""
Evaluation Module - Action Logging and Evaluation

Components:
    ActionLogger - Log agent actions
    ActionEntry - Single action entry
    ActionLog - Exported from .logger alongside ActionLogger/ActionEntry
                (presumably the container of entries - TODO confirm)
    Evaluator - Evaluate agent performance
    EvaluationResult - Evaluation result data
"""

from .logger import ActionLogger, ActionEntry, ActionLog
from .evaluator import Evaluator, EvaluationResult

# Public API of this package: exactly the names imported above.
__all__ = [
    'ActionLogger',
    'ActionEntry',
    'ActionLog',
    'Evaluator',
    'EvaluationResult',
]