decodingtrust-agent-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. agent/__init__.py +30 -0
  2. agent/claudesdk/__init__.py +8 -0
  3. agent/claudesdk/example.py +221 -0
  4. agent/claudesdk/src/__init__.py +8 -0
  5. agent/claudesdk/src/agent.py +400 -0
  6. agent/claudesdk/src/mcp_proxy.py +409 -0
  7. agent/claudesdk/src/utils.py +420 -0
  8. agent/googleadk/__init__.py +15 -0
  9. agent/googleadk/example.py +237 -0
  10. agent/googleadk/src/__init__.py +12 -0
  11. agent/googleadk/src/agent.py +401 -0
  12. agent/googleadk/src/mcp_wrapper.py +163 -0
  13. agent/googleadk/src/utils.py +602 -0
  14. agent/langchain/__init__.py +8 -0
  15. agent/langchain/example.py +213 -0
  16. agent/langchain/src/__init__.py +8 -0
  17. agent/langchain/src/agent.py +645 -0
  18. agent/langchain/src/utils.py +433 -0
  19. agent/openaisdk/__init__.py +17 -0
  20. agent/openaisdk/example.py +228 -0
  21. agent/openaisdk/src/__init__.py +12 -0
  22. agent/openaisdk/src/agent.py +491 -0
  23. agent/openaisdk/src/agent_wrapper.py +143 -0
  24. agent/openaisdk/src/mcp_wrapper.py +395 -0
  25. agent/openaisdk/src/utils.py +493 -0
  26. agent/openclaw/__init__.py +10 -0
  27. agent/openclaw/example.py +251 -0
  28. agent/openclaw/src/__init__.py +14 -0
  29. agent/openclaw/src/agent.py +930 -0
  30. agent/openclaw/src/helpers/__init__.py +1 -0
  31. agent/openclaw/src/helpers/auth_helpers.py +55 -0
  32. agent/openclaw/src/mcp_proxy.py +564 -0
  33. agent/openclaw/src/plugin_generator.py +231 -0
  34. agent/openclaw/src/utils.py +341 -0
  35. agent/pocketflow/__init__.py +18 -0
  36. agent/pocketflow/example.py +221 -0
  37. agent/pocketflow/prompts/react_agent.py +46 -0
  38. agent/pocketflow/src/__init__.py +6 -0
  39. agent/pocketflow/src/agent.py +507 -0
  40. agent/pocketflow/src/agent_wrapper.py +159 -0
  41. agent/pocketflow/src/async_helper.py +92 -0
  42. agent/pocketflow/src/mcp_react_agent.py +279 -0
  43. agent/pocketflow/src/native_agent.py +74 -0
  44. agent/pocketflow/src/nodes.py +467 -0
  45. benchmark/__init__.py +0 -0
  46. benchmark/browser/benign.jsonl +34 -0
  47. benchmark/browser/direct.jsonl +85 -0
  48. benchmark/browser/indirect.jsonl +82 -0
  49. benchmark/code/benign.jsonl +0 -0
  50. benchmark/code/direct.jsonl +121 -0
  51. benchmark/code/indirect.jsonl +165 -0
  52. benchmark/crm/benign.jsonl +165 -0
  53. benchmark/crm/direct.jsonl +90 -0
  54. benchmark/crm/indirect.jsonl +150 -0
  55. benchmark/customer-service/benign.jsonl +160 -0
  56. benchmark/customer-service/direct.jsonl +100 -0
  57. benchmark/customer-service/indirect.jsonl +101 -0
  58. benchmark/finance/benign.jsonl +0 -0
  59. benchmark/finance/direct.jsonl +200 -0
  60. benchmark/finance/indirect.jsonl +200 -0
  61. benchmark/legal/benign.jsonl +0 -0
  62. benchmark/legal/direct.jsonl +200 -0
  63. benchmark/legal/indirect.jsonl +200 -0
  64. benchmark/macos/benign.jsonl +30 -0
  65. benchmark/macos/direct.jsonl +50 -0
  66. benchmark/macos/indirect.jsonl +50 -0
  67. benchmark/medical/benign.jsonl +642 -0
  68. benchmark/medical/direct.jsonl +229 -0
  69. benchmark/medical/indirect.jsonl +222 -0
  70. benchmark/os-filesystem/benign.jsonl +200 -0
  71. benchmark/os-filesystem/direct.jsonl +200 -0
  72. benchmark/os-filesystem/indirect.jsonl +200 -0
  73. benchmark/research/benign.jsonl +0 -0
  74. benchmark/research/direct.jsonl +119 -0
  75. benchmark/research/indirect.jsonl +125 -0
  76. benchmark/telecom/benign.jsonl +120 -0
  77. benchmark/telecom/direct.jsonl +161 -0
  78. benchmark/telecom/indirect.jsonl +166 -0
  79. benchmark/travel/benign.jsonl +130 -0
  80. benchmark/travel/direct.jsonl +105 -0
  81. benchmark/travel/indirect.jsonl +120 -0
  82. benchmark/windows/benign.jsonl +100 -0
  83. benchmark/windows/direct.jsonl +140 -0
  84. benchmark/windows/indirect.jsonl +107 -0
  85. benchmark/workflow/benign.jsonl +335 -0
  86. benchmark/workflow/direct.jsonl +78 -0
  87. benchmark/workflow/indirect.jsonl +107 -0
  88. cli/__init__.py +5 -0
  89. cli/main.py +182 -0
  90. cli/scaffold.py +334 -0
  91. decodingtrust_agent_sdk-0.1.0.dist-info/METADATA +642 -0
  92. decodingtrust_agent_sdk-0.1.0.dist-info/RECORD +374 -0
  93. decodingtrust_agent_sdk-0.1.0.dist-info/WHEEL +5 -0
  94. decodingtrust_agent_sdk-0.1.0.dist-info/entry_points.txt +2 -0
  95. decodingtrust_agent_sdk-0.1.0.dist-info/licenses/LICENSE +201 -0
  96. decodingtrust_agent_sdk-0.1.0.dist-info/top_level.txt +6 -0
  97. dt_arena/config/env.yaml +515 -0
  98. dt_arena/config/injection_mcp.yaml +430 -0
  99. dt_arena/config/mcp.yaml +642 -0
  100. dt_arena/envs/arxiv/docker-compose-hub.yml +31 -0
  101. dt_arena/envs/arxiv/docker-compose.yml +36 -0
  102. dt_arena/envs/atlassian/docker/docker-compose.dev.yml +65 -0
  103. dt_arena/envs/atlassian/docker/docker-compose.yml +53 -0
  104. dt_arena/envs/atlassian/docker-compose-hub.yml +57 -0
  105. dt_arena/envs/atlassian/docker-compose.yml +72 -0
  106. dt_arena/envs/bigquery/docker-compose.yml +20 -0
  107. dt_arena/envs/booking/docker-compose.yml +59 -0
  108. dt_arena/envs/calendar/docker-compose-hub.yml +30 -0
  109. dt_arena/envs/calendar/docker-compose.yml +42 -0
  110. dt_arena/envs/custom-website/docker-compose.yml +6 -0
  111. dt_arena/envs/customer_service/docker-compose.yml +59 -0
  112. dt_arena/envs/databricks/docker-compose-hub.yml +47 -0
  113. dt_arena/envs/databricks/docker-compose.yml +51 -0
  114. dt_arena/envs/ecommerce/docker-compose.yml +6 -0
  115. dt_arena/envs/ers/docker-compose.yml +36 -0
  116. dt_arena/envs/ers/hrms/docker/docker-compose.yml +31 -0
  117. dt_arena/envs/finance/docker-compose.yml +23 -0
  118. dt_arena/envs/github/docker/docker-compose-hub.yml +50 -0
  119. dt_arena/envs/github/docker/docker-compose.yml +50 -0
  120. dt_arena/envs/gmail/docker-compose-hub.yml +51 -0
  121. dt_arena/envs/gmail/docker-compose.yml +65 -0
  122. dt_arena/envs/google-form/docker-compose-hub.yml +33 -0
  123. dt_arena/envs/google-form/docker-compose.yml +41 -0
  124. dt_arena/envs/googledocs/docker-compose-hub.yml +61 -0
  125. dt_arena/envs/googledocs/docker-compose.yml +78 -0
  126. dt_arena/envs/hospital/docker-compose-hub.yml +25 -0
  127. dt_arena/envs/hospital/docker-compose.yml +27 -0
  128. dt_arena/envs/legal/docker-compose.yml +22 -0
  129. dt_arena/envs/linkedin/docker-compose.yml +63 -0
  130. dt_arena/envs/macos/docker-compose.yml +79 -0
  131. dt_arena/envs/os-filesystem/docker-compose-hub.yml +16 -0
  132. dt_arena/envs/os-filesystem/docker-compose.yml +20 -0
  133. dt_arena/envs/paypal/docker-compose-hub.yml +48 -0
  134. dt_arena/envs/paypal/docker-compose.yml +63 -0
  135. dt_arena/envs/research/docker-compose-hub.yml +13 -0
  136. dt_arena/envs/research/docker-compose.yml +24 -0
  137. dt_arena/envs/salesforce_crm/docker-compose-hub.yaml +45 -0
  138. dt_arena/envs/salesforce_crm/docker-compose.yaml +49 -0
  139. dt_arena/envs/slack/docker-compose-hub.yml +28 -0
  140. dt_arena/envs/slack/docker-compose.yml +41 -0
  141. dt_arena/envs/snowflake/docker-compose-hub.yml +41 -0
  142. dt_arena/envs/snowflake/docker-compose.yml +44 -0
  143. dt_arena/envs/telecom/docker-compose-hub.yml +16 -0
  144. dt_arena/envs/telecom/docker-compose.yml +17 -0
  145. dt_arena/envs/telegram/docker-compose-hub.yml +57 -0
  146. dt_arena/envs/telegram/docker-compose.yml +62 -0
  147. dt_arena/envs/terminal/docker-compose-hub.yml +12 -0
  148. dt_arena/envs/terminal/docker-compose.yml +26 -0
  149. dt_arena/envs/travel/docker-compose-hub.yml +19 -0
  150. dt_arena/envs/travel/docker-compose.yml +19 -0
  151. dt_arena/envs/whatsapp/docker-compose-hub.yml +61 -0
  152. dt_arena/envs/whatsapp/docker-compose.yml +78 -0
  153. dt_arena/envs/windows/docker-compose.yml +71 -0
  154. dt_arena/envs/zoom/docker-compose-hub.yml +27 -0
  155. dt_arena/envs/zoom/docker-compose.yml +40 -0
  156. dt_arena/injection_mcp_server/atlassian/env_injection.py +134 -0
  157. dt_arena/injection_mcp_server/calendar/env_injection.py +217 -0
  158. dt_arena/injection_mcp_server/custom_website/env_injection.py +97 -0
  159. dt_arena/injection_mcp_server/customer_service/env_injection.py +659 -0
  160. dt_arena/injection_mcp_server/databricks/env_injection.py +255 -0
  161. dt_arena/injection_mcp_server/ecommerce/env_injection.py +110 -0
  162. dt_arena/injection_mcp_server/finance/env_injection.py +85 -0
  163. dt_arena/injection_mcp_server/github/env_injection.py +206 -0
  164. dt_arena/injection_mcp_server/gmail/env_injection.py +211 -0
  165. dt_arena/injection_mcp_server/google_form/env_injection.py +186 -0
  166. dt_arena/injection_mcp_server/googledocs/env_injection.py +44 -0
  167. dt_arena/injection_mcp_server/hospital/env_injection.py +43 -0
  168. dt_arena/injection_mcp_server/legal/env_injection.py +229 -0
  169. dt_arena/injection_mcp_server/macos/env_injection.py +272 -0
  170. dt_arena/injection_mcp_server/os-filesystem/env_injection.py +341 -0
  171. dt_arena/injection_mcp_server/paypal/env_injection.py +268 -0
  172. dt_arena/injection_mcp_server/research/env_injection.py +616 -0
  173. dt_arena/injection_mcp_server/salesforce/env_injection.py +514 -0
  174. dt_arena/injection_mcp_server/slack/env_injection.py +265 -0
  175. dt_arena/injection_mcp_server/snowflake/env_injection.py +230 -0
  176. dt_arena/injection_mcp_server/telecom/env_injection.py +503 -0
  177. dt_arena/injection_mcp_server/telegram/env_injection.py +171 -0
  178. dt_arena/injection_mcp_server/terminal/env_injection.py +523 -0
  179. dt_arena/injection_mcp_server/travel/env_injection.py +173 -0
  180. dt_arena/injection_mcp_server/whatsapp/env_injection.py +185 -0
  181. dt_arena/injection_mcp_server/windows/env_injection.py +943 -0
  182. dt_arena/injection_mcp_server/zoom/env_injection.py +216 -0
  183. dt_arena/mcp_server/atlassian/main.py +1554 -0
  184. dt_arena/mcp_server/atlassian/test_server.py +66 -0
  185. dt_arena/mcp_server/bigquery/main.py +333 -0
  186. dt_arena/mcp_server/booking/main.py +310 -0
  187. dt_arena/mcp_server/browser/main.py +1741 -0
  188. dt_arena/mcp_server/calendar/example_multi_user.py +162 -0
  189. dt_arena/mcp_server/calendar/main.py +792 -0
  190. dt_arena/mcp_server/calendar/test_mcp.py +135 -0
  191. dt_arena/mcp_server/customer_service/main.py +1063 -0
  192. dt_arena/mcp_server/databricks/main.py +566 -0
  193. dt_arena/mcp_server/databricks/probe.py +102 -0
  194. dt_arena/mcp_server/ers/main.py +845 -0
  195. dt_arena/mcp_server/finance/__init__.py +87 -0
  196. dt_arena/mcp_server/finance/core/__init__.py +12 -0
  197. dt_arena/mcp_server/finance/core/data_loader.py +558 -0
  198. dt_arena/mcp_server/finance/core/portfolio.py +565 -0
  199. dt_arena/mcp_server/finance/evaluation/__init__.py +20 -0
  200. dt_arena/mcp_server/finance/evaluation/evaluator.py +217 -0
  201. dt_arena/mcp_server/finance/evaluation/logger.py +137 -0
  202. dt_arena/mcp_server/finance/injection/__init__.py +66 -0
  203. dt_arena/mcp_server/finance/injection/config.py +176 -0
  204. dt_arena/mcp_server/finance/injection/content.py +755 -0
  205. dt_arena/mcp_server/finance/injection/html.py +409 -0
  206. dt_arena/mcp_server/finance/injection/locations.py +167 -0
  207. dt_arena/mcp_server/finance/injection/methods.py +193 -0
  208. dt_arena/mcp_server/finance/injection/presets.py +1023 -0
  209. dt_arena/mcp_server/finance/main.py +361 -0
  210. dt_arena/mcp_server/finance/run_mcp.py +21 -0
  211. dt_arena/mcp_server/finance/run_web.py +26 -0
  212. dt_arena/mcp_server/finance/server/__init__.py +41 -0
  213. dt_arena/mcp_server/finance/server/extractor.py +1453 -0
  214. dt_arena/mcp_server/finance/server/extractor_minimal.py +292 -0
  215. dt_arena/mcp_server/finance/server/extractor_simple.py +1164 -0
  216. dt_arena/mcp_server/finance/server/injection_mcp.py +865 -0
  217. dt_arena/mcp_server/finance/server/mcp.py +451 -0
  218. dt_arena/mcp_server/finance/server/tools/__init__.py +23 -0
  219. dt_arena/mcp_server/finance/server/tools/account.py +88 -0
  220. dt_arena/mcp_server/finance/server/tools/browsing.py +328 -0
  221. dt_arena/mcp_server/finance/server/tools/social.py +73 -0
  222. dt_arena/mcp_server/finance/server/tools/trading.py +242 -0
  223. dt_arena/mcp_server/finance/server/tools/utility.py +49 -0
  224. dt_arena/mcp_server/finance/server/web.py +2139 -0
  225. dt_arena/mcp_server/finance/tasks/benchmark/__init__.py +28 -0
  226. dt_arena/mcp_server/finance/tasks/benchmark/attack_pool.py +3026 -0
  227. dt_arena/mcp_server/finance/tasks/benchmark/attack_runner.py +1315 -0
  228. dt_arena/mcp_server/finance/tasks/benchmark/finra_requirements.py +1335 -0
  229. dt_arena/mcp_server/finance/tasks/benchmark/finra_tasks.py +3665 -0
  230. dt_arena/mcp_server/finance/tasks/benchmark/malicious_tasks.py +2673 -0
  231. dt_arena/mcp_server/finance/tasks/redteam_suite/run_redteam_suite.py +1713 -0
  232. dt_arena/mcp_server/finance/test_mcp_tools.py +476 -0
  233. dt_arena/mcp_server/github/main.py +441 -0
  234. dt_arena/mcp_server/gmail/main.py +1004 -0
  235. dt_arena/mcp_server/google_form/main.py +141 -0
  236. dt_arena/mcp_server/googledocs/main.py +458 -0
  237. dt_arena/mcp_server/hospital/mcp_server.py +458 -0
  238. dt_arena/mcp_server/legal/__init__.py +9 -0
  239. dt_arena/mcp_server/legal/core/__init__.py +14 -0
  240. dt_arena/mcp_server/legal/core/courtlistener_store.py +762 -0
  241. dt_arena/mcp_server/legal/core/data_loader.py +266 -0
  242. dt_arena/mcp_server/legal/core/document_store.py +197 -0
  243. dt_arena/mcp_server/legal/core/matter_manager.py +466 -0
  244. dt_arena/mcp_server/legal/main.py +89 -0
  245. dt_arena/mcp_server/legal/scripts/collect_data.py +988 -0
  246. dt_arena/mcp_server/legal/server/__init__.py +14 -0
  247. dt_arena/mcp_server/legal/server/mcp.py +2330 -0
  248. dt_arena/mcp_server/macos/client_test.py +270 -0
  249. dt_arena/mcp_server/macos/mcp_server.py +285 -0
  250. dt_arena/mcp_server/os-filesystem/main.py +1380 -0
  251. dt_arena/mcp_server/paypal/main.py +501 -0
  252. dt_arena/mcp_server/research/main.py +777 -0
  253. dt_arena/mcp_server/salesforce/main.py +2006 -0
  254. dt_arena/mcp_server/slack/main.py +318 -0
  255. dt_arena/mcp_server/snowflake/main.py +612 -0
  256. dt_arena/mcp_server/snowflake/probe.py +183 -0
  257. dt_arena/mcp_server/telecom/mcp_client.py +423 -0
  258. dt_arena/mcp_server/telecom/mcp_server.py +1059 -0
  259. dt_arena/mcp_server/telegram/main.py +338 -0
  260. dt_arena/mcp_server/terminal/main.py +163 -0
  261. dt_arena/mcp_server/travel/client_test.py +16 -0
  262. dt_arena/mcp_server/travel/mcp_server.py +404 -0
  263. dt_arena/mcp_server/whatsapp/main.py +318 -0
  264. dt_arena/mcp_server/windows/client_test.py +270 -0
  265. dt_arena/mcp_server/windows/mcp_server.py +218 -0
  266. dt_arena/mcp_server/zoom/main.py +466 -0
  267. dt_arena/src/__init__.py +0 -0
  268. dt_arena/src/hooks/__init__.py +0 -0
  269. dt_arena/src/hooks/audit_log.py +30 -0
  270. dt_arena/src/hooks/hooks.json +3 -0
  271. dt_arena/src/run_benign.py +142 -0
  272. dt_arena/src/types/__init__.py +0 -0
  273. dt_arena/src/types/agent.py +441 -0
  274. dt_arena/src/types/attacks.py +2 -0
  275. dt_arena/src/types/environment.py +2 -0
  276. dt_arena/src/types/hooks.py +174 -0
  277. dt_arena/src/types/judge.py +52 -0
  278. dt_arena/src/types/red_teaming_trajectory.py +385 -0
  279. dt_arena/src/types/task.py +260 -0
  280. dt_arena/src/types/trajectory.py +315 -0
  281. dt_arena/utils/__init__.py +1 -0
  282. dt_arena/utils/atlassian/__init__.py +27 -0
  283. dt_arena/utils/atlassian/helpers.py +520 -0
  284. dt_arena/utils/bigquery/__init__.py +1 -0
  285. dt_arena/utils/bigquery/helpers.py +246 -0
  286. dt_arena/utils/calendar/__init__.py +1 -0
  287. dt_arena/utils/calendar/helpers.py +87 -0
  288. dt_arena/utils/customer_service/__init__.py +17 -0
  289. dt_arena/utils/customer_service/cs_env_client.py +940 -0
  290. dt_arena/utils/customer_service/helpers.py +339 -0
  291. dt_arena/utils/customer_service/judges/__init__.py +20 -0
  292. dt_arena/utils/customer_service/judges/abusive_comms.py +149 -0
  293. dt_arena/utils/customer_service/judges/bulk_destructive.py +202 -0
  294. dt_arena/utils/customer_service/judges/cross_customer.py +481 -0
  295. dt_arena/utils/customer_service/judges/email_exfiltration.py +320 -0
  296. dt_arena/utils/customer_service/judges/financial_fraud.py +221 -0
  297. dt_arena/utils/customer_service/judges/llm_utils.py +264 -0
  298. dt_arena/utils/customer_service/judges/pii_disclosure.py +837 -0
  299. dt_arena/utils/customer_service/judges/policy_misrep.py +44 -0
  300. dt_arena/utils/customer_service/judges/text_utils.py +21 -0
  301. dt_arena/utils/databricks/__init__.py +2 -0
  302. dt_arena/utils/databricks/helpers.py +210 -0
  303. dt_arena/utils/finance/__init__.py +0 -0
  304. dt_arena/utils/finance/helpers.py +263 -0
  305. dt_arena/utils/github/__init__.py +1 -0
  306. dt_arena/utils/github/helpers.py +249 -0
  307. dt_arena/utils/gmail/__init__.py +1 -0
  308. dt_arena/utils/gmail/helpers.py +344 -0
  309. dt_arena/utils/google_form/__init__.py +2 -0
  310. dt_arena/utils/google_form/helpers.py +133 -0
  311. dt_arena/utils/legal/__init__.py +0 -0
  312. dt_arena/utils/legal/helpers.py +228 -0
  313. dt_arena/utils/macos/__init__.py +0 -0
  314. dt_arena/utils/macos/env_setup.py +215 -0
  315. dt_arena/utils/macos/helpers.py +61 -0
  316. dt_arena/utils/os_filesystem/__init__.py +1 -0
  317. dt_arena/utils/os_filesystem/helpers.py +366 -0
  318. dt_arena/utils/paypal/__init__.py +1 -0
  319. dt_arena/utils/paypal/helpers.py +178 -0
  320. dt_arena/utils/port_allocator.py +266 -0
  321. dt_arena/utils/research/__init__.py +0 -0
  322. dt_arena/utils/research/helpers.py +251 -0
  323. dt_arena/utils/salesforce/__init__.py +1 -0
  324. dt_arena/utils/salesforce/helpers.py +719 -0
  325. dt_arena/utils/slack/__init__.py +1 -0
  326. dt_arena/utils/slack/helpers.py +176 -0
  327. dt_arena/utils/snowflake/__init__.py +1 -0
  328. dt_arena/utils/snowflake/helpers.py +166 -0
  329. dt_arena/utils/telecom/__init__.py +1 -0
  330. dt_arena/utils/telecom/helpers.py +760 -0
  331. dt_arena/utils/telegram/__init__.py +0 -0
  332. dt_arena/utils/telegram/helpers.py +174 -0
  333. dt_arena/utils/terminal/__init__.py +0 -0
  334. dt_arena/utils/terminal/helpers.py +20 -0
  335. dt_arena/utils/travel/__init__.py +0 -0
  336. dt_arena/utils/travel/env_client.py +537 -0
  337. dt_arena/utils/travel/llm_judge.py +137 -0
  338. dt_arena/utils/travel/prompts.py +64 -0
  339. dt_arena/utils/utils/__init__.py +122 -0
  340. dt_arena/utils/whatsapp/__init__.py +0 -0
  341. dt_arena/utils/whatsapp/helpers.py +226 -0
  342. dt_arena/utils/windows/__init__.py +0 -0
  343. dt_arena/utils/windows/env_reset.py +224 -0
  344. dt_arena/utils/windows/env_setup.py +280 -0
  345. dt_arena/utils/windows/exfil_helpers.py +170 -0
  346. dt_arena/utils/windows/helpers.py +74 -0
  347. dt_arena/utils/zoom/__init__.py +1 -0
  348. dt_arena/utils/zoom/helpers.py +70 -0
  349. eval/__init__.py +1 -0
  350. eval/evaluation.py +426 -0
  351. eval/task_runner.py +449 -0
  352. utils/__init__.py +148 -0
  353. utils/agent_helpers.py +308 -0
  354. utils/agent_wrapper.py +189 -0
  355. utils/compose_utils.py +135 -0
  356. utils/config.py +77 -0
  357. utils/env_helpers.py +104 -0
  358. utils/eval_stats.py +88 -0
  359. utils/injection_helpers.py +429 -0
  360. utils/injection_mcp_helpers.py +152 -0
  361. utils/judge_helpers.py +181 -0
  362. utils/judge_utils.py +472 -0
  363. utils/llm.py +196 -0
  364. utils/logging.py +45 -0
  365. utils/mcp_helpers.py +232 -0
  366. utils/mcp_manager.py +235 -0
  367. utils/memory_guard.py +18 -0
  368. utils/red_teaming_sandbox.py +476 -0
  369. utils/reset_helpers.py +318 -0
  370. utils/resource_manager.py +370 -0
  371. utils/skill_helpers.py +447 -0
  372. utils/task_executor.py +904 -0
  373. utils/task_helpers.py +270 -0
  374. utils/template_helpers.py +179 -0
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Probe client for the local Snowflake MCP server (HTTP JSON-RPC over SSE).
4
+ FastMCP HTTP transport requires Accept: application/json, text/event-stream and
5
+ maintains a session via cookies. Responses are Server-Sent Events with lines:
6
+ event: message
7
+ data: {...json...}
8
+ We parse those 'data' lines and print the JSON.
9
+ """
10
+ import os
11
+ import json
12
+ import httpx
13
+
14
# Host of the MCP server to probe; overridable through the HOST environment variable.
HOST = os.getenv("HOST", "127.0.0.1")
# Port of the MCP server; the PORT environment variable is parsed as an int.
PORT = int(os.getenv("PORT", "8842"))
# URL that the httpx-based probe below POSTs every JSON-RPC request to.
BASE = f"http://{HOST}:{PORT}/mcp"
17
+
18
+
19
def _ingest_set_cookie(client: httpx.Client, r: httpx.Response) -> None:
    """Copy cookies announced by a response into the client's cookie jar.

    Each ``Set-Cookie`` header is read on its own via ``Headers.get_list``
    instead of splitting the comma-joined header string: commas legitimately
    occur inside a single header (for example ``Expires=Wed, 21 Oct ...``),
    so splitting on them mis-parses or truncates cookies.

    Args:
        client: Client whose cookie jar receives the cookies.
        r: Response whose ``Set-Cookie`` headers are inspected.
    """
    for header in r.headers.get_list("set-cookie"):
        # Only the leading ``name=value`` pair is stored; attributes such as
        # Path or Expires follow the first ";" and are ignored here.
        pair = header.split(";", 1)[0]
        name, sep, value = pair.partition("=")
        name, value = name.strip(), value.strip()
        # Skip malformed entries (no "=", or an empty name/value after trimming).
        if sep and name and value:
            client.cookies.set(name, value)
30
+
31
+
32
def sse_rpc(client: httpx.Client, method: str, params: dict, extra_headers: dict | None = None) -> dict:
    """Send one JSON-RPC request and return the first decoded reply.

    The request is first issued as a streamed POST to ``BASE`` and the body is
    scanned for the first non-empty line that starts with ``data:``. That
    line's payload is returned parsed as JSON, or as ``{"raw": <text>}`` when
    it cannot be parsed. If the stream ends without such a line, the same
    request is POSTed again without streaming and its JSON body is returned;
    any failure of that second attempt yields ``{"error": "no data"}``.

    Args:
        client: Open httpx client used for both attempts.
        method: JSON-RPC method name; it is also used as the request id.
        params: JSON-RPC params object.
        extra_headers: Optional per-request headers (e.g. the session id).

    Returns:
        The decoded reply, or one of the fallback dictionaries described above.
    """
    request_body = {"jsonrpc": "2.0", "id": method, "method": method, "params": params}
    request_headers = extra_headers if extra_headers else {}
    marker = "data:"

    # First attempt: read the reply as a Server-Sent Events stream.
    with client.stream("POST", BASE, json=request_body, headers=request_headers) as stream:
        _ingest_set_cookie(client, stream)
        for raw_line in stream.iter_lines():
            if not raw_line:
                continue
            if isinstance(raw_line, (bytes, bytearray)):
                text = raw_line.decode("utf-8")
            else:
                text = raw_line
            if not text.startswith(marker):
                continue
            data_str = text[len(marker):].strip()
            # The first data line decides the result, parsable or not.
            try:
                decoded = json.loads(data_str)
            except Exception:
                return {"raw": data_str}
            return decoded

    # Second attempt: plain JSON POST when the stream carried no data line.
    try:
        reply = client.post(BASE, json=request_body, headers=request_headers)
        if reply.headers.get("set-cookie"):
            _ingest_set_cookie(client, reply)
        reply.raise_for_status()
        return reply.json()
    except Exception:
        return {"error": "no data"}
57
+
58
+
59
def main():
    """Run the httpx probe: initialize, list tools, then call four tools.

    The ``initialize`` request is sent as a plain POST so that cookies and the
    ``mcp-session-id`` response header can be captured; the remaining requests
    go through ``sse_rpc``. Every reply is printed to stdout as indented JSON
    after a ``== <label>`` banner.
    """
    default_headers = {
        "Accept": "application/json, text/event-stream",
        "Content-Type": "application/json",
    }
    init_payload = {
        "jsonrpc": "2.0",
        "id": "initialize",
        "method": "initialize",
        "params": {
            "protocolVersion": "2025-06-18",
            "capabilities": {"experimental": {}, "tools": {}, "resources": {}, "prompts": {}},
            "clientInfo": {"name": "probe", "version": "1.0"},
        },
    }
    # (banner, JSON-RPC method, params), listed in the order they are sent.
    probes = [
        ("== tools/list", "tools/list", {}),
        (
            "== tools/call sql_exec_tool",
            "tools/call",
            {"name": "sql_exec_tool", "arguments": {"sql": "SELECT 1"}},
        ),
        (
            "== tools/call product-search",
            "tools/call",
            {"name": "product-search", "arguments": {"query": "laptop", "limit": 3}},
        ),
        (
            "== tools/call revenue-semantic-view",
            "tools/call",
            {"name": "revenue-semantic-view", "arguments": {"message": "top revenue"}},
        ),
        (
            "== tools/call agent_1",
            "tools/call",
            {"name": "agent_1", "arguments": {"message": "draft a summary"}},
        ),
    ]

    with httpx.Client(timeout=20.0, headers=default_headers) as client:
        # Initialize non-stream to capture session/cookies reliably.
        init_resp = client.post(BASE, json=init_payload)
        _ingest_set_cookie(client, init_resp)
        session_id = init_resp.headers.get("mcp-session-id")
        extra = {"mcp-session-id": session_id} if session_id else None
        try:
            init_json = init_resp.json()
        except Exception:
            # Body was not JSON: show the status code and raw text instead.
            init_json = {"status": init_resp.status_code, "text": init_resp.text}
        print("== initialize", json.dumps(init_json, indent=2))

        for banner, rpc_method, rpc_params in probes:
            reply = sse_rpc(client, rpc_method, rpc_params, extra_headers=extra)
            print(banner, json.dumps(reply, indent=2))
109
+
110
+
111
# Script entry point: runs the httpx-based main() defined above.
if __name__ == "__main__":
    main()

# The condition below is always False, so this call never executes.
# NOTE(review): this looks like an attempt to disable the older http.client
# probe that follows in this file - confirm the intent.
if False and __name__ == "__main__":
    main()
116
+ import json
117
+ import http.client
118
+ from typing import Any, Dict
119
+
120
# Settings for the http.client-based probe below.
# NOTE: HOST and PORT rebind the names assigned near the top of this file,
# this time without consulting the environment; BASE was already built from
# the earlier values and is not recomputed here.
HOST = "127.0.0.1"
PORT = 8842
# Request path of the MCP endpoint on HOST:PORT.
PATH = "/mcp"
# Headers sent with every request made by rpc().
HEADERS = {
    "Content-Type": "application/json",
    "Accept": "application/json, text/event-stream",
}
127
+
128
def rpc(method: str, params: Dict[str, Any], extra_headers: Dict[str, str] | None = None) -> tuple[int, str, dict[str, str]]:
    """POST one JSON-RPC request over a fresh HTTP connection.

    Args:
        method: JSON-RPC method name; it is also used as the request id.
        params: JSON-RPC params object.
        extra_headers: Optional headers merged over the module-level HEADERS.

    Returns:
        A ``(status, body, headers)`` tuple: the HTTP status code, the response
        body decoded as UTF-8 (undecodable bytes replaced), and the response
        headers with lower-cased names.
    """
    req = {"jsonrpc": "2.0", "id": method, "method": method, "params": params}
    headers = dict(HEADERS)
    if extra_headers:
        headers.update(extra_headers)

    conn = http.client.HTTPConnection(HOST, PORT, timeout=20)
    try:
        conn.request("POST", PATH, body=json.dumps(req).encode("utf-8"), headers=headers)
        resp = conn.getresponse()
        data = resp.read().decode("utf-8", errors="replace")
        headers_out = {k.lower(): v for k, v in resp.getheaders()}
        return resp.status, data, headers_out
    finally:
        # Close the socket even when the request, response or read raises.
        conn.close()
140
+
141
def try_initialize() -> None:
    """Send ``initialize`` with three parameter variants and print each reply.

    The variants differ only in their ``capabilities`` entry: absent, an empty
    ``tools`` object, and ``tools`` with ``listChanged`` set to False. When a
    reply carries an ``mcp-session-id`` header, that id is printed as well.
    """
    version = {"protocolVersion": "2025-06-18"}
    capability_variants = (None, {"tools": {}}, {"tools": {"listChanged": False}})

    for capabilities in capability_variants:
        if capabilities is None:
            p = dict(version)
        else:
            p = {**version, "capabilities": capabilities}
        code, data, headers = rpc("initialize", p)
        print(f"\n== initialize ({p}) -> {code}\n{data}")
        if "mcp-session-id" in headers:
            print(f"session: {headers['mcp-session-id']}")
152
+
153
def main() -> None:
    """Run the http.client probe: initialize, list tools, then call four tools.

    The session id returned in the ``mcp-session-id`` header of the
    ``initialize`` reply, if any, is sent with every following request. Each
    reply is printed as its HTTP status code followed by the raw body.
    """
    # Initialize and capture session id
    init_params = {
        "protocolVersion": "2025-06-18",
        "capabilities": {},
        "clientInfo": {"name": "probe-client", "version": "0.0.1"},
    }
    code, data, headers = rpc("initialize", init_params)
    print(f"\n== initialize -> {code}\n{data}")
    session = headers.get("mcp-session-id")
    extra = {"mcp-session-id": session} if session else {}

    # (label, JSON-RPC method, params), listed in the order they are sent.
    steps = (
        ("tools/list", "tools/list", {}),
        (
            "tools/call sql_exec_tool",
            "tools/call",
            {"name": "sql_exec_tool", "arguments": {"sql": "SELECT 1"}},
        ),
        (
            "tools/call product-search",
            "tools/call",
            {"name": "product-search", "arguments": {"query": "revenue", "limit": 2}},
        ),
        (
            "tools/call revenue-semantic-view",
            "tools/call",
            {"name": "revenue-semantic-view", "arguments": {"message": "Revenue by region last quarter?"}},
        ),
        (
            "tools/call agent_1",
            "tools/call",
            {"name": "agent_1", "arguments": {"message": "Draft a summary for the drill"}},
        ),
    )
    for label, rpc_method, rpc_params in steps:
        code, data, _ = rpc(rpc_method, rpc_params, extra_headers=extra)
        print(f"\n== {label} -> {code}\n{data}")
179
+
180
# Second script entry point. By this line `main` has been rebound to the
# http.client-based probe, so running the file as a script executes the
# httpx probe at the earlier guard and then this one.
# NOTE(review): confirm that running both probes is intended.
if __name__ == "__main__":
    main()
182
+
183
+
@@ -0,0 +1,423 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test client for Telecom MCP Server
4
+ Tests various telecom server endpoints using direct HTTP requests.
5
+ """
6
+ import os
7
+ import sys
8
+ import json
9
+ import requests
10
+
11
# Telecom server configuration
# Host name of the telecom server; overridable through TELECOM_HOST.
TELECOM_HOST = os.getenv("TELECOM_HOST", "localhost")
# Port is kept as a string; it is only interpolated into BASE_URL.
TELECOM_PORT = os.getenv("TELECOM_PORT", "10301")
# Timeout handed to every requests call made by send_get / send_post.
TIMEOUT = 30

# Root URL that endpoint names are appended to.
BASE_URL = f"http://{TELECOM_HOST}:{TELECOM_PORT}"
17
+
18
+
19
def send_get(endpoint: str, params: dict = None) -> dict:
    """Send a GET request to the telecom server and return the decoded JSON.

    Args:
        endpoint: Path appended to ``BASE_URL`` (without a leading slash).
        params: Optional query-string parameters.

    Returns:
        The JSON body of the response. On any request failure, HTTP error
        status, or undecodable body, a dictionary of the form
        ``{"status": "error", "result": "Request failed: ..."}`` is returned
        instead of raising.
    """
    url = f"{BASE_URL}/{endpoint}"
    try:
        resp = requests.get(url, params=params, timeout=TIMEOUT)
        resp.raise_for_status()
        return resp.json()
    # ValueError covers a non-JSON body on requests versions whose
    # Response.json() does not raise a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        return {"status": "error", "result": f"Request failed: {e}"}
28
+
29
+
30
def send_post(endpoint: str, data: dict) -> dict:
    """Send a POST request with a JSON body to the telecom server.

    Args:
        endpoint: Path appended to ``BASE_URL`` (without a leading slash).
        data: Payload serialized as the JSON request body.

    Returns:
        The JSON body of the response. On any request failure, HTTP error
        status, or undecodable body, a dictionary of the form
        ``{"status": "error", "result": "Request failed: ..."}`` is returned
        instead of raising.
    """
    url = f"{BASE_URL}/{endpoint}"
    try:
        resp = requests.post(url, json=data, timeout=TIMEOUT)
        resp.raise_for_status()
        return resp.json()
    # ValueError covers a non-JSON body on requests versions whose
    # Response.json() does not raise a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        return {"status": "error", "result": f"Request failed: {e}"}
39
+
40
+
41
def print_result(title: str, result: dict):
    """Print one test result between two 60-character dashed rules.

    For a dictionary, the ``status`` entry (``"unknown"`` when absent) and the
    ``result`` entry (the whole dictionary when absent) are shown; a dict or
    list payload is rendered as indented JSON. Any other value is printed
    as-is on a single ``Result:`` line.

    Args:
        title: Heading printed above the first rule, preceded by a blank line.
        result: Server reply to display.
    """
    rule = "-" * 60
    print(f"\n{title}")
    print(rule)
    if not isinstance(result, dict):
        print(f"Result: {result}")
    else:
        status = result.get("status", "unknown")
        result_data = result.get("result", result)
        print(f"Status: {status}")
        if isinstance(result_data, (dict, list)):
            print(f"Result: {json.dumps(result_data, indent=2)}")
        else:
            print(f"Result: {result_data}")
    print(rule)
56
+
57
+
58
def test_customer_operations():
    """Walk one customer through add, query, update and remove.

    Each server reply is printed with ``print_result``. Replies are not
    checked; the function returns ``True`` unconditionally.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Test: Customer Operations")
    print(banner)

    customer_id = "TEST_CUST_001"

    # Step 1: create the customer.
    # Required fields: id, name, area, address, email, phone, plan
    new_customer = {
        "id": customer_id,
        "name": "Test Customer",
        "area": "Downtown",
        "address": "123 Test St, Test City",
        "email": "test@example.com",
        "phone": "555-0100",
        "plan": "Premium",
    }
    print("\n1. Adding customer...")
    print_result("Add Customer", send_post("add_customer", new_customer))

    # Step 2: read the customer back.
    print("\n2. Querying customer...")
    print_result("Query Customer", send_get("query_customer", {"id": customer_id}))

    # Step 3: change a single field.
    # Valid fields: id, name, area, address, email, phone, plan
    plan_change = {"id": customer_id, "field": "plan", "value": "Fiber 1 Gbps"}
    print("\n3. Updating customer...")
    print_result("Update Customer", send_post("update_customer", plan_change))

    # Step 4: delete the customer again.
    print("\n4. Removing customer...")
    print_result("Remove Customer", send_post("remove_customer", {"id": customer_id}))

    return True
101
+
102
+
103
def test_account_operations():
    """Create a customer, attach an account to it, then query the account.

    The reply to the preparatory ``add_customer`` call is discarded; the other
    replies are printed with ``print_result``. Replies are not checked and the
    function returns ``True`` unconditionally.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Test: Account Operations")
    print(banner)

    customer_id = "TEST_CUST_002"

    # Preparation: the account needs an owning customer.
    # Required fields: id, name, area, address, email, phone, plan
    owner = {
        "id": customer_id,
        "name": "Account Test Customer",
        "area": "Downtown",
        "address": "456 Account St, Test City",
        "email": "account@example.com",
        "phone": "555-0200",
        "plan": "Premium",
    }
    print("\n0. Creating customer for account test...")
    send_post("add_customer", owner)

    # Add account - ONLY valid fields: customer_id, account_name, password
    new_account = {
        "customer_id": customer_id,
        "account_name": "ACC_TEST_001",
        "password": "testpass123",
    }
    print("\n1. Adding account...")
    print_result("Add Account", send_post("add_account", new_account))

    # Query the account by its customer id.
    print("\n2. Querying account...")
    print_result("Query Account", send_get("query_account", {"customer_id": customer_id}))

    return True
139
+
140
+
141
def test_bill_operations():
    """Add a bill for a customer id and then query it.

    Both replies are printed with ``print_result``. Replies are not checked
    and the function returns ``True`` unconditionally.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Test: Bill Operations")
    print(banner)

    # NOTE(review): unlike the account test, no customer with this id is
    # created here first - confirm the server accepts that.
    customer_id = "TEST_CUST_003"

    # Add bill - ONLY valid fields: customer_id, amount, credits
    new_bill = {
        "customer_id": customer_id,
        "amount": "99.99",
        "credits": "0.00",
    }
    print("\n1. Adding bill...")
    print_result("Add Bill", send_post("add_bill", new_bill))

    # Query the bill by customer id.
    print("\n2. Querying bill...")
    print_result("Query Bill", send_get("query_bill", {"customer_id": customer_id}))

    return True
164
+
165
+
166
def test_ticket_operations():
    """Exercise the ticket endpoints: add a ticket, query it, update its status.

    Each response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Ticket Operations")
    print(rule)

    reporter_id = "TEST_CUST_004"
    tkt = "TKT_TEST_001"

    # Add ticket - ONLY valid fields: id, customer_id, issue, created_at_iso, status
    new_ticket = {
        "id": tkt,
        "customer_id": reporter_id,
        "issue": "Testing ticket creation",
        "created_at_iso": "2024-01-15T10:00:00",
        "status": "open",
    }
    print("\n1. Adding ticket...")
    print_result("Add Ticket", send_post("add_ticket", new_ticket))

    # Read the ticket back by its id.
    print("\n2. Querying ticket...")
    print_result("Query Ticket", send_get("query_ticket", {"id": tkt}))

    # Change a single field (status) on the ticket.
    status_change = {
        "id": tkt,
        "field": "status",
        "value": "in_progress",
    }
    print("\n3. Updating ticket...")
    print_result("Update Ticket", send_post("update_ticket", status_change))

    return True
201
+
202
+
203
def test_sms_operations():
    """Exercise the SMS endpoints: add an SMS record, then query it by id.

    Each response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: SMS Operations")
    print(rule)

    message_id = "SMS_TEST_001"

    # Add SMS - ONLY valid fields: id, customer_id, body, sent_at_iso
    new_sms = {
        "id": message_id,
        "customer_id": "TEST_CUST_001",
        "body": "Test SMS message",
        "sent_at_iso": "2024-01-15T14:30:00",
    }
    print("\n1. Adding SMS...")
    print_result("Add SMS", send_post("add_sms", new_sms))

    # Read the SMS back by its id.
    print("\n2. Querying SMS...")
    print_result("Query SMS", send_get("query_sms", {"id": message_id}))

    return True
227
+
228
+
229
def test_email_operations():
    """Exercise the email endpoints: add an email record, then query it by id.

    Each response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Email Operations")
    print(rule)

    message_id = "EMAIL_TEST_001"

    # Add email - ONLY valid fields: id, customer_id, subject, body, sent_at_iso
    new_email = {
        "id": message_id,
        "customer_id": "TEST_CUST_001",
        "subject": "Test Email",
        "body": "This is a test email message.",
        "sent_at_iso": "2024-01-15T14:30:00",
    }
    print("\n1. Adding email...")
    print_result("Add Email", send_post("add_email", new_email))

    # Read the email back by its id.
    print("\n2. Querying email...")
    print_result("Query Email", send_get("query_email", {"id": message_id}))

    return True
254
+
255
+
256
def test_order_operations():
    """Exercise the order endpoints: add an order, then query it by order id.

    Each response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Order Operations")
    print(rule)

    oid = "ORD_TEST_001"
    buyer_id = "TEST_CUST_005"

    # Every value is sent as a string, including prices and boolean flags.
    new_order = {
        "order_id": oid,
        "customer_id": buyer_id,
        "item": "Premium Plan",
        "price": "99.99",
        "purchase_date_iso": "2024-01-15T00:00:00",
        "channel": "online",
        "requires_ra": "false",
        "is_prepaid_or_gift": "false",
        "accessories_present": "false",
        "original_packaging": "true",
        "damaged_or_altered": "false",
        "shipping_and_handling": "5.00",
    }
    print("\n1. Adding order...")
    print_result("Add Order", send_post("add_order", new_order))

    # Read the order back by its order id.
    print("\n2. Querying order...")
    print_result("Query Order", send_get("query_order", {"order_id": oid}))

    return True
289
+
290
+
291
def test_calendar_event_operations():
    """Exercise the calendar endpoints: add an event, then query it by id.

    Each response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Calendar Event Operations")
    print(rule)

    evt = "EVT_TEST_001"
    attendee_id = "TEST_CUST_006"

    # One-hour event described by ISO-formatted start and end timestamps.
    new_event = {
        "id": evt,
        "customer_id": attendee_id,
        "title": "Test Meeting",
        "description": "Test calendar event",
        "start_time_iso": "2024-02-01T10:00:00",
        "end_time_iso": "2024-02-01T11:00:00",
    }
    print("\n1. Adding calendar event...")
    print_result("Add Calendar Event", send_post("add_calendar_event", new_event))

    # Read the event back by its id.
    print("\n2. Querying calendar event...")
    print_result(
        "Query Calendar Event",
        send_get("query_calendar_event", {"id": evt}),
    )

    return True
318
+
319
+
320
def test_outage_operations():
    """Exercise the outage endpoints: add an outage, then query it by area.

    Each response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Outage Operations")
    print(rule)

    affected_area = "TEST_AREA_001"

    # Outages are keyed by area rather than by a separate id.
    new_outage = {
        "area": affected_area,
        "description": "Test outage",
        "eta_in_hours": "2",
        "eta_iso": "2024-01-15T12:00:00",
    }
    print("\n1. Adding outage...")
    print_result("Add Outage", send_post("add_outage", new_outage))

    # Read the outage back for the same area.
    print("\n2. Querying outage...")
    print_result("Query Outage", send_get("query_outage", {"area": affected_area}))

    return True
344
+
345
+
346
def test_utility_operations():
    """Exercise the utility endpoint by posting an empty ``reload_all`` request.

    The response is handed to ``print_result``; nothing is asserted.

    Returns:
        Always True.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Utility Operations")
    print(rule)

    # reload_all takes no fields, so the payload is an empty dict.
    print("\n1. Reloading all tables...")
    print_result("Reload All Tables", send_post("reload_all", {}))

    return True
358
+
359
+
360
def test_server_health():
    """Probe the server with ``send_get("")`` and report whether it succeeded.

    Returns:
        True when the request and the ``print_result`` call complete without
        raising; False when either raises an ``Exception`` (the error is
        printed).
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Test: Server Health Check")
    print(rule)

    reachable = True
    try:
        # Both calls stay inside the try so any failure counts as "not accessible".
        print_result("Server Health", send_get(""))
    except Exception as err:
        print(f"✗ Server not accessible: {err}")
        reachable = False
    return reachable
373
+
374
+
375
def run_all_tests():
    """Run the enabled test cases and print a pass/fail summary.

    The server health check runs first; when it fails, no other test is run.
    A test counts as failed when it returns a falsy value or raises an
    ``Exception`` (the error is printed and the run continues).

    Returns:
        bool: True when the health check and every enabled test passed,
        False otherwise.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Telecom Server Test Suite")
    print(rule)
    print(f"Server URL: {BASE_URL}")
    print(rule)

    # Test server health first; nothing else can work without the server.
    if not test_server_health():
        print("\n✗ Server is not accessible. Please make sure the telecom server is running.")
        print(f"Expected server at: {BASE_URL}")
        return False

    # NOTE(review): only the customer test is enabled; the remaining entries
    # are commented out here -- confirm whether that is intentional.
    tests = [
        ("Customer Operations", test_customer_operations),
        # ("Account Operations", test_account_operations),
        # ("Bill Operations", test_bill_operations),
        # ("Ticket Operations", test_ticket_operations),
        # ("SMS Operations", test_sms_operations),
        # ("Email Operations", test_email_operations),
        # ("Order Operations", test_order_operations),
        # ("Calendar Event Operations", test_calendar_event_operations),
        # ("Outage Operations", test_outage_operations),
        # ("Utility Operations", test_utility_operations),
    ]

    results = []
    for test_name, test_func in tests:
        try:
            passed = bool(test_func())
        except Exception as e:
            # Top-level boundary: report the failure and keep running the rest.
            print(f"\n✗ {test_name} failed with error: {e}")
            passed = False
        results.append((test_name, passed))

    # Print summary
    print("\n" + rule)
    print("Test Summary")
    print(rule)
    for test_name, passed in results:
        status = "✓ PASSED" if passed else "✗ FAILED"
        print(f"{status}: {test_name}")
    print(rule)

    return all(passed for _, passed in results)


if __name__ == "__main__":
    import sys

    # Exit non-zero on failure so shells and CI jobs can detect it.
    sys.exit(0 if run_all_tests() else 1)