mia-code 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (410) hide show
  1. package/.miette/260321.md +1 -0
  2. package/.miette/260323.md +9 -0
  3. package/.miette/260331.md +2 -0
  4. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/2604020008--d3417f2c-df12-4f0f-8a1b-d88e7968f822/d3417f2c-df12-4f0f-8a1b-d88e7968f822.md +63 -0
  5. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/2604020008--e6c3fc5d-4a70-4523-ba7d-a3250da4c235/e6c3fc5d-4a70-4523-ba7d-a3250da4c235.md +72 -0
  6. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/2604020008--efeb00a2-b17a-4d32-b1f0-b90c37a8d24e/efeb00a2-b17a-4d32-b1f0-b90c37a8d24e.md +62 -0
  7. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/83a2d7f9-24a5-4cf4-98d5-036c82f872e8.json +302 -0
  8. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/83a2d7f9-24a5-4cf4-98d5-036c82f872e8.md +149 -0
  9. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/AGENTS.md +31 -0
  10. package/.pde/2604011511--83a2d7f9-24a5-4cf4-98d5-036c82f872e8/meta-decomposition-3-children.md +67 -0
  11. package/.pde/2604040129--61f9dd4d-7aa6-45e6-a58b-e480b1aa6737/61f9dd4d-7aa6-45e6-a58b-e480b1aa6737--from-mia-openclaw-workspace.md +125 -0
  12. package/.pde/2604040129--61f9dd4d-7aa6-45e6-a58b-e480b1aa6737/STATUS.md +1 -0
  13. package/.pde/4f02ba94-9f52-422e-9389-b16f9b37f358.json +177 -0
  14. package/.pde/4f02ba94-9f52-422e-9389-b16f9b37f358.md +77 -0
  15. package/.pde/6ad9244d-5340-490f-b76c-c86728b9de52.json +222 -0
  16. package/.pde/6ad9244d-5340-490f-b76c-c86728b9de52.md +99 -0
  17. package/.pde/8b566792-ed15-4606-96f9-2b6f593d7e6b.json +111 -0
  18. package/.pde/8b566792-ed15-4606-96f9-2b6f593d7e6b.md +67 -0
  19. package/.pde/c7f1e74b-05a5-40e2-9f01-4cc48d2528f7.json +349 -0
  20. package/.pde/c7f1e74b-05a5-40e2-9f01-4cc48d2528f7.md +147 -0
  21. package/.pde/dfc00a78-1da0-4c09-8a16-c6982644051b.json +118 -0
  22. package/.pde/dfc00a78-1da0-4c09-8a16-c6982644051b.md +64 -0
  23. package/GUILLAUME.md +8 -0
  24. package/KINSHIP.md +9 -0
  25. package/MIA_CODE_ARCHITECTURE_REPORT.md +718 -0
  26. package/contextual_research/260119-MIA-CODE--98090899-8aff-4e11-9dc3-8b99466d1.md +1101 -0
  27. package/contextual_research/MIA.md +38 -0
  28. package/contextual_research/MIAWAPASCONE.md +59 -0
  29. package/contextual_research/MIETTE.md +38 -0
  30. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/2504.00218v2.pdf +7483 -12
  31. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/2505.00212v3.pdf +0 -0
  32. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/CONTENT.md +1014 -0
  33. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/DESIGN.gemini.md +242 -0
  34. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/INDEX.md +45 -0
  35. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/2504.00218v2.md +2025 -0
  36. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/2504.00218v2.pdf +7483 -12
  37. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/2505.00212v3.md +1755 -0
  38. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/2505.00212v3.pdf +0 -0
  39. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_12_decomposed_prompting.pdf +0 -0
  40. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_19_hugginggpt_planning.pdf +0 -0
  41. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_1_coordination_challenges.md +766 -0
  42. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_1_coordination_challenges.pdf +3431 -4
  43. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_28_guardrails_multi_agent.md +260 -0
  44. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_28_guardrails_multi_agent.pdf +0 -0
  45. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_2_navigating_complexity.md +558 -0
  46. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_2_navigating_complexity.pdf +0 -0
  47. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_34_hierarchical_multi_agent.pdf +0 -0
  48. package/contextual_research/PDE-generalization--caefee82-efb1-4dbb-8733-691b01581464--260130/sources/footnote_1_5_open_intent_extraction.pdf +0 -0
  49. package/contextual_research/PODCAST.md +109 -0
  50. package/contextual_research/langchain-principles-roadmap.md +157 -0
  51. package/contextual_research/persona-to-narrative-character-inquiry_260201.md +50 -0
  52. package/dist/cli.js +35 -11
  53. package/dist/geminiHeadless.js +8 -2
  54. package/dist/index.js +2 -1
  55. package/dist/mcp/miaco-server.js +10 -1
  56. package/dist/mcp/miatel-server.js +10 -1
  57. package/dist/mcp/miawa-server.js +10 -1
  58. package/dist/mcp/utils.d.ts +6 -1
  59. package/dist/mcp/utils.js +24 -3
  60. package/dist/sessionStore.d.ts +8 -2
  61. package/dist/sessionStore.js +39 -3
  62. package/dist/types.d.ts +1 -0
  63. package/miaco/README.md +124 -0
  64. package/miaco/dist/commands/chart.d.ts +6 -0
  65. package/miaco/dist/commands/chart.d.ts.map +1 -0
  66. package/miaco/dist/commands/chart.js +222 -0
  67. package/miaco/dist/commands/chart.js.map +1 -0
  68. package/miaco/dist/commands/decompose.d.ts +6 -0
  69. package/miaco/dist/commands/decompose.d.ts.map +1 -0
  70. package/miaco/dist/commands/decompose.js +98 -0
  71. package/miaco/dist/commands/decompose.js.map +1 -0
  72. package/miaco/dist/commands/schema.d.ts +6 -0
  73. package/miaco/dist/commands/schema.d.ts.map +1 -0
  74. package/miaco/dist/commands/schema.js +66 -0
  75. package/miaco/dist/commands/schema.js.map +1 -0
  76. package/miaco/dist/commands/stc.d.ts +11 -0
  77. package/miaco/dist/commands/stc.d.ts.map +1 -0
  78. package/miaco/dist/commands/stc.js +590 -0
  79. package/miaco/dist/commands/stc.js.map +1 -0
  80. package/miaco/dist/commands/trace.d.ts +6 -0
  81. package/miaco/dist/commands/trace.d.ts.map +1 -0
  82. package/miaco/dist/commands/trace.js +83 -0
  83. package/miaco/dist/commands/trace.js.map +1 -0
  84. package/miaco/dist/commands/validate.d.ts +6 -0
  85. package/miaco/dist/commands/validate.d.ts.map +1 -0
  86. package/miaco/dist/commands/validate.js +58 -0
  87. package/miaco/dist/commands/validate.js.map +1 -0
  88. package/miaco/dist/decompose.d.ts +93 -0
  89. package/miaco/dist/decompose.d.ts.map +1 -0
  90. package/miaco/dist/decompose.js +562 -0
  91. package/miaco/dist/decompose.js.map +1 -0
  92. package/miaco/dist/index.d.ts +18 -0
  93. package/miaco/dist/index.d.ts.map +1 -0
  94. package/miaco/dist/index.js +83 -0
  95. package/miaco/dist/index.js.map +1 -0
  96. package/miaco/dist/storage.d.ts +60 -0
  97. package/miaco/dist/storage.d.ts.map +1 -0
  98. package/miaco/dist/storage.js +100 -0
  99. package/miaco/dist/storage.js.map +1 -0
  100. package/miaco/package-lock.json +4103 -0
  101. package/miaco/package.json +40 -0
  102. package/miaco/tsconfig.json +18 -0
  103. package/miaco/version-patch-commit-and-publish.sh +1 -0
  104. package/miatel/MISSION_251231.md +3 -0
  105. package/miatel/README.md +107 -0
  106. package/miatel/dist/commands/analyze.d.ts +6 -0
  107. package/miatel/dist/commands/analyze.d.ts.map +1 -0
  108. package/miatel/dist/commands/analyze.js +100 -0
  109. package/miatel/dist/commands/analyze.js.map +1 -0
  110. package/miatel/dist/commands/arc.d.ts +6 -0
  111. package/miatel/dist/commands/arc.d.ts.map +1 -0
  112. package/miatel/dist/commands/arc.js +71 -0
  113. package/miatel/dist/commands/arc.js.map +1 -0
  114. package/miatel/dist/commands/beat.d.ts +6 -0
  115. package/miatel/dist/commands/beat.d.ts.map +1 -0
  116. package/miatel/dist/commands/beat.js +165 -0
  117. package/miatel/dist/commands/beat.js.map +1 -0
  118. package/miatel/dist/commands/theme.d.ts +6 -0
  119. package/miatel/dist/commands/theme.d.ts.map +1 -0
  120. package/miatel/dist/commands/theme.js +54 -0
  121. package/miatel/dist/commands/theme.js.map +1 -0
  122. package/miatel/dist/index.d.ts +18 -0
  123. package/miatel/dist/index.d.ts.map +1 -0
  124. package/miatel/dist/index.js +80 -0
  125. package/miatel/dist/index.js.map +1 -0
  126. package/miatel/dist/storage.d.ts +55 -0
  127. package/miatel/dist/storage.d.ts.map +1 -0
  128. package/miatel/dist/storage.js +100 -0
  129. package/miatel/dist/storage.js.map +1 -0
  130. package/miatel/package-lock.json +4103 -0
  131. package/miatel/package.json +35 -0
  132. package/miatel/src/commands/analyze.ts +109 -0
  133. package/miatel/src/commands/arc.ts +78 -0
  134. package/miatel/src/commands/beat.ts +176 -0
  135. package/miatel/src/commands/theme.ts +60 -0
  136. package/miatel/src/index.ts +94 -0
  137. package/miatel/src/storage.ts +156 -0
  138. package/miatel/tsconfig.json +18 -0
  139. package/miawa/MISSION_251231.md +144 -0
  140. package/miawa/README.md +133 -0
  141. package/miawa/dist/commands/beat.d.ts +6 -0
  142. package/miawa/dist/commands/beat.d.ts.map +1 -0
  143. package/miawa/dist/commands/beat.js +69 -0
  144. package/miawa/dist/commands/beat.js.map +1 -0
  145. package/miawa/dist/commands/ceremony.d.ts +6 -0
  146. package/miawa/dist/commands/ceremony.d.ts.map +1 -0
  147. package/miawa/dist/commands/ceremony.js +239 -0
  148. package/miawa/dist/commands/ceremony.js.map +1 -0
  149. package/miawa/dist/commands/circle.d.ts +6 -0
  150. package/miawa/dist/commands/circle.d.ts.map +1 -0
  151. package/miawa/dist/commands/circle.js +75 -0
  152. package/miawa/dist/commands/circle.js.map +1 -0
  153. package/miawa/dist/commands/eva.d.ts +6 -0
  154. package/miawa/dist/commands/eva.d.ts.map +1 -0
  155. package/miawa/dist/commands/eva.js +73 -0
  156. package/miawa/dist/commands/eva.js.map +1 -0
  157. package/miawa/dist/commands/wound.d.ts +6 -0
  158. package/miawa/dist/commands/wound.d.ts.map +1 -0
  159. package/miawa/dist/commands/wound.js +74 -0
  160. package/miawa/dist/commands/wound.js.map +1 -0
  161. package/miawa/dist/index.d.ts +19 -0
  162. package/miawa/dist/index.d.ts.map +1 -0
  163. package/miawa/dist/index.js +91 -0
  164. package/miawa/dist/index.js.map +1 -0
  165. package/miawa/dist/storage.d.ts +73 -0
  166. package/miawa/dist/storage.d.ts.map +1 -0
  167. package/miawa/dist/storage.js +100 -0
  168. package/miawa/dist/storage.js.map +1 -0
  169. package/miawa/package-lock.json +4103 -0
  170. package/miawa/package.json +36 -0
  171. package/miawa/src/commands/beat.ts +74 -0
  172. package/miawa/src/commands/ceremony.ts +256 -0
  173. package/miawa/src/commands/circle.ts +83 -0
  174. package/miawa/src/commands/eva.ts +84 -0
  175. package/miawa/src/commands/wound.ts +79 -0
  176. package/miawa/src/index.ts +108 -0
  177. package/miawa/src/storage.ts +179 -0
  178. package/miawa/tsconfig.json +18 -0
  179. package/package.json +7 -5
  180. package/references/acp/CLAUDE.md +7 -0
  181. package/references/acp/agent-plan.md +84 -0
  182. package/references/acp/clients.md +31 -0
  183. package/references/acp/extensibility.md +137 -0
  184. package/references/acp/initialization.md +225 -0
  185. package/references/acp/prompt-turn.md +321 -0
  186. package/references/acp/proxy-chains.md +562 -0
  187. package/references/acp/schema.md +3171 -0
  188. package/references/acp/session-list.md +334 -0
  189. package/references/acp/session-modes.md +170 -0
  190. package/references/acp/slash-commands.md +99 -0
  191. package/references/acp/terminals.md +281 -0
  192. package/references/acp/tool-calls.md +311 -0
  193. package/references/acp/typescript.md +29 -0
  194. package/references/claude/agent-teams.md +399 -0
  195. package/references/claude/chrome.md +231 -0
  196. package/references/claude/headless.md +158 -0
  197. package/references/claude/hooks-guide.md +708 -0
  198. package/references/claude/output-styles.md +112 -0
  199. package/references/claude/plugins.md +432 -0
  200. package/references/claude/skills.md +693 -0
  201. package/references/claude/sub-agents.md +816 -0
  202. package/references/copilot/acp/agents.md +32 -0
  203. package/references/copilot/acp/architecture.md +37 -0
  204. package/references/copilot/acp/clients.md +31 -0
  205. package/references/copilot/acp/introduction.md +42 -0
  206. package/references/copilot/acp/registry.md +339 -0
  207. package/references/copilot/acp-server.md +117 -0
  208. package/references/copilot/create-copilot-instructions.md +840 -0
  209. package/references/langchain/llms.txt +833 -0
  210. package/references/langchain/python/agents.md +677 -0
  211. package/references/langchain/python/context-engineering.md +1195 -0
  212. package/references/langchain/python/human-in-the-loop.md +326 -0
  213. package/references/langchain/python/long-term-memory.md +168 -0
  214. package/references/langchain/python/mcp.md +949 -0
  215. package/references/langchain/python/multi-agents/custom-workflow.md +187 -0
  216. package/references/langchain/python/multi-agents/handoffs.md +436 -0
  217. package/references/langchain/python/multi-agents/overview.md +295 -0
  218. package/references/langchain/python/multi-agents/router.md +150 -0
  219. package/references/langchain/python/multi-agents/skills.md +92 -0
  220. package/references/langchain/python/multi-agents/subagents.md +486 -0
  221. package/references/langchain/python/retrieval.md +320 -0
  222. package/references/langchain/python/runtime.md +141 -0
  223. package/references/langchain/python/short-term-memory.md +658 -0
  224. package/references/langchain/python/structured-output.md +712 -0
  225. package/references/langfuse/llms.txt +148 -0
  226. package/references/langgraph/javascript/llms.txt +275 -0
  227. package/references/skills/home.md +259 -0
  228. package/references/skills/integrate-skills.md +103 -0
  229. package/references/skills/specification.md +254 -0
  230. package/references/skills/what-are-skills.md +74 -0
  231. package/rispecs/README.md +164 -0
  232. package/rispecs/_sync_/miadi-code/SPEC.md +313 -0
  233. package/rispecs/_sync_/miadi-code/STATUS.md +177 -0
  234. package/rispecs/_sync_/miadi-code/dashboard/SPEC.md +465 -0
  235. package/rispecs/_sync_/miadi-code/dashboard/STATUS.md +212 -0
  236. package/rispecs/_sync_/miadi-code/multiline-input/SPEC.md +232 -0
  237. package/rispecs/_sync_/miadi-code/multiline-input/STATUS.md +108 -0
  238. package/rispecs/_sync_/miadi-code/pde/SPEC.md +253 -0
  239. package/rispecs/_sync_/miadi-code/pde/STATUS.md +56 -0
  240. package/rispecs/_sync_/miadi-code/stc/SPEC.md +397 -0
  241. package/rispecs/_sync_/miadi-code/stc/STATUS.md +70 -0
  242. package/rispecs/ava-langstack/inquiry-routing-upgrade.spec.md +119 -0
  243. package/rispecs/borrowed_from_opencode/001-client-server-architecture.rispec.md +98 -0
  244. package/rispecs/borrowed_from_opencode/002-event-bus-system.rispec.md +125 -0
  245. package/rispecs/borrowed_from_opencode/003-instance-state-pattern.rispec.md +136 -0
  246. package/rispecs/borrowed_from_opencode/004-namespace-module-pattern.rispec.md +151 -0
  247. package/rispecs/borrowed_from_opencode/005-zod-schema-validation.rispec.md +139 -0
  248. package/rispecs/borrowed_from_opencode/006-named-error-system.rispec.md +155 -0
  249. package/rispecs/borrowed_from_opencode/007-structured-logging.rispec.md +138 -0
  250. package/rispecs/borrowed_from_opencode/008-lazy-initialization.rispec.md +127 -0
  251. package/rispecs/borrowed_from_opencode/009-multi-agent-system.rispec.md +97 -0
  252. package/rispecs/borrowed_from_opencode/010-agent-definition-config.rispec.md +135 -0
  253. package/rispecs/borrowed_from_opencode/011-agent-permission-rulesets.rispec.md +151 -0
  254. package/rispecs/borrowed_from_opencode/012-agent-prompt-templates.rispec.md +141 -0
  255. package/rispecs/borrowed_from_opencode/013-agent-generation.rispec.md +142 -0
  256. package/rispecs/borrowed_from_opencode/014-plan-build-mode-toggle.rispec.md +155 -0
  257. package/rispecs/borrowed_from_opencode/015-subagent-task-delegation.rispec.md +146 -0
  258. package/rispecs/borrowed_from_opencode/016-agent-model-selection.rispec.md +151 -0
  259. package/rispecs/borrowed_from_opencode/017-compaction-agent.rispec.md +150 -0
  260. package/rispecs/borrowed_from_opencode/018-session-persistence.rispec.md +125 -0
  261. package/rispecs/borrowed_from_opencode/019-session-compaction.rispec.md +132 -0
  262. package/rispecs/borrowed_from_opencode/020-session-forking.rispec.md +134 -0
  263. package/rispecs/borrowed_from_opencode/021-session-revert-snapshot.rispec.md +135 -0
  264. package/rispecs/borrowed_from_opencode/022-session-sharing.rispec.md +165 -0
  265. package/rispecs/borrowed_from_opencode/023-session-summary-diffs.rispec.md +165 -0
  266. package/rispecs/borrowed_from_opencode/024-child-sessions.rispec.md +164 -0
  267. package/rispecs/borrowed_from_opencode/025-session-title-generation.rispec.md +162 -0
  268. package/rispecs/borrowed_from_opencode/026-message-parts-model.rispec.md +201 -0
  269. package/rispecs/borrowed_from_opencode/027-streaming-message-deltas.rispec.md +212 -0
  270. package/rispecs/borrowed_from_opencode/028-multi-provider-architecture.rispec.md +184 -0
  271. package/rispecs/borrowed_from_opencode/029-provider-authentication.rispec.md +225 -0
  272. package/rispecs/borrowed_from_opencode/030-model-registry.rispec.md +222 -0
  273. package/rispecs/borrowed_from_opencode/031-cost-tracking.rispec.md +243 -0
  274. package/rispecs/borrowed_from_opencode/032-provider-transform-pipeline.rispec.md +282 -0
  275. package/rispecs/borrowed_from_opencode/033-provider-sdk-abstraction.rispec.md +338 -0
  276. package/rispecs/borrowed_from_opencode/034-tool-registry.rispec.md +110 -0
  277. package/rispecs/borrowed_from_opencode/035-tool-context-injection.rispec.md +155 -0
  278. package/rispecs/borrowed_from_opencode/036-tool-output-truncation.rispec.md +138 -0
  279. package/rispecs/borrowed_from_opencode/037-batch-tool.rispec.md +129 -0
  280. package/rispecs/borrowed_from_opencode/038-multi-edit-tool.rispec.md +167 -0
  281. package/rispecs/borrowed_from_opencode/039-apply-patch-tool.rispec.md +161 -0
  282. package/rispecs/borrowed_from_opencode/040-code-search-tool.rispec.md +143 -0
  283. package/rispecs/borrowed_from_opencode/041-web-fetch-tool.rispec.md +131 -0
  284. package/rispecs/borrowed_from_opencode/042-web-search-tool.rispec.md +159 -0
  285. package/rispecs/borrowed_from_opencode/043-todo-tool.rispec.md +156 -0
  286. package/rispecs/borrowed_from_opencode/044-plan-mode-tool.rispec.md +139 -0
  287. package/rispecs/borrowed_from_opencode/045-task-tool.rispec.md +146 -0
  288. package/rispecs/borrowed_from_opencode/046-question-tool.rispec.md +170 -0
  289. package/rispecs/borrowed_from_opencode/047-external-directory-tool.rispec.md +166 -0
  290. package/rispecs/borrowed_from_opencode/048-file-read-write-tools.rispec.md +205 -0
  291. package/rispecs/borrowed_from_opencode/049-lsp-server-management.rispec.md +104 -0
  292. package/rispecs/borrowed_from_opencode/050-lsp-hover-completion.rispec.md +102 -0
  293. package/rispecs/borrowed_from_opencode/051-lsp-diagnostics.rispec.md +86 -0
  294. package/rispecs/borrowed_from_opencode/052-lsp-root-detection.rispec.md +109 -0
  295. package/rispecs/borrowed_from_opencode/053-remote-mcp-servers.rispec.md +119 -0
  296. package/rispecs/borrowed_from_opencode/054-mcp-oauth-flow.rispec.md +107 -0
  297. package/rispecs/borrowed_from_opencode/055-mcp-tool-conversion.rispec.md +118 -0
  298. package/rispecs/borrowed_from_opencode/056-mcp-connection-monitoring.rispec.md +106 -0
  299. package/rispecs/borrowed_from_opencode/057-local-mcp-servers.rispec.md +116 -0
  300. package/rispecs/borrowed_from_opencode/058-rich-tui.rispec.md +108 -0
  301. package/rispecs/borrowed_from_opencode/059-streaming-display.rispec.md +116 -0
  302. package/rispecs/borrowed_from_opencode/060-permission-prompts.rispec.md +130 -0
  303. package/rispecs/borrowed_from_opencode/061-session-navigation.rispec.md +155 -0
  304. package/rispecs/borrowed_from_opencode/062-syntax-highlighting.rispec.md +151 -0
  305. package/rispecs/borrowed_from_opencode/063-keybinding-system.rispec.md +181 -0
  306. package/rispecs/borrowed_from_opencode/064-multi-level-config.rispec.md +155 -0
  307. package/rispecs/borrowed_from_opencode/065-jsonc-config.rispec.md +190 -0
  308. package/rispecs/borrowed_from_opencode/066-config-env-variables.rispec.md +153 -0
  309. package/rispecs/borrowed_from_opencode/067-config-deep-merging.rispec.md +178 -0
  310. package/rispecs/borrowed_from_opencode/068-remote-org-config.rispec.md +183 -0
  311. package/rispecs/borrowed_from_opencode/069-config-markdown-frontmatter.rispec.md +206 -0
  312. package/rispecs/borrowed_from_opencode/070-managed-config-directory.rispec.md +232 -0
  313. package/rispecs/borrowed_from_opencode/071-plugin-architecture.rispec.md +104 -0
  314. package/rispecs/borrowed_from_opencode/072-plugin-hooks.rispec.md +123 -0
  315. package/rispecs/borrowed_from_opencode/073-plugin-auto-install.rispec.md +115 -0
  316. package/rispecs/borrowed_from_opencode/074-permission-system.rispec.md +133 -0
  317. package/rispecs/borrowed_from_opencode/075-git-worktree-management.rispec.md +126 -0
  318. package/rispecs/borrowed_from_opencode/076-snapshot-system.rispec.md +124 -0
  319. package/rispecs/borrowed_from_opencode/077-snapshot-diff.rispec.md +117 -0
  320. package/rispecs/borrowed_from_opencode/078-snapshot-restore.rispec.md +128 -0
  321. package/rispecs/borrowed_from_opencode/079-worktree-branch-naming.rispec.md +122 -0
  322. package/rispecs/borrowed_from_opencode/080-sqlite-storage.rispec.md +134 -0
  323. package/rispecs/borrowed_from_opencode/081-database-migrations.rispec.md +148 -0
  324. package/rispecs/borrowed_from_opencode/082-database-transactions.rispec.md +138 -0
  325. package/rispecs/borrowed_from_opencode/083-deferred-effects.rispec.md +148 -0
  326. package/rispecs/borrowed_from_opencode/084-permission-rules.rispec.md +123 -0
  327. package/rispecs/borrowed_from_opencode/085-permission-glob-patterns.rispec.md +113 -0
  328. package/rispecs/borrowed_from_opencode/086-permission-merging.rispec.md +134 -0
  329. package/rispecs/borrowed_from_opencode/087-permission-modes.rispec.md +145 -0
  330. package/rispecs/borrowed_from_opencode/088-http-api-server.rispec.md +165 -0
  331. package/rispecs/borrowed_from_opencode/089-openapi-spec-generation.rispec.md +164 -0
  332. package/rispecs/borrowed_from_opencode/090-websocket-support.rispec.md +136 -0
  333. package/rispecs/borrowed_from_opencode/091-sse-streaming.rispec.md +168 -0
  334. package/rispecs/borrowed_from_opencode/092-mdns-discovery.rispec.md +145 -0
  335. package/rispecs/borrowed_from_opencode/093-javascript-sdk.rispec.md +200 -0
  336. package/rispecs/borrowed_from_opencode/094-skill-system.rispec.md +187 -0
  337. package/rispecs/borrowed_from_opencode/095-skill-discovery.rispec.md +182 -0
  338. package/rispecs/borrowed_from_opencode/096-desktop-remote-driving.rispec.md +175 -0
  339. package/rispecs/borrowed_from_opencode/INDEX.md +255 -0
  340. package/rispecs/core.rispecs.md +261 -0
  341. package/rispecs/engines.rispecs.md +241 -0
  342. package/rispecs/formatting.rispecs.md +252 -0
  343. package/rispecs/living-specifications.rispecs.md +361 -0
  344. package/rispecs/mcp.rispecs.md +197 -0
  345. package/rispecs/pde.rispecs.md +399 -0
  346. package/rispecs/pi-mono-envisionning/ENVISIONING.md +366 -0
  347. package/rispecs/pi-mono-envisionning/storytelling-horizon.rispecs.md +76 -0
  348. package/rispecs/pi-mono-envisionning/widget.rispecs.md +2 -0
  349. package/rispecs/relation-to-mcp-structural-thinking.kin.md +72 -0
  350. package/rispecs/research-for-better-framework/CLAUDE.md +7 -0
  351. package/rispecs/research-for-better-framework/survey-pi-openclaw-opencode-openhands.md +210 -0
  352. package/rispecs/session.rispecs.md +277 -0
  353. package/rispecs/stc.rispecs.md +138 -0
  354. package/rispecs/unifier.rispecs.md +317 -0
  355. package/scripts/LAUNCH--mcp-mia-code--testing--2603141315--ac705a66-2c15-4a1c-a26d-9491018c5ba8.sh +2 -0
  356. package/scripts/RESUME--mia-code--mcps--260313--ac705a66-2c15-4a1c-a26d-9491018c5ba8.sh +1 -0
  357. package/scripts/install-widget-in-home-pi-agent-extensions.sh +4 -0
  358. package/scripts/sample-decompose--2604011535-prompt.sh +1 -0
  359. package/skills/deep-search/AGENTS.md +17 -0
  360. package/skills/deep-search/SKILL.md +281 -0
  361. package/skills/deep-search/agent-templates.md +224 -0
  362. package/skills/deep-search/orchestration-patterns.md +95 -0
  363. package/skills/miaco-pde-inquiry-routing-deep-search/AGENTS.md +13 -0
  364. package/skills/miaco-pde-inquiry-routing-deep-search/SKILL.md +136 -0
  365. package/skills/miaco-pde-inquiry-routing-internal-external-relationship/AGENTS.md +4 -0
  366. package/skills/miaco-pde-inquiry-routing-internal-external-relationship/SKILL.md +157 -0
  367. package/skills/miaco-pde-inquiry-routing-local-qmd/AGENTS.md +42 -0
  368. package/skills/miaco-pde-inquiry-routing-local-qmd/SKILL.md +135 -0
  369. package/skills/qmd/AGENTS.md +3 -0
  370. package/skills/qmd/SKILL.md +144 -0
  371. package/skills/qmd/references/mcp-setup.md +102 -0
  372. package/skills/rise-pde-inquiry-session-multi-agents-v3/SKILL.md +234 -0
  373. package/skills/rise-pde-inquiry-session-multi-agents-v3/agent-templates.md +436 -0
  374. package/skills/rise-pde-inquiry-session-multi-agents-v3/orchestration-patterns.md +197 -0
  375. package/skills/rise-pde-inquiry-session-multi-agents-v3/references/ceremonial-technology.md +102 -0
  376. package/skills/rise-pde-inquiry-session-multi-agents-v3/references/creative-orientation.md +99 -0
  377. package/skills/rise-pde-inquiry-session-multi-agents-v3/references/prompt-decomposition.md +73 -0
  378. package/skills/rise-pde-inquiry-session-multi-agents-v3/references/rise-framework.md +74 -0
  379. package/skills/rise-pde-inquiry-session-multi-agents-v3/references/structural-tension.md +82 -0
  380. package/src/cli.ts +35 -11
  381. package/src/geminiHeadless.ts +7 -2
  382. package/src/index.ts +2 -1
  383. package/src/mcp/miaco-server.ts +13 -1
  384. package/src/mcp/miatel-server.ts +13 -1
  385. package/src/mcp/miawa-server.ts +13 -1
  386. package/src/mcp/utils.ts +41 -8
  387. package/src/sessionStore.ts +44 -4
  388. package/src/types.ts +2 -1
  389. package/widget/mia-ceremony/README.md +36 -0
  390. package/widget/mia-ceremony/index.ts +143 -0
  391. package/widget/mia-interceptor/README.md +39 -0
  392. package/widget/mia-interceptor/index.ts +221 -0
  393. package/widget/mia-tools/README.md +37 -0
  394. package/widget/mia-tools/index.ts +569 -0
  395. package/widget/miette-echo/README.md +44 -0
  396. package/widget/miette-echo/index.ts +164 -0
  397. package/.claude/settings.local.json +0 -9
  398. package/.hch/issue_.env +0 -4
  399. package/.hch/issue_add__2601211715.json +0 -77
  400. package/.hch/issue_add__2601211715.md +0 -4
  401. package/.hch/issue_add__2602242020.json +0 -78
  402. package/.hch/issue_add__2602242020.md +0 -7
  403. package/.hch/issues.json +0 -2312
  404. package/.hch/issues.md +0 -30
  405. package/WS__mia-code__260214__IAIP_PDE.code-workspace +0 -29
  406. package/WS__mia-code__src332__260122.code-workspace +0 -23
  407. package/samples/copilot/session-state/be76abaa-a27f-4725-b2a9-22fb45f7e0f7/checkpoints/index.md +0 -6
  408. package/samples/copilot/session-state/be76abaa-a27f-4725-b2a9-22fb45f7e0f7/events.jsonl +0 -213
  409. package/samples/copilot/session-state/be76abaa-a27f-4725-b2a9-22fb45f7e0f7/plan.md +0 -243
  410. package/samples/copilot/session-state/be76abaa-a27f-4725-b2a9-22fb45f7e0f7/workspace.yaml +0 -5
@@ -0,0 +1,1755 @@
1
+ Which Agent Causes Task Failures and When?
2
+ On Automated Failure Attribution of LLM Multi-Agent Systems
3
+
4
+ Shaokun Zhang * † 1 Ming Yin * 2 Jieyu Zhang 3 Jiale Liu 1 Zhiguang Han 4 Jingyang Zhang 2 Beibin Li 5
5
+ Chi Wang 6 Huazheng Wang 7 Yiran Chen 2 Qingyun Wu 1 8
6
+
7
+ arXiv:2505.00212v3 [cs.MA] 2 Jun 2025
46
+
47
+ Abstract
48
+
49
+ Failure attribution in LLM multi-agent sys-
50
+ tems—identifying the agent and step responsible
51
+ for task failures—provides crucial clues for sys-
52
+ tems debugging but remains underexplored and
53
+ labor-intensive. In this paper, we propose and for-
54
+ mulate a new research area: automated failure
55
+ attribution for LLM multi-agent systems. To sup-
56
+ port this initiative, we introduce the Who&When
57
+ dataset, comprising extensive failure logs from
58
+ 127 LLM multi-agent systems with fine-grained
59
+ annotations linking failures to specific agents and
60
+ decisive error steps. Using the Who&When, we
61
+ develop and evaluate three automated failure attri-
62
+ bution methods, summarizing their corresponding
63
+ pros and cons. The best method achieves 53.5%
64
+ accuracy in identifying failure-responsible agents
65
+ but only 14.2% in pinpointing failure steps, with
66
+ some methods performing below random. Even
67
+ SOTA reasoning models, such as OpenAI o1 and
68
+ DeepSeek R1, fail to achieve practical usability.
69
+ These results highlight the task’s complexity and
70
+ the need for further research in this area. Code
71
+ and dataset are available in the public repository.
72
+
73
+ 1. Introduction
74
+
75
+ In recent years,
76
+ the reframing of Large Language Mod-
77
+ els (LLMs) as agents and the building of agentic multi-agent sys-
78
+ tems—composed of interactive, LLM-powered agents col-
79
+ laborating to achieve shared goals—has garnered significant
80
+ attention (Hong et al., 2023; Li et al., 2023a; Wu et al., 2023).
81
+ These purposefully designed agentic systems have demon-
82
+
83
+ *Equal contribution 1Pennsylvania State University 2Duke
84
+ University 3University of Washington 4Nanyang Technologi-
85
+ cal University 5Meta 6Google DeepMind 7Oregon State Uni-
86
+ versity 8AG2AI, Inc.. Correspondence to: †Shaokun Zhang
87
+ <shaokun.zhang@psu.edu>.
88
+
89
+ Proceedings of the 42 nd International Conference on Machine
90
+ Learning, Vancouver, Canada. PMLR 267, 2025. Copyright 2025
91
+ by the author(s).
92
+
93
+ 1
94
+
95
+ Figure 1. When developing LLMs-powered multi-agent systems,
96
+ failure attribution—identifying system components responsible
97
+ for task failures based on evaluation results—has received limited
98
+ attention in existing research. This process is typically performed
99
+ manually, demanding substantial labor and specialized expertise.
100
+ In this study, we explore the potential for automating this process.
101
+
102
+ strated remarkable potential across various domains, includ-
103
+ ing coding (Hong et al., 2023), scientific discovery (Gha-
104
+ farollahi & Buehler, 2024), and addressing complex real-
105
+ world challenges (Fourney et al., 2024).
106
+
107
+ Once constructed,
108
+ these systems are typically refined
109
+ through an iterative process when they fail in specific scenar-
110
+ ios: evaluation against established benchmarks, followed by
111
+ manual failure attribution and system refinement. This cycle
112
+ repeats until the desired outcomes are achieved. Failure
113
+ attribution—identifying the components of the system that
114
+ directly lead to task failures—is a crucial step that serves as
115
+ the foundation for guiding improvements. Despite its im-
116
+ portance, this process is largely overlooked in mainstream
117
+ research, which typically leaves it as a manual task requir-
118
+ ing significant labor, such as analyzing complex historical
119
+ logs and navigating the technical intricacies of the system.
120
+ Moreover, mapping benchmark evaluation results to failure
121
+ components is heavily dependent on domain expertise, im-
122
+ posing additional requirements on practitioners. As systems
123
+ grow in complexity, this challenge becomes increasingly
124
+ difficult due to the growing number of components that must
125
+ be considered when conducting failure attribution.
126
+
127
+ Previous manual efforts involve a non-straightforward way
128
+
129
+ Title Suppressed Due to Excessive Size
130
+
131
+ to facilitate failure attribution in multi-agent systems: de-
132
+ veloping increasingly fine-grained benchmarks, with the
133
+ hope that more metrics will enable quicker failure attribu-
134
+ tion (Zhuge et al., 2024). For example, DevAI (Zhuge et al.,
135
+ 2024) introduces a coding benchmark that incorporates di-
136
+ verse delivery requirements, offering a more nuanced eval-
137
+ uation compared to the widely used SWE-Bench (Jimenez
138
+ et al., 2023), which focuses exclusively on final resolution
139
+ rates. Despite these advancements, the process of failure
140
+ attribution based on benchmark results remains a manual
141
+ process, merely providing more metrics as reference points
142
+ without fundamentally addressing the underlying challenges.
143
+ With increasingly comprehensive benchmarks, a fundamen-
144
+ tal question remains unanswered: which components of
145
+ the agentic system require improvement?
146
+
147
+ We argue that evaluation and failure attribution should be
148
+ tightly integrated, adhering to the principle that “evaluation
149
+ is not an end in itself, but a means to improvement.” (Scriven,
150
+ 1991) More research efforts should focus on bridging the
151
+ substantial gap between evaluation results and failure attribu-
152
+ tion, which currently relies heavily on manual labor. Draw-
153
+ ing inspiration from the LLM-as-a-judge paradigm (Gu
154
+ et al., 2024; Zheng et al., 2023), which leverages LLMs to
155
+ replace human effort in evaluation, we propose to bridge the
156
+ gap between evaluation and failure attribution by harnessing
157
+ the comprehensive judgment capabilities of LLMs. Specifi-
158
+ cally, we propose and formulate a new research problem: au-
159
+ tomated failure attributions in LLM multi-agent systems.
160
+ When failures occur under specific scenarios during evalu-
161
+ ation, the goal is to automatically identify the components
162
+ responsible for these failures without human intervention.
163
+ We believe this research could serve as a substitute for man-
164
+ ual failure attribution, enabling human resources to focus
165
+ on improving system functionality rather than performing
166
+ time-intensive diagnostics, as shown in Figure 1.
167
+
168
+ To advance the research in this area, we introduce the
169
+ Who&When benchmark, comprising extensive failure logs
170
+ annotated with fine-grained failure details for addressing
171
+ real-world tasks, where these logs are collected from 127
172
+ LLMs-powered multi-agent systems. This benchmark in-
173
+ cludes both algorithmically generated and hand-crafted
174
+ multi-agent systems, encompassing a wide range of realistic
175
+ scenarios. Each failure log is accompanied by meticulous
176
+ annotations, specifying the failure-responsible agent, the
177
+ corresponding failure step, and the reasons for failure in
178
+ plain language. The primary task involves pinpointing the
179
+ agent most accountable for the failure and the exact step
180
+ where the error occurred. Predicting the failure-responsible
181
+ agent serves as a fundamental requirement for failure attri-
182
+ bution, given that agents are the basic units of multi-agent
183
+ systems. Extending this to the specific failure step predic-
184
+ tion imposes a higher level of requirement, enabling more
185
+ fine-grained failure attribution, such as uncovering the spe-
186
+
187
+ cific reasons behind failures, which can further facilitate
188
+ targeted system refinements. We believe that Who&When
189
+ can serve as a foundational resource for driving progress in
190
+ automated failure attribution research.
191
+
192
+ Additionally, we construct and evaluate several automated
193
+ failure attribution approaches on the Who&When. Our find-
194
+ ings reveal the strengths and limitations of each method,
195
+ as well as their performance across different conditions, in-
196
+ cluding model variations, historical context lengths, and the
197
+ presence or absence of query labels. The results underscore
198
+ the complexity of using LLMs for failure analysis in multi-
199
+ agent systems. For example, the best-performing method
200
+ achieved only 8.77% accuracy in identifying decisive error
201
+ steps within the hand-crafted agentic system.
202
+
203
+ 2. Problem Formulation: Automated Failure
204
+ Attribution in Multi-Agent Cooperation
205
+
206
+ In this section, we introduce decisive errors and formu-
207
+ late the automated failure attribution problem. We adopt
208
+ the widely-adopted turn-based LLM multi-agent proto-
209
+ col (Hong et al., 2023; Li et al., 2023a; Wu et al., 2023).
210
+
211
+ Background. Consider an LLM-powered multi-agent
212
+ system M with a group of N agents, denoted as N =
213
+ {1, 2, . . . , N}, which operate at discrete time steps. These agents
214
+ are taking actions in a turn-based protocol, meaning that
215
+ exactly one agent performs an action at each time step.
216
+ Formally, the system is described as:
217
+
218
+ (cid:68)
219
+
220
+ M =
221
+
222
+ N , S, A, P, ϕ
223
+
224
+ (cid:69)
225
+ .
226
+
227
+ (1)
228
+
229
+ Here, S is the set of possible states. A is the global action
230
+ set; each agent i ∈ N can typically perform actions from
231
+ some subset Ai ⊆ A. ϕ(t) is a function that indicates which
232
+ agent is active at time t, thus specifying the turn-based rule.
233
+ P (st+1 | st, at, ϕ(t)) is the state-transition probability,
234
+ given that only one agent ϕ(t) acts at time t.
235
+
236
+ We employ ϕ(t) to denote the agent that takes an action
237
+ at at time step t. A full trajectory τ can be written as:
238
+ τ = (s0, a0, s1, a1, . . . , sT ), where T is a terminal time
240
+ step or when the system enters a terminating state.
241
+
242
+ Decisive Error and Objective. We use a tuple (i, t) to
243
+ denote a mistake in a trajectory, which means agent i is
244
+ active at time t, and its action at is deemed a mistake (e.g.,
245
+ wrong reasoning etc.). A trajectory may contain multiple
246
+ mistakes, but not all of them result in overall failure. We
247
+ employ Z(τ ) to denote the result of a trajectory τ .
248
+ Z(τ ) = 1, if the system ultimately fails; 0, otherwise.    (2)
265
+
266
+ 2
267
+
268
+ Title Suppressed Due to Excessive Size
269
+
270
+ Suppose the original trajectory τ is a failure, i.e., Z(τ ) = 1.
271
+ Considering the following scenario, if correcting the mistake
272
+ made by agent i at time t: we replace at with a “correct”
273
+ action ˜at. The steps prior to step t remain unchanged, while
274
+ the actions following t are adjusted accordingly to ensure
275
+ correctness. This process generates a modified trajectory:
276
+
277
+ τ (i,t) = I(i,t)(τ ),
278
+
279
+ (3)
280
+
281
+ where I(i,t) denotes the intervention. If in the modified
282
+ trajectory we obtain Z(τ (i,t)) = 0 (success), then the error
283
+ (i, t) is said to be a decisive error. Formally, we define the
284
+ decisive error indicator ∆i,t(τ ) as
285
+
286
+ ∆i,t(τ ) = 1, if Z(τ ) = 1 and Z(τ (i,t)) = 0; 0, otherwise.    (4)
300
+
301
+ In words, ∆i,t(τ ) = 1 ⇐⇒ Fixing agent i’s mistake
302
+ at time t changes Z(τ ) from 1 (fail) to 0 (success). For-
303
+ mally, the decisive error could be defined as agent-time
304
+ pairs (i∗, t∗) such that ∆i∗,t∗ (τ ) = 1, where i∗ represents
305
+ the agent responsible for the system failure, and t∗ repre-
306
+ sents the exact time step at which the critical mistake occurs.
307
+ We refer to these as the failure-responsible agent and the
308
+ decisive error step, respectively across the paper.
309
+
310
+ In practice, multiple decisive errors may occur within a tra-
311
+ jectory. In our study, we address this situation by identifying
312
+ the earliest error in time as the principal cause of failure.
313
+ Specifically, we define an objective to determine:
314
+
315
+ C(τ ) = {(i, t) | ∆i,t(τ ) = 1},  (i∗, t∗) = arg min(i,t)∈C(τ ) t.    (5)
323
+
324
+ which selects the pair (i∗, t∗) yielding the highest decisive
325
+ error indicator with earliest time step. In this study, the
326
+ research problem focuses on the automatic identification of
327
+ the (i∗, t∗) in LLMs-powered multiple agent systems.
328
+
329
+ 3. The Who&When Dataset
330
+
331
+ To advance research in this area, we introduce a dataset
332
+ called Who&When. This dataset comprises extensive fail-
333
+ ure logs from 127 LLM multi-agent systems including both
334
+ algorithm-generated and human-crafted systems. These logs
335
+ are carefully annotated with labels that identify the failure-
336
+ responsible agents and the decisive error steps in agent co-
337
+ operation directly responsible for problem-solving failures.
338
+ Additionally, each annotation is supplemented with natural
339
+ language explanations, culminating in 184 distinct failure
340
+ annotation tasks. The dataset is specifically designed to
341
+ detect the failure-responsible agents (who) and the corre-
342
+ sponding steps (when) within each failure log.
343
+
344
+ 3
345
+
346
+ Specifically, each instance in Who&When includes the fol-
347
+ lowing entry: (1) Query: A query from GAIA (Mialon et al.,
348
+ 2023) or AssistantBench (Yoran et al., 2024), describing a
349
+ real-world question. (2) Failure log: The full conversation
350
+ log of a specific system as it fails to solve the query. (3)
351
+ Agentic system information: For algorithm-generated sys-
352
+ tems, including system prompts, tools, and agent names, all
353
+ tailored to this specific query. (4) Annotations: An annota-
354
+ tion of the agent responsible for task failure, specifying the
355
+ step where the failure occurred, along with a plain-language
356
+ explanation of why the failure took place. An example of the
357
+ instance in this benchmark could be found in Appendix C.
358
+
359
+ To better reflect our definition of decisive error in Section 2,
360
+ we design three metrics to evaluate the performance of vari-
361
+ ous failure attribution methods: (1) Agent-Level Accuracy:
362
+ This metric measures the percentage of correctly predicted
363
+ failure-responsible agents by failure attribution algorithms.
364
+ (2) Step-Level Accuracy: This metric quantifies the per-
365
+ centage of correctly identified decisive error steps. It im-
366
+ poses higher requirements on the algorithms compared to
367
+ the first metric. (3) Step-Level Accuracy with Tolerance:
368
+ To account for slight deviations, this metric allows a toler-
369
+ ance range for mistake step predictions. If the predicted
370
+ step falls within the specified tolerance range of the actual
371
+ mistake step, the prediction is considered correct.
372
+
373
+ 3.1. Agentic Systems Constructions
374
+
375
+ Who&When includes two types of agentic systems:
376
+ algorithm-generated agentic systems and one meticulously
377
+ hand-crafted agentic system, totaling 127 agentic systems
378
+ equipped with diverse tools for evaluation.
379
+
380
+ Algorithm-Generated Agentic Systems. To ensure an
381
+ adequate number of agentic systems for the Who&When
382
+ datasets, we first employ the CaptainAgent algorithm (Song
383
+ et al., 2024) from the AG2 library 1 to automatically gen-
384
+ erate agentic systems for each data instance sourced from
385
+ the validation sets of the GAIA (Mialon et al., 2023) and
386
+ AssistantBench (Yoran et al., 2024) benchmarks. Specifi-
387
+ cally, it constructs a team of agents tailored to a given task,
388
+ assigning appropriate agent names, prompts, and necessary
389
+ tools. The system iteratively optimizes the agents’ con-
390
+ figuration until the task is successfully completed. In the
391
+ Who&When, we select only the final multi-agent config-
392
+ urations, along with the corresponding execution history,
393
+ as these represent the optimized solutions for each query.
394
+ All agents within the constructed systems, as well as the
395
+ CaptainAgent algorithm itself, are based on the GPT-4o
396
+ on 2024-08-01-preview version. Additionally, since
397
+ the primary objective of the Who&When is to capture mis-
398
+ takes made by agents that lead to failures in solving real-
399
+
400
+ 1https://github.com/ag2ai/ag2
401
+
402
+ Title Suppressed Due to Excessive Size
403
+
404
+ (a) Annotation labor cost.
405
+
406
+ (b) Uncertain annotation percentages.
407
+
408
+ (c) Disagreement rates in voting.
409
+
410
+ Figure 2. Statistical analysis of the annotation process: (1) Total labor cost for annotations in human hours. (2) The proportion of uncertain
411
+ annotations to total annotations during the second round. (3) Initial disagreement rates between annotators (note that we make sure
412
+ to reach a consensus through a careful discussion and voting process afterwards). These results highlight the challenges involved in
413
+ performing manual failure attribution.
414
+
415
+ world problems within agentic systems, we retain only those
416
+ agentic systems that fail to successfully address the queries
417
+ associated with each data instance from these benchmarks.
418
+
419
+ Hand-Crafted Agentic Systems.
420
+ In addition to algorithm-
421
+ generated systems, Who&When also includes a meticu-
422
+ lously hand-crafted, mature multi-agent system, Magentic-
423
+ One (Fourney et al., 2024), to ensure the representation of
424
+ realistic and highly refined agentic systems. Magentic-One
425
+ is a generalist agentic system designed to handle a broad
426
+ range of tasks. It comprises five carefully crafted agents,
427
+ each specializing in distinct capabilities, such as operat-
428
+ ing a web browser or navigating local files. We evaluate
429
+ Magentic-One using the validation set from the Assistant-
430
+ Bench (Yoran et al., 2024) benchmark, aggregating its fail-
431
+ ure logs for subsequent annotation. We also test Magentic-
432
+ One on a randomly sampled subset of 30 instances from the
433
+ GAIA (Mialon et al., 2023), incorporating the correspond-
434
+ ing execution failure logs into the dataset. We exclude the
435
+ rest of the GAIA dataset due to the complexity of annotating
436
+ the long context logs produced by Magentic-One.
437
+
438
+ reasoning behind the mistake in natural language. Addition-
439
+ ally, each expert is required to categorize their annotations
440
+ into two groups: those they are undoubtedly confident are
441
+ correct and those they have any uncertainty about. Round
442
+ II: In the second round, people are instructed to make an
443
+ agreement on all the uncertain annotations from Round I.
444
+ For these uncertain annotations, we engage in a collabo-
445
+ rative discussion to reach a consensus. We do not simply
446
+ follow the principle of majority rule; instead, we aim to
447
+ ensure that everyone is persuaded and that a consensus is
448
+ ultimately reached. Round III: In the final round, a cross-
449
+ validation procedure is employed. Each expert is asked to
450
+ go through another expert’s annotations to assess the con-
451
+ sistency of the annotation standards. If any discrepancies
452
+ or issues with the annotations are identified, the experts
453
+ engage in further discussion and, if necessary, re-annotate
454
+ the data according to the established guidelines until a con-
455
+ sensus is reached. Incorporating the viewpoints of multiple
456
+ annotators and ensuring consensus among them, we aim to
457
+ accurately reach the actual ground truth, as suggested by
458
+ previous studies (Clemen, 1989; Zhuge et al., 2024).
459
+
460
+ 3.2. Decisive Error Annotation
461
+
462
+ 3.3. Analysis
463
+
464
+ After obtaining the failure logs of various agentic systems,
465
+ we introduce an annotation procedure to identify the deci-
466
+ sive error failure and decisive error step. To ensure precise
467
+ annotation, we conduct multiple rounds of annotation per-
468
+ formed by three human experts in AI agent (whose identities
469
+ are anonymized as 0dmfogp3, 8n3d0wmg, and 204nd84n).
470
+
471
+ Round I: In the first round, we distribute the failure logs
472
+ from all agentic systems for each query equally among three
473
+ experts. To ensure consistency, we provide the experts with
474
+ a standardized annotation guideline as shown in Appendix F.
475
+ Each expert is tasked with annotating three elements: the
476
+ single erroneous agent primarily responsible for the task
477
+ failure, the specific step at which the error occurred, and the
478
+
479
+ Annotating the decisive error agent and identifying the spe-
480
+ cific step of the error is challenging for both non-expert
481
+ people and domain experts. The annotators must parse com-
482
+ plex logs, follow the problem-solving logic of each agent,
483
+ and assess whether each action is correct or if it misleads
484
+ the entire problem-solving process. For example, if an
485
+ agent uses a web browser to gather essential information for
486
+ problem-solving, annotators must check the browser history
487
+ and visit each website to determine whether the failure is
488
+ due to unavailable information on the website or because
489
+ the agent failed to retrieve it. As shown in Figure 2(a), three
490
+ annotators spent 30.9, 30.2, and 23.2 human hours, respec-
491
+ tively, to complete the annotations. This demonstrates that
492
+
493
+ 4
494
+
495
+ 204nd84n8n3d0wmg0dmfogp3051015202530Annotion Cost (hour)18.416.212.412.514.010.8Algorithm Generated SystemsHand Crafted Systems204nd84n8n3d0wmg0dmfogp301020304050Uncertainty Rate (%)16.7%23.8%16.7%26.3%21.1%21.1%204nd84n8n3d0wmg0dmfogp30dmfogp38n3d0wmg204nd84n33.350.0N/A16.7N/A9.1N/A10.018.2 Title Suppressed Due to Excessive Size
496
+
497
+ the annotation process is very time-consuming, leading us
498
+ to consider doing research on automated failure attribution.
499
+
500
+ Additionally, in many data instances, it’s not just one agent
501
+ that makes mistakes, but several agents. People need to iden-
502
+ tify these mistakes and select the most severe ones, which
503
+ can directly lead to problem-solving failures as formulated
504
+ in Section 2. Since the severity of mistakes may be subtle
505
+ and even subjective at times, the process becomes even more
506
+ difficult. As shown in Figure 2(b), we present the uncertain
507
+ annotation percentages for three individuals. The uncertain
508
+ percentages across different annotators range from 15% to
509
+ 30%. We also visualize the disagreement rates between
510
+ different individuals when voting on each other’s uncertain
511
+ data in Figure 2(c). We can see some disagreement remains
512
+ before discussing to make the agreement, further highlight-
513
+ ing the difficulties involved in the annotation process.
514
+
515
+ 4. Can LLMs help identify When and Which
516
+
517
+ agent causes task failures?
518
+
519
+ As revealed in Section 3.3, detecting the failure-responsible
520
+ agent and the corresponding failure step in agentic systems is
521
+ often subtle, requiring significant human effort. Given these
522
+ challenges, we explore performing automated fail-
523
+ ure attribution, using LLMs themselves to detect these errors
524
+ and provide signal for human to perform essential improve-
525
+ ment. In this section, we set up experiments to answer a
526
+ fundamental question: Can LLMs help identify when and
527
+ which agent causes task failures in multi-agent systems?
528
+
529
+ 4.1. LLMs for Failure Attribution in Agentic Systems
530
+
531
+ To answer the question mentioned above, we propose three
532
+ judgement methods for automated failure attribution in agen-
533
+ tic systems. Through extensive experiments, we demon-
534
+ strate that each method has distinct advantages and limita-
535
+ tions, and they can be applied either independently or in
536
+ combination. Furthermore, we analyze the performance
537
+ of these methods across various scenarios and constraints,
538
+ highlighting their applicability in different contexts.
539
+
540
+ (1) All-at-once: An LLM is provided with a query and the
541
+ complete failure log, and it is tasked with identifying the
542
+ failure-responsible agent as well as the specific step where
543
+ the decisive error occurred. (2) Step-by-step: An LLM is
544
+ provided with a query, and the failure log is presented step-
545
+ by-step. At each step, the LLM is tasked with determining
546
+ whether a mistake has occurred in the current step. If a
547
+ mistake is identified, the judging process terminates, and
548
+ the responsible agent’s name along with the current step
549
+ number are returned as the output. Otherwise, the process
550
+ continues until the final step is reached. (3) Binary search:
551
+ Alternatively, this method uses a receptive field approach
552
+ that lies between the previous two methods. Starting with
553
+
554
+ the query and the full failure log, the LLM is tasked with
555
+ determining whether the mistake occurred in the upper half
556
+ or lower half of the failure logs. Once this decision is
557
+ made, the LLM is provided with the selected half of the log
558
+ and the process is repeated iteratively until a single step is
559
+ identified. The three algorithms and corresponding prompts
560
+ are detailed in Appendix A and Appendix G.
561
+
562
+ 4.2. Settings
563
+
564
+ Scenario. We conduct experiments under two distinct
565
+ settings to simulate various realistic scenarios. (1) With
566
+ Ground Truth: In this setting, the final ground truth of
567
+ the query that the agentic system is attempting to resolve
568
+ is available to the LLMs. Our focus here is on the typical
569
+ AI system development cycle, where it is common practice
570
+ to use a development dataset with ground truth to identify
571
+ and debug potential errors in experimental systems. (2)
572
+ Without Ground Truth: In the second setting, the final
573
+ ground truth of the query is unavailable. In this scenario,
574
+ LLMs are employed to perform failure attribution in agentic
575
+ systems based on their running logs. This capability can
576
+ also be viewed as a form of self-reflection (Huang et al.,
577
+ 2023; Shinn et al., 2024), which contributes to the improve-
578
+ ment of multi-agent systems. Throughout this paper, unless
579
+ otherwise specified, all results are reported as the average
580
+ accuracy across these two scenarios.
581
+
582
+ Models. The primary experiments are conducted using
583
+ the GPT-4o model, unless otherwise specified. Addition-
584
+ ally, we also incorporate several other models, including
585
+ both open-source (such as the Llama and Qwen series)
586
+ and closed-source models (GPT series), to ensure the con-
587
+ sistency of the conclusions drawn from the experiments.
588
+ Additionally, we employ advanced reasoning models, i.e.,
589
+ OpenAI o1 and DeepSeek R1, to assess the performance of
590
+ reasoning models on failure attribution tasks. The results of
591
+ these evaluations are provided in Section 4.8.
592
+
593
+ 4.3. Overall Performance
594
+
595
+ We first perform experiments to compare the performance
596
+ of three failure attribution methods on Who&When dataset
597
+ with GPT-4o model. The results are reported on Table 1.
598
+
599
+ Agent-Level Accuracy Relies on Large Receptive Field.
600
+ As shown in Table 1, all-at-once significantly outperforms
601
+ the other two failure attribution methods in agent-level ac-
602
+ curacy. Specifically, its agent-level accuracy is 19.13% and
603
+ 20.69% higher than step-by-step when judging with ground
604
+ truth, and 25.1% and 20.69% higher when judging without
605
+ ground truth, respectively. The performance of the binary
606
+ search method falls between these two approaches.
607
+
608
+ These results can be attributed to the fact that predicting the
609
+
610
+ 5
611
+
612
+ Title Suppressed Due to Excessive Size
613
+
614
+ Agentic Systems Types Algorithm Generated Hand Crafted Algorithm Generated Hand Crafted
615
+ Random
616
+
617
+ With Ground Truth
618
+
619
+ Without Ground Truth
620
+
621
+ Agent-Level Accuracy
622
+ Step-Level Accuracy
623
+
624
+ Agent-Level Accuracy
625
+ Step-Level Accuracy
626
+
627
+ Agent-Level Accuracy
628
+ Step-Level Accuracy
629
+
630
+ Agent-Level Accuracy
631
+ Step-Level Accuracy
632
+
633
+ 29.10
634
+ 19.06
635
+
636
+ 54.33
637
+ 12.50
638
+
639
+ 35.20
640
+ 25.51
641
+
642
+ 44.13
643
+ 23.98
644
+
645
+ 12.00
646
+ 4.16
647
+
648
+ All-at-Once
649
+
650
+ 55.17
651
+ 5.26
652
+ Step-by-Step
653
+
654
+ 34.48
655
+ 7.02
656
+ Binary Search
657
+ 51.72
658
+ 6.90
659
+
660
+ 29.10
661
+ 19.06
662
+
663
+ 51.12
664
+ 13.53
665
+
666
+ 26.02
667
+ 15.31
668
+
669
+ 30.11
670
+ 16.59
671
+
672
+ 12.00
673
+ 4.16
674
+
675
+ 53.44
676
+ 3.51
677
+
678
+ 32.75
679
+ 8.77
680
+
681
+ 36.21
682
+ 6.90
683
+
684
+ Table 1. Performance of the three failure attribution methods on the Who&When dataset with and without labels, evaluated on the
685
+ GPT-4o model. For agent-level accuracy, all-at-once outperforms binary search, which in turn surpasses step-by-step. Conversely, for
686
+ step-level accuracy, step-by-step achieves the best performance, followed by binary search and then all-at-once.
687
+
688
+ Fine-Grained Predictions Yield Better Step-Level Ac-
689
+ curacy.
690
+ In terms of step-level accuracy, the all-at-once
691
+ approach performs obviously the worst, even with its aver-
692
+ age performance falling below that of random method. This
693
+ outcome underscores the method’s practical infeasibility.
694
+ By contrast, the step-by-step approach achieves the highest
695
+ performance, outperforming other methods in step-level ac-
696
+ curacy in 3 out of 4 cases. The performance of the binary
697
+ search method still falls between these two approaches.
698
+
699
+ These two results can be attributed to the ’needle-in-a-
700
+ haystack’ problem, where LLMs often struggle to retrieve
701
+ specific information from long contexts (Nelson et al., 2024).
702
+ The all-at-once method has access to the largest context
703
+ when making decisions about the decisive error step, but
704
+ this can lead to difficulties in pinpointing the exact failure
705
+ step within a long history.
706
+ In contrast, the step-by-step
707
+ method processes the context incrementally, allowing for
708
+ more focused decision-making. The binary search method
709
+ performs at a level between these two approaches.
710
+
711
+ Finding 2:
712
+ Incrementally processing context en-
713
+ ables better step-level failure attribution since LLMs
714
+ struggle to retrieve information from long contexts.
715
+
716
+ Impact of Ground Truth on Failure Attribution. We
717
+ also observed that failure attribution accuracy is higher for
718
+ all three methods when ground truth are available, compared
719
+ to when judgments are made without ground truth in all
720
+ cases in all metrics. Although the answers to users’ queries
721
+ may not serve as definitive ’golden labels’ for each agent’s
722
+ correct behavior, they provide a useful reference signal for
723
+ the judgment LLMs. For instance, if an agent leads the
724
+ system in a completely wrong direction, with no possibility
725
+ of reaching the correct final answer, the label information
726
+ can directly help alert the judgment LLMs to this error.
727
+
728
+ 6
729
+
730
+ Figure 3. Performance comparison of three failure attribution meth-
731
+ ods on different models in both metrics. We found the conclu-
732
+ sion is mostly consistent with Table 1.
733
+
734
+ failure-responsible agent requires the judge LLMs to con-
735
+ sider a broader context, including the behaviors of multiple
736
+ agents. Since all-at-once has access to the entire conver-
737
+ sation log when making the final judgment, its prediction
738
+ of the failure-responsible agent is more accurate. In con-
739
+ trast, the step-by-step method processes the conversation
740
+ history incrementally, while the final decision can be made
741
+ with incomplete information, thus resulting in lower perfor-
742
+ mance. Moreover, all failure attribution methods outperform
743
+ the random baseline, highlighting that these approaches are
744
+ nontrivial and affirming the necessity of involving LLMs
745
+ for failure attribution.
746
+
747
+ Finding 1: Providing broader failure log context
748
+ enables more accurate agent-level failure attribution
749
+ by incorporating more complete information.
750
+
751
+ GPT-4-turboGPT-4o-miniLlama-3.1-8bLlama-3.1-70bQwen-2.5-7bQwen-2.5-72b0204060Agent-Level Accuracy (%)GPT-4-turboGPT-4o-miniLlama-3.1-8bLlama-3.1-70bQwen-2.5-7bQwen-2.5-72b05101520Step-Level Accuracy (%)All-at-OnceStep-by-StepBinary Search Title Suppressed Due to Excessive Size
752
+
753
+ (a) Agent-Level Accuracy
754
+
755
+ (b) Step-Level Accuracy
756
+
757
+ (a) Algorithm-Generated
758
+
759
+ (b) Hand-Crafted
760
+
761
+ Figure 4. Comparison of three failure attribution methods applied
762
+ to all failure logs from the hand-crafted systems in the Who&When,
763
+ evaluated under varying failure log lengths across both metrics.
764
+
765
+ Without such intervention, the entire system might proceed
766
+ in the wrong direction without any external warning.
767
+
768
+ Consistency of Conclusions Across Various LLMs.
769
+ In
770
+ addition to the GPT-4o model, we conducted evaluations
771
+ on other LLMs, including open-source models (e.g., the
772
+ Llama series and Qwen series) as well as closed-source
773
+ models (e.g., the GPT series). Due to the significant compu-
774
+ tational and token costs, we only perform experiments on
775
+ hand-crafted agentic systems from Who&When which has
776
+ fewer failure logs. The results of three methods are shown in
777
+ Figure 3. We found that the phenomena observed in Table 1
778
+ hold consistently across different LLMs. Specifically, for
779
+ agent-level accuracy, the ranking is: all-at-once, followed
780
+ by binary search, and then step-by-step. Conversely, for
781
+ step-level accuracy, the ranking is: step-by-step, followed
782
+ by binary search, and then all-at-once.
783
+
784
+ Finding 3: The pros and cons of different failure
785
+ attribution methods in this study are mostly consis-
786
+ tent across different LLMs.
787
+
788
+ 4.4. Performance Across Varying Context Lengths
789
+
790
+ We investigate the relationship between the length of failure
791
+ logs and the corresponding failure attribution performance.
792
+ Specifically, the failure logs of hand-crafted agentic systems
793
+ from the Who&When dataset are divided into five levels,
794
+ with context length progressively increasing from Level 1
795
+ to Level 5. Specifically, Level 1 spans 5–17 steps, Level
796
+ 2 covers 19–29, Level 3 includes 31–49, Level 4 ranges
797
+ from 51–91, and Level 5 spans 93–130 steps. Both agent-
798
+ level and step-level judgment performances across the three
799
+ evaluation methods are presented in Figure 4. Algorithm-
800
+ generated systems are excluded from this analysis due to
801
+ their limited maximum step count of 10, which prevents
802
+ meaningful divisions of context length.
803
+
804
+ Our findings indicate that all three methods exhibit a decline
805
+ in both metrics as context length increases. Notably, step-
806
+ level accuracy is more sensitive to context length changes
807
+
808
+ 7
809
+
810
+ Figure 5. The distances between human-annotated decisive error
811
+ steps and the predicted steps for each data instance on failure logs
812
+ from both algorithm-generated and hand-crafted systems.
813
+
814
+ than agent-level accuracy. Furthermore, the step-by-step
815
+ performance decline is particularly pronounced compared
816
+ to the other two. We also analyze the distances between
817
+ human-annotated decisive error steps and the predicted steps
818
+ for each data instance, as shown in Figure 5. These results
819
+ demonstrate that the step-by-step method outperforms the
820
+ other two methods in accurately predicting the decisive error
821
+ steps. However, as context length reaches its maximum, all
822
+ three failure attribution methods converge to near 0%, as
823
+ shown in Figure 4.
824
+
825
+ Finding 4: Failure attribution performance de-
826
+ clines as context length increases, with step-level
827
+ accuracy being more sensitive.
828
+
829
+ 4.5. Step-Level Accuracy Under Different Tolerances
830
+
831
+ Toler. All-at-Once
832
+ ± 1
833
+ 12.07
834
+ ± 2
835
+ 19.83
836
+ ± 3
837
+ 30.17
838
+ ± 4
839
+ 37.07
840
+ ± 5
841
+ 43.10
842
+
843
+ Step-by-Step Binary Search
844
+ 14.66
845
+ 16.38
846
+ 18.10
847
+ 31.90
848
+ 33.62
849
+
850
+ 13.79
851
+ 18.97
852
+ 22.41
853
+ 31.89
854
+ 36.21
855
+
856
+ Table 2. Step-level accuracy with different tolerances on the failure
857
+ logs of hand-crafted agentic systems from Who&When dataset.
858
+
859
+ In practice, directly identifying the exact decisive error step
860
+ is not always necessary; it is often sufficient to determine a
861
+ range of steps where the mistake might occur. In this section,
862
+ we show the performance of the three failure attribution
863
+ methods under varying tolerance conditions on the failure
864
+ logs of hand-crafted agentic systems from the Who&When
865
+ dataset. Algorithm-generated systems are excluded from
866
+ this analysis because their maximum step count is limited
867
+ to 10, and increasing the tolerance would lead to artificially
868
+ inflated accuracy.
869
+
870
+ As shown in Table 2, our findings show that step-by-step
871
+ achieves the highest performance when the tolerance is set to
872
+ 0 or 1. However, as the tolerance increases, the advantages
873
+ of all-at-once become more pronounced, while the benefits
874
+
875
+ 12345Length Level020406080100Accuracy (%)All-at-OnceStep-by-StepBinary Search12345Length Level0.02.55.07.510.012.515.017.520.0Accuracy (%)All-at-OnceStep-by-StepBinary Search02468DistanceAll-at-OnceStep-by-StepBinary Search020406080100120Distance Title Suppressed Due to Excessive Size
876
+
877
+ positive, all three methods still yield meaningful insights
878
+ from a statistical perspective. In practice, these statistical re-
879
+ sults provide a more actionable basis for system refinement
880
+ compared to focusing solely on single data instances.
881
+
882
+ Finding 6: The three baseline methods are more
883
+ effective at performing failure attribution at a statis-
884
+ tical level than at an instance level.
885
+
886
+ 4.7. Can We Combine Multiple Failure Attribution
887
+
888
+ Methods?
889
+
890
+ Metrics
891
+
892
+ Cost
893
+ Token Num
894
+
895
+ Agent-Level
896
+ Accuracy
897
+
898
+ Step-Level
899
+ Accuracy
900
+
901
+ Binary Search
902
+ △ All-at-Once
903
+ □ Step-by-Step
904
+ Hybrid Method (□&△)
905
+
906
+ 34,659
907
+ 17,106
908
+ 87,720
909
+ 149,177
910
+
911
+ 43.97
912
+ 57.02
913
+ 35.96
914
+ 57.02
915
+
916
+ 6.90
917
+ 4.39
918
+ 7.90
919
+ 12.28
920
+
921
+ Table 3. Comparison of the three failure attribution methods with a
922
+ hybrid approach that combines all-at-once and step-by-step on the
923
+ failure logs of hand-crafted systems from the Who&When dataset.
924
+ The hybrid method achieves the highest performance in both
925
+ metrics but incurs the highest token costs.
926
+
927
+ We then investigate whether a hybrid method could leverage
928
+ the advantages of two different methods, all-at-once
929
+ and step-by-step. The former excels at failure-responsible
930
+ agent predictions, while the latter is better at accurately
931
+ predicting the decisive error step. Specifically, we start
932
+ by prompting all-at-once to predict the failure-responsible
933
+ agent and then use step-by-step to detect the mistake step in
934
+ the action steps taken by the identified failure-responsible
935
+ agent. To evaluate this, we perform experiments on the hand-
936
+ crafted systems from the Who&When dataset considering
937
+ the token cost. The results are shown on Table 3.
938
+
939
+ We observe that the hybrid method outperforms all methods
940
+ in step-level accuracy. This improvement is attributed to
941
+ the all-at-once narrowing the range of possible failure steps
942
+ by excluding action steps taken by other agents, thereby
943
+ significantly reducing the difficulty of prediction for step-
944
+ by-step. However, the hybrid method comes with a notable
945
+ drawback: it requires running two algorithms sequentially.
946
+ Compared to making judgments with a single algorithm,
947
+ this approach incurs higher computational costs.
948
+
949
+ Finding 7: Combining different failure attribution
950
+ methods allows leveraging their respective strengths
951
+ for better performance.
952
+
953
+ 4.8. Strong Reasoning Model for Automated Failure
954
+
955
+ Attributions
956
+
957
+ We finally examine whether reasoning models OpenAI o1
958
+ and DeepSeek R1 (DeepSeek-AI, 2025) can enhance the
959
+ automated failure attribution process. However, the original
960
+
961
+ Figure 6. Histogram of the actual and predicted failure-responsible
962
+ agents for all three methods. We present only the failure logs
963
+ of hand-crafted systems in Who&When to aggregate the largest
964
+ number of results for one multi-agent system. Number 0, 1, 2,
965
+ 3 represents Assistant, FileSurfer, Orchestrator and
966
+ WebSurfer respectively.
967
+
968
+ of step-by-step diminish. Compared to all-at-once, step-by-
969
+ step demonstrates better alignment with accurate predictions
970
+ when high precision is required.
971
+
972
+ Finding 5: Allowing tolerance in failure attribu-
973
+ tion enables broader context processing methods to
974
+ achieve competitive step-level accuracy.
975
+
976
+ 4.6. A Statistical Viewpoint on Failure Attribution
977
+
978
+ This study primarily performs experiments on single-data-
979
+ level failure attribution in LLM-powered multi-agent sys-
980
+ tems, i.e., identifying the specific component (referred to as
981
+ the failure-responsible agent) and the precise location (the
982
+ decisive error step) responsible for task failure in a single
983
+ data instance. This practice indeed mirrors human proce-
984
+ dures for failure attribution and could serve as a founda-
985
+ tional tool for deriving statistical-level conclusions. There-
986
+ fore, we think of whether these methods could be applied to
987
+ entire datasets to extract meaningful statistical results.
988
+
989
+ In Figure 6, we show the histogram of actual and the pre-
990
+ dicted failure-responsible agents for all three methods. We
991
+ only show the failure logs of hand-crafted systems from
992
+ Who&When to aggregate the largest number of results for
993
+ one system type. We observe that the single agent predicted
994
+ by all methods to make the most decisive errors is con-
995
+ sistent with the ground truth (agent 3). Moreover, the top
996
+ two failure-responsible agents predicted by three methods
997
+ are also consistent with the ground truth in most cases (2
998
+ out of 3). These experiments demonstrate that, although
999
+ the instance-level failure attribution results are not highly
1000
+
1001
+ 8
1002
+
1003
+ Title Suppressed Due to Excessive Size
1004
+
1005
+ GPT-4o
1006
+
1007
+ OpenAI o1
1008
+
1009
+ DeepSeek R1
1010
+
1011
+ Accuracy Agent-Level Step-Level Agent-Level Step-Level Agent-Level Step-Level
1012
+ All-at-Once
1013
+ 41.38
1014
+ 36.21
1015
+ Step-by-Step
1016
+
1017
+ 10.34
1018
+ 13.79
1019
+
1020
+ 56.90
1021
+ 32.76
1022
+
1023
+ 54.31
1024
+ 33.62
1025
+
1026
+ 3.45
1027
+ 6.90
1028
+
1029
+ 4.39
1030
+ 7.90
1031
+
1032
+ Table 4. The performance of the automated failure attribution methods with reasoning mechanism with strong reasoning models.
1033
+
1034
+ prompt used in our experiments was flagged by OpenAI’s
1035
+ policy as violating usage guidelines. Therefore, we imple-
1036
+ mented minor modifications to the prompt while preserving
1037
+ its original intent. For DeepSeek R1, we employed the same
1038
+ prompt as used in other experiments to ensure consistency.
1039
+ The results are shown in Table 4. We don’t include binary
1040
+ search because it doesn’t include reasoning mechanisms
1041
+ in its prompt. We perform experiments on hand-crafted
1042
+ agentic systems of Who&When. The results indicate that
1043
+ stronger reasoning models do not necessarily outperform
1044
+ standard models. Although they provide some improvement,
1045
+ they are still far from practical usability. For instance, DeepSeek
1046
+ R1 underperforms GPT-4o in three out of four cases, and
1047
+ OpenAI o1 fails to consistently surpass GPT-4o across all
1048
+ metrics. These findings highlight the inherent challenges
1049
+ of failure attribution.
1050
+ In contrast, integrating reasoning
1051
+ mechanisms into the prompt yields significant performance
1052
+ improvements across all metrics and cases, as shown in
1053
+ Figure 7. This demonstrates that replacing the base model
1054
+ alone does not guarantee better outcomes.
1055
+
1056
+ 5. Related Works
1057
+
1058
+ LLM Multi-Agent Systems. An emerging research focus
1059
+ examines using LLMs (Achiam et al., 2023; Wang et al.,
1060
+ 2024) as central controllers to develop LLM agents that inter-
1061
+ act with the external world beyond text domains (Deng et al.,
1062
+ 2024; Xie et al., 2024; Zhang et al., 2024b; 2025). While
1063
+ single-agent systems (Yao et al., 2022; Zhang et al., 2023a;
1064
+ 2024a) excel in specific tasks, they struggle with challenges
1065
+ requiring collaboration and collective intelligence. To ad-
1066
+ dress this, studies have explored LLM-powered multi-agent
1067
+ systems, where multiple interactive agents work concur-
1068
+ rently (Hong et al., 2023; Li et al., 2023a). These systems
1069
+ leverage the specialized skills and roles of individual agents,
1070
+ enabling collaborative problem-solving for complex tasks
1071
+ by simulating real-world cooperation patterns.
1072
+
1073
+ LLM for Judging. Numerous studies have explored the
1074
+ use of large language models (LLMs) as evaluators to as-
1075
+ sess various tasks based on pre-defined standards (Fu et al.,
1076
+ 2023; Gu et al., 2024; Hu et al., 2024; Li et al., 2023b; Liu
1077
+ et al., 2023; Thakur et al., 2024). For instance, Chan et al.
1078
+ (2023); Zheng et al. (2023) utilize LLMs to evaluate the
1079
+ performance of LLMs in chat conversation scenarios, which
1080
+ would otherwise incur significant labor costs if performed
1081
+ by humans. Another notable example is Miao et al. (2023);
1082
+
1083
+ van Schaik & Pugh (2024), who employ LLMs as evalua-
1084
+ tors in the context of text summarization which also heavily
1085
+ relies on human efforts. In the field of agentic systems,
1086
+ related research includes Shinn et al. (2024), who adopt
1087
+ the concept of LLMs-as-judges to analyze task feedback
1088
+ signals and guide corrective actions. Similarly, Zhuge et al.
1089
+ (2024) demonstrate the use of LLMs to provide detailed
1090
+ evaluations of agentic systems within their proposed De-
1091
+ vAI dataset. Despite these advancements, failure attribution
1092
+ remains a manual process, with evaluation results serving
1093
+ only as a reference for such attributions.
1094
+
1095
+ Reward Models. Most reward models (RMs) are designed
1096
+ either to predict human preference rankings for outputs gen-
1097
+ erated by large language models (Zhong et al., 2025) or
1098
+ to evaluate the reasoning process step by step, rather than
1099
+ assessing only the final answer (Cui et al., 2025; Lightman
1100
+ et al., 2023; Wang et al., 2023; Zheng et al., 2024). A num-
1101
+ ber of studies have proposed training process-level reward
1102
+ models that evaluate the correctness of intermediate rea-
1103
+ soning steps produced by a single LLM (Cui et al., 2025;
1104
+ Lightman et al., 2023). For instance, Math-Shepherd (Wang
1105
+ et al., 2023) employs automatically generated supervision
1106
+ data to assign reward scores to each step in solving math-
1107
+ ematical problems. Similarly, ProcessBench introduces a
1108
+ benchmark of step-by-step solutions annotated by human
1109
+ experts, identifying the location of errors within mathemati-
1110
+ cal problem-solving processes. In this setting, models are
1111
+ tasked with detecting the earliest erroneous step or confirm-
1112
+ ing that the entire solution is correct. However, these works
1113
+ focus primarily on constructing reward models for evaluat-
1114
+ ing the outputs of individual LLMs, rather than identifying
1115
+ the errors in complex agentic systems.
1116
+
1117
+ 6. Conclusion
1118
+
1119
+ In this study, we propose and formulate a new research area:
1120
+ automated failure attribution in LLM multi-agent systems,
1121
+ an area that has been largely overlooked in current research.
1122
+ To advance this field, we introduce the Who&When dataset,
1123
+ which consists of 127 multi-agent systems with extensive
1124
+ failure logs meticulously annotated with failure details. Fur-
1125
+ thermore, we develop and evaluate three automated failure
1126
+ attribution methods, highlighting the challenges and com-
1127
+ plexities of this task. Our findings underscore the significant
1128
+ difficulty of automated failure attribution and emphasize the
1129
+ urgent need for further research in this emerging area.
1130
+
1131
+ 9
1132
+
1133
+ Title Suppressed Due to Excessive Size
1134
+
1135
+ Impact Statement
1136
+
1137
+ Our approach has societal implications, both positive and
1138
+ negative. On the positive side, our work contributes to the
1139
+ efficient development of multi-agent systems powered by
1140
+ LLMs, enabling their application across a wide range of do-
1141
+ mains. Incorporating mechanisms for failure attribution and
1142
+ conduct corresponding improvement, these advancements
1143
+ have the potential to enhance LLM multi-agent systems
1144
+ significantly. However, the work also introduces potential
1145
+ risks. For instance, granting these systems the ability to
1146
+ modify external environments, such as executing code on
1147
+ computers, could lead to unintended consequences.
1148
+
1149
+ References
1150
+
1151
+ Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I.,
1152
+ Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S.,
1153
+ Anadkat, S., et al. Gpt-4 technical report. arXiv preprint
1154
+ arXiv:2303.08774, 2023.
1155
+
1156
+ Chan, C.-M., Chen, W., Su, Y., Yu, J., Xue, W., Zhang, S.,
1157
+ Fu, J., and Liu, Z. Chateval: Towards better llm-based
1158
+ evaluators through multi-agent debate. International Con-
1159
+ ference on Learning Representations, 2023.
1160
+
1161
+ Clemen, R. T. Combining forecasts: A review and annotated
1162
+ bibliography. International journal of forecasting, 1989.
1163
+
1164
+ Gu, J., Jiang, X., Shi, Z., Tan, H., Zhai, X., Xu, C., Li, W.,
1165
+ Shen, Y., Ma, S., Liu, H., et al. A survey on llm-as-a-
1166
+ judge. arXiv preprint arXiv:2411.15594, 2024.
1167
+
1168
+ Hong, S., Zheng, X., Chen, J., Cheng, Y., Wang, J., Zhang,
1169
+ C., Wang, Z., Yau, S. K. S., Lin, Z., Zhou, L., et al.
1170
+ Metagpt: Meta programming for multi-agent collabora-
1171
+ tive framework. International Conference on Learning
1172
+ Representations, 2023.
1173
+
1174
+ Hu, Z., Zhang, J., Xiong, Z., Ratner, A., Xiong, H.,
1175
+ and Krishna, R. Language model preference evalu-
1176
+ ation with multiple weak evaluators. arXiv preprint
1177
+ arXiv:2410.12869, 2024.
1178
+
1179
+ Huang, J. and Chang, K. C.-C. Towards reasoning in large
1180
+ language models: A survey. In Findings of the Associa-
1181
+ tion for Computational Linguistics: ACL 2023, 2022.
1182
+
1183
+ Huang, J., Chen, X., Mishra, S., Zheng, H. S., Yu,
1184
+ A. W., Song, X., and Zhou, D. Large language mod-
1185
+ els cannot self-correct reasoning yet. arXiv preprint
1186
+ arXiv:2310.01798, 2023.
1187
+
1188
+ Jimenez, C. E., Yang, J., Wettig, A., Yao, S., Pei, K., Press,
1189
+ O., and Narasimhan, K. Swe-bench: Can language mod-
1190
+ els resolve real-world github issues? International Con-
1191
+ ference on Learning Representations, 2023.
1192
+
1193
+ Cui, G., Yuan, L., Wang, Z., Wang, H., Li, W., He, B.,
1194
+ Fan, Y., Yu, T., Xu, Q., Chen, W., et al. Process re-
1195
+ inforcement through implicit rewards. arXiv preprint
1196
+ arXiv:2502.01456, 2025.
1197
+
1198
+ Li, G., Hammoud, H., Itani, H., Khizbullin, D., and Ghanem,
1199
+ B. Camel: Communicative agents for "mind" exploration
1200
+ of large language model society. Advances in Neural In-
1201
+ formation Processing Systems, 36:51991–52008, 2023a.
1202
+
1203
+ DeepSeek-AI. Deepseek-r1: Incentivizing reasoning ca-
1204
+ pability in llms via reinforcement learning deepseek-ai.
1205
+ arXiv preprint arXiv:2501.12948, 2025.
1206
+
1207
+ Deng, X., Gu, Y., Zheng, B., Chen, S., Stevens, S., Wang,
1208
+ B., Sun, H., and Su, Y. Mind2web: Towards a general-
1209
+ ist agent for the web. Advances in Neural Information
1210
+ Processing Systems, 2024.
1211
+
1212
+ Fourney, A., Bansal, G., Mozannar, H., Tan, C., Salinas,
1213
+ E., Niedtner, F., Proebsting, G., Bassman, G., Gerrits,
1214
+ J., Alber, J., et al. Magentic-one: A generalist multi-
1215
+ agent system for solving complex tasks. arXiv preprint
1216
+ arXiv:2411.04468, 2024.
1217
+
1218
+ Fu, J., Ng, S.-K., Jiang, Z., and Liu, P. Gptscore: Evaluate
1219
+ as you desire. Proceedings of the 2024 Conference of
1220
+ the North American Chapter of the Association for Com-
1221
+ putational Linguistics: Human Language Technologies,
1222
+ 2023.
1223
+
1224
+ Ghafarollahi, A. and Buehler, M. J. Sciagents: Automating
1225
+ scientific discovery through multi-agent intelligent graph
1226
+ reasoning. arXiv preprint arXiv:2409.05556, 2024.
1227
+
1228
+ Li, X., Zhang, T., Dubois, Y., Taori, R., Gulrajani, I.,
1229
+ Guestrin, C., Liang, P., and Hashimoto, T. B. Alpacaeval:
1230
+ An automatic evaluator of instruction-following models,
1231
+ 2023b.
1232
+
1233
+ Lightman, H., Kosaraju, V., Burda, Y., Edwards, H., Baker,
1234
+ B., Lee, T., Leike, J., Schulman, J., Sutskever, I., and
1235
+ Cobbe, K. Let’s verify step by step. In International
1236
+ Conference on Learning Representations, 2023.
1237
+
1238
+ Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., and Zhu, C.
1239
+ G-eval: Nlg evaluation using gpt-4 with better human
1240
+ alignment. arXiv preprint arXiv:2303.16634, 2023.
1241
+
1242
+ Mialon, G., Fourrier, C., Swift, C., Wolf, T., LeCun, Y., and
1243
+ Scialom, T. Gaia: a benchmark for general ai assistants.
1244
+ International Conference on Learning Representations,
1245
+ 2023.
1246
+
1247
+ Miao, N., Teh, Y. W., and Rainforth, T. Selfcheck: Using
1248
+ llms to zero-shot check their own step-by-step reasoning.
1249
+ International Conference on Learning Representations,
1250
+ 2023.
1251
+
1252
+ 10
1253
+
1254
+ Title Suppressed Due to Excessive Size
1255
+
1256
+ Nelson, E., Kollias, G., Das, P., Chaudhury, S., and Dan, S.
1257
+ Needle in the haystack for memory based large language
1258
+ models. arXiv preprint arXiv:2407.01437, 2024.
1259
+
1260
+ Scriven, M. Evaluation thesaurus. Sage Publications, 1991.
1261
+
1262
+ Shinn, N., Cassano, F., Gopinath, A., Narasimhan, K., and
1263
+ Yao, S. Reflexion: Language agents with verbal rein-
1264
+ forcement learning. Advances in Neural Information
1265
+ Processing Systems, 2024.
1266
+
1267
+ Song, L., Liu, J., Zhang, J., Zhang, S., Luo, A., Wang,
1268
+ S., Wu, Q., and Wang, C. Adaptive in-conversation
1269
+ team building for language model agents. arXiv preprint
1270
+ arXiv:2405.19425, 2024.
1271
+
1272
+ Thakur, A. S., Choudhary, K., Ramayapally, V. S.,
1273
+ Vaidyanathan, S., and Hupkes, D. Judging the judges:
1274
+ Evaluating alignment and vulnerabilities in llms-as-
1275
+ judges. arXiv preprint arXiv:2406.12624, 2024.
1276
+
1277
+ van Schaik, T. A. and Pugh, B. A field guide to automatic
1278
+ evaluation of llm-generated summaries. In Proceedings
1279
+ of the 47th International ACM SIGIR Conference on Re-
1280
+ search and Development in Information Retrieval, pp.
1281
+ 2832–2836, 2024.
1282
+
1283
+ Wang, F., Zhang, Z., Zhang, X., Wu, Z., Mo, T., Lu, Q.,
1284
+ Wang, W., Li, R., Xu, J., Tang, X., et al. A comprehen-
1285
+ sive survey of small language models in the era of large
1286
+ language models: Techniques, enhancements, applica-
1287
+ tions, collaboration with llms, and trustworthiness. arXiv
1288
+ preprint arXiv:2411.03350, 2024.
1289
+
1290
+ Wang, P., Li, L., Shao, Z., Xu, R., Dai, D., Li, Y., Chen,
1291
+ D., Wu, Y., and Sui, Z. Math-shepherd: Verify and
1292
+ reinforce llms step-by-step without human annotations.
1293
+ arXiv preprint arXiv:2312.08935, 2023.
1294
+
1295
+ Wei, J., Wang, X., Schuurmans, D., Bosma, M., Xia, F.,
1296
+ Chi, E., Le, Q. V., Zhou, D., et al. Chain-of-thought
1297
+ prompting elicits reasoning in large language models.
1298
+ Advances in Neural Information Processing Systems, pp.
1299
+ 24824–24837, 2022.
1300
+
1301
+ Yao, S., Yu, D., Zhao, J., Shafran, I., Griffiths, T., Cao, Y.,
1302
+ and Narasimhan, K. Tree of thoughts: Deliberate problem
1303
+ solving with large language models. Advances in Neural
1304
+ Information Processing Systems, 2024.
1305
+
1306
+ Yoran, O., Amouyal, S. J., Malaviya, C., Bogin, B., Press,
1307
+ O., and Berant, J. Assistantbench: Can web agents solve
1308
+ realistic and time-consuming tasks? In Proceedings of
1309
+ the 2024 Conference on Empirical Methods in Natural
1310
+ Language Processing, 2024.
1311
+
1312
+ Yu, T. and Zhu, H. Hyper-parameter optimization: A
1313
+ review of algorithms and applications. arXiv preprint
1314
+ arXiv:2003.05689, 2020.
1315
+
1316
+ Zhang, J., Krishna, R., Awadallah, A. H., and Wang, C.
1317
+ Ecoassistant: Using llm assistant more affordably and
1318
+ accurately. arXiv preprint arXiv:2310.03046, 2023a.
1319
+
1320
+ Zhang, S., Jia, F., Wang, C., and Wu, Q. Targeted hyper-
1321
+ parameter optimization with lexicographic preferences
1322
+ over multiple objectives. In International Conference on
1323
+ Learning Representations, 2023b.
1324
+
1325
+ Zhang, S., Zhang, J., Ding, D., Garcia, M. H., Mallick, A.,
1326
+ Madrigal, D., Xia, M., Rühle, V., Wu, Q., and Wang,
1327
+ C. Ecoact: Economic agent determines when to register
1328
+ what action. arXiv preprint arXiv:2411.01643, 2024a.
1329
+
1330
+ Zhang, S., Zhang, J., Liu, J., Song, L., Wang, C., Krishna,
1331
+ R., and Wu, Q. Offline training of language model agents
1332
+ In International
1333
+ with functions as learnable weights.
1334
+ Conference on Machine Learning, 2024b.
1335
+
1336
+ Zhang, S., Dong, Y., Zhang, J., Kautz, J., Catanzaro, B.,
1337
+ Tao, A., Wu, Q., Yu, Z., and Liu, G. Nemotron-research-
1338
+ tool-n1: Tool-using language models with reinforced rea-
1339
+ soning. arXiv preprint arXiv:2505.00024, 2025.
1340
+
1341
+ Zheng, C., Zhang, Z., Zhang, B., Lin, R., Lu, K., Yu, B.,
1342
+ Liu, D., Zhou, J., and Lin, J. Processbench: Identifying
1343
+ process errors in mathematical reasoning. arXiv preprint
1344
+ arXiv:2412.06559, 2024.
1345
+
1346
+ Wu, Q., Bansal, G., Zhang, J., Wu, Y., Zhang, S., Zhu, E., Li,
1347
+ B., Jiang, L., Zhang, X., and Wang, C. Autogen: Enabling
1348
+ next-gen llm applications via multi-agent conversation
1349
+ framework. Conference on Language Modeling, 2023.
1350
+
1351
+ Zheng, L., Chiang, W.-L., Sheng, Y., Zhuang, S., Wu, Z.,
1352
+ Zhuang, Y., Lin, Z., Li, Z., Li, D., Xing, E., et al. Judging
1353
+ llm-as-a-judge with mt-bench and chatbot arena. Ad-
1354
+ vances in Neural Information Processing Systems, 2023.
1355
+
1356
+ Xie, T., Zhang, D., Chen, J., Li, X., Zhao, S., Cao, R.,
1357
+ Hua, T. J., Cheng, Z., Shin, D., Lei, F., et al. Os-
1358
+ world: Benchmarking multimodal agents for open-ended
1359
+ tasks in real computer environments. arXiv preprint
1360
+ arXiv:2404.07972, 2024.
1361
+
1362
+ Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan,
1363
+ K., and Cao, Y. React: Synergizing reasoning and acting
1364
+ in language models. ICLR, 2022.
1365
+
1366
+ Zhong, J., Shen, W., Li, Y., Gao, S., Lu, H., Chen, Y., Zhang,
1367
+ Y., Zhou, W., Gu, J., and Zou, L. A comprehensive survey
1368
+ of reward models: Taxonomy, applications, challenges,
1369
+ and future. arXiv preprint arXiv:2504.12328, 2025.
1370
+
1371
+ Zhuge, M., Zhao, C., Ashley, D., Wang, W., Khizbullin, D.,
1372
+ Xiong, Y., Liu, Z., Chang, E., Krishnamoorthi, R., Tian,
1373
+ Y., et al. Agent-as-a-judge: Evaluate agents with agents.
1374
+ arXiv preprint arXiv:2410.10934, 2024.
1375
+
1376
+ 11
1377
+
1378
+ Title Suppressed Due to Excessive Size
1379
+
1380
+ Appendix
1381
+
1382
+ A. Algorithm Details
1383
+
1384
+ A.1. Notations
1385
+
1386
+ We then provide more details on the Step-by-Step and Binary Search failure attribution methods. To begin, we define some
1387
+ notations used in the algorithms. We employ Q to denote the query provided to the system. L = {l1, l2, . . . , ln} denotes the
1388
+ failure log consisting of n entries where each entry li specifies the action taken at time step i by one agent. A∗, s∗ denotes
1389
+ the agent responsible for the task failure and the decisive error step respectively.
1390
+
1391
+ A.2. Details of Step-by-Step
1392
+
1393
+ Provide Q and {l1, ..., li} to LLM
1394
+ if LLM indicates error at step i then
1395
+
1396
+ Algorithm 1 Step-by-Step
1397
+ Require: Query Q, failure log L = {l1, l2, . . . , ln}
1398
+ Ensure: Responsible agent A∗, error step s∗
1399
+ 1: for i ∈ {1, 2, . . . , n} do
1400
+ 2:
1401
+ 3:
1402
+ 4:
1403
+ 5:
1404
+ 6:
1405
+ end if
1406
+ 7:
1407
+ 8: end for
1408
+ 9: No error found
1409
+
1410
+ s∗ ← i
1411
+ Identify responsible agent A∗ in li
1412
+ Return A∗, s∗
1413
+
1414
+ A.3. Details of Binary Search
1415
+
1416
+ Algorithm 2 Binary Search
1417
+ Require: Query Q, failure log L = {l1, l2, . . . , ln}
1418
+ Ensure: Responsible agent A∗, error step s∗
1419
+
1420
+ Initialize low ← 1, high ← n
1421
+ while low < high do
1422
+
1423
+ mid ←
1424
+
1425
+ (cid:22) low + high
1426
+ 2
1427
+
1428
+ (cid:23)
1429
+
1430
+ Extract log segment L′ ← {llow, llow+1, . . . , lmid}
1431
+ Provide Q and L′ to LLM
1432
+ if LLM indicates error in L′ then
1433
+
1434
+ high ← mid
1435
+
1436
+ else
1437
+
1438
+ low ← mid + 1
1439
+
1440
+ end if
1441
+ end while
1442
+ s∗ ← low, identify responsible agent A∗ in ls∗
1443
+ Return A∗, s∗
1444
+
1445
+ B. Additional Experiments
1446
+
1447
+ B.1. Ablation of Reasoning Prompts
1448
+
1449
+ LLMs have shown incredible reasoning ability (Huang & Chang, 2022; Wei et al., 2022; Yao et al., 2024), considering these,
1450
+ in both the all-at-once and step-by-step approaches, we explicitly require the LLMs to not only conduct failure attributions
1451
+ but also specify the reasons for these attributions within the prompt. We don’t include binary search here because it doesn’t
1452
+
1453
+ 12
1454
+
1455
+ Title Suppressed Due to Excessive Size
1456
+
1457
+ (a) Alg.-Generated Agent-Level (b) Alg.-Generated Step-Level
1458
+
1459
+ (c) Hand-Crafted Agent-Level
1460
+
1461
+ (d) Hand-Crafted Step-Level
1462
+
1463
+ Figure 7. Ablation of the explicit reasoning prompts in all-at-once and step-by-step. From the results we can observe that explicitly
+ specifying reasoning in failure attribution methods can greatly boost their performance.
1465
+
1466
+ include reasoning mechanisms in its prompt. We only want binary search to perform a simple classification task. To investigate
1467
+ the impact of these reasoning prompts on the failure attributions, we conduct additional experiments where the reasoning
1468
+ prompt is removed, allowing the LLMs to directly provide the judgment results. We make comparisons and the results are
1469
+ shown in Figure 7. We observed a significant drop in performance after removing the explicit reasoning prompts for failure
1470
+ attribution in both metrics. For example, in algorithm-generated multi-agent systems, the agent-level accuracy decreased
1471
+ by 7.4% for the all-at-once method. For the step-by-step method, the step-level performance drops 4.4%. These results
1472
+ highlight the necessity of incorporating additional reasoning mechanisms in failure attributions.
1473
+
1474
+ C. More Details of Who&When
1475
+
1476
+ C.1. Overview
1477
+
1478
+ Algorithm-Generated
1479
+ GAIA AssistantBench GAIA AssistantBench
1480
+
1481
+ Hand-Crafted
1482
+
1483
+ Total Number
1484
+ Maximum Agent Number
1485
+ Minimum Agent Number
1486
+ Maximum Log Length
1487
+ Minimum Log Length
1488
+
1489
+ 98
1490
+ 4
1491
+ 1
1492
+ 10
1493
+ 5
1494
+
1495
+ 28
1496
+ 4
1497
+ 3
1498
+ 10
1499
+ 6
1500
+
1501
+ 30
1502
+ 5
1503
+ 1
1504
+ 130
1505
+ 5
1506
+
1507
+ 28
1508
+ 4
1509
+ 2
1510
+ 129
1511
+ 8
1512
+
1513
+ Table 5. Additional details about the Who&When benchmark: We present the total number of tasks for each category, along with the
1514
+ maximum and minimum number of agents and log lengths.
1515
+
1516
+ We then provide more details about the Who&When dataset, which comprises 184 failure annotation tasks from both
1517
+ hand-crafted and algorithm-generated agentic systems. These failure logs encompass diverse scenarios with varying numbers
1518
+ of agents and interaction lengths. In Table 5, we show the total number of data instances for each category, along with
1519
+ the maximum and minimum number of agents and log lengths. We also visualize the information of each data instance in
1520
+ Figure 8. Note that due to task overlap, some data points may appear sparse in the visualization. We also show a failure
1521
+ task example in Figure 9.
1522
+
1523
+ C.2. Data Distribution
1524
+
1525
+ (a) Algorithm-Generated
1526
+
1527
+ (b) Hand-Crafted
1528
+
1529
+ Figure 8. The number of agents involved and the total length of each failure log instance in the Who&When dataset. Note that due to task
1530
+ overlap, some data points may appear sparse in the visualization
1531
+
1532
+ 13
1533
+
1534
+ Once-for-AllStep-by-Step0102030405060Accuracy (%)52.730.645.325.2With ReasoningWithout ReasoningOnce-for-AllStep-by-Step0510152025Accuracy (%)15.420.411.516.0With ReasoningWithout ReasoningOnce-for-AllStep-by-Step0102030405060Accuracy (%)54.333.647.329.3With ReasoningWithout ReasoningOnce-for-AllStep-by-Step0246810Accuracy (%)4.47.93.56.9With ReasoningWithout Reasoning1234Number of Agents5678910Total Log Length12345Number of Agents020406080100120Total Log Length C.3. Data Example
1535
+
1536
+ Title Suppressed Due to Excessive Size
1537
+
1538
+ Figure 9. A task example from Who&When, where we annotate failure-responsible agents and their corresponding error steps within the
1539
+ failure logs. Each annotation includes a natural language explanation of the failure reason for reference.
1540
+
1541
+ D. Brief Cost Analysis
1542
+
1543
+ We then present a brief analysis of the computational costs associated with three failure attribution methods. We focus solely
1544
+ on input tokens, as the contribution of output tokens such as the agent name and error step number is small. We also ignore
1545
+ the minor token difference between the one-time instructions of the different methods. We let C denote the cost of query Q
+ and the corresponding instructions of each method. We employ L = {l1, l2, ..., ln}, where each entry li has an average token count of Tl.
1547
+
1548
+ D.1. All-at-Once
1549
+
1550
+ In the all-at-once method, the LLM receives the full context in a single input. The total input token cost is:
1551
+
1552
+ Costall−at−once = C + n · Tl
1553
+
1554
+ (6)
1555
+
1556
+ This method incurs the lowest cost as it requires only a single inference step.
1557
+
1558
+ D.2. Step-by-Step
1559
+
1560
+ In the Step-by-Step method, the LLM processes the failure log incrementally. At each step i, it receives query, instructions
1561
+ and the log segment {l1, ..., li}. The process terminates when the decisive error step i∗ is found.
1562
+
1563
+ $\mathrm{Cost}_{\text{step-by-step}} = \sum_{i=1}^{i^*} \left( C + i \cdot T_l \right) = i^* \cdot C + T_l \cdot \frac{i^* (i^* + 1)}{2}$    (7)
1576
+
1577
+ In the worst case, i∗ = n, either when no error is detected or the decisive error occurs in the final step.
1578
+
1579
+ D.3. Binary Search
1580
+
1581
+ In the Binary Search method, the LLM operates in a logarithmic fashion by iteratively splitting the failure log into halves.
1582
+ At each step i, the segment of the failure log processed by the LLM has a size of approximately $n / 2^{i-1}$, where n is the total
+ number of log entries. Therefore the total cost at iteration i is $C + \frac{n \cdot T_l}{2^{i-1}}$. The Binary Search continues until the search
+ space is narrowed down to a single step, requiring ⌈log2(n)⌉ iterations. Therefore the cost of binary search is:
1587
+
1588
+ 14
1589
+
1590
+ Title Suppressed Due to Excessive Size
1591
+
1592
+ $\mathrm{Cost}_{\text{BinarySearch}} = \sum_{i=1}^{\lceil \log_2(n) \rceil} \left( C + \frac{n \cdot T_l}{2^{i-1}} \right) = \lceil \log_2(n) \rceil \cdot C + \sum_{i=1}^{\lceil \log_2(n) \rceil} \frac{n \cdot T_l}{2^{i-1}}$    (8)
1615
+
1616
+ D.4. Cost Summary
1617
+
1618
+ In summary, the costs associated with the three methods are influenced by three key factors: the size of the failure log (n),
1619
+ the average token count per log entry (Tl), and the decisive error step (i∗). The choice of method should align with the user’s
1620
+ budget and specific use case requirements. Among the methods, the all-at-once approach incurs the lowest cost as it requires
1621
+ only a single inference step. In contrast, the costs of the binary search and step-by-step methods are highly dependent on the
1622
+ specific scenario, particularly the distribution of decisive error locations and the total length of the failure log.
1623
+
1624
+ E. Hyperparameters
1625
+
1626
+ Hyperparameters play a critical role in determining the performance of machine learning algorithms (Yu & Zhu, 2020; Zhang
1627
+ et al., 2023b). In this paper, the hyperparameters we utilize are divided into two categories: those used for Who&When data
1628
+ construction and those employed for automated failure attribution algorithms. For data construction, we adopt the default
1629
+ settings of CaptainAgent and Magentic-One from their official libraries (AG2 and Autogen). One notable setting is that the
1630
+ maximum iteration count for CaptainAgent is limited to 10, whereas Magentic-One allows up to 30 rounds. It is important
1631
+ to highlight that the agent’s thought processes are excluded from the round count, which contributes to longer failure log
1632
+ lengths, as discussed in Appendix C. For the inference hyperparameters of other large language models (LLMs), we adhere
1633
+ to the default configurations specified in their official documentation.
1634
+
1635
+ F. Annotation Details
1636
+
1637
+ In Figure 10, we present our standardized annotation guidelines used by all annotators. The guidelines clearly define criteria
1638
+ for identifying failure-responsible agents and decisive error steps. Annotators are instructed to document any uncertainties
1639
+ in their annotations for subsequent group discussion and voting.
1640
+
1641
+ Figure 10. The guidelines for making annotations. We maintain consistent annotation guidelines across all annotators.
1642
+
1643
+ 15
1644
+
1645
+ G. Prompts
1646
+
1647
+ Title Suppressed Due to Excessive Size
1648
+
1649
+ We list the prompt templates for all three attribution methods in this section. Please refer to our code base for more details.
1650
+
1651
+ G.1. Prompts of All-at-Once
1652
+
1653
+ You are an AI assistant tasked with analyzing a multi-agent conversation history when solving a real world problem.
1654
+ The problem is: {problem}.
1655
+ Identify which agent made an error, at which step, and explain the reason for the error.
1656
+ Here’s the conversation: {failure log}
1657
+ Based on this conversation, please predict the following:
1658
+ 1. The name of the agent who made a mistake that should be directly responsible for the wrong solution to the real
1659
+ world problem. If there are no agents that make obvious mistakes, decide one single agent in your mind. Directly
1660
+ output the name of the Expert.
1661
+ 2. In which step the mistake agent first made mistake. For example, in a conversation structured as follows:
1662
+ {
1663
+ ”agent a”: ”xx”,
1664
+ ”agent b”: ”xxxx”,
1665
+ ”agent c”: ”xxxxx”,
1666
+ ”agent a”: ”xxxxxxx”
1667
+ },
1668
+ each entry represents a ’step’ where an agent provides input. The ’x’ symbolizes the speech of each agent. If the
1669
+ mistake is in agent c’s speech, the step number is 2. If the second speech by ’agent a’ contains the mistake, the step
1670
+ number is 3, and so on. Please determine the step number where the first mistake occurred.
1671
+ 3. The reason for your prediction. Please answer in the format:
1672
+ Agent Name: (Your prediction)
1673
+ Step Number: (Your prediction)
1674
+ Reason for Mistake: (Your reason)
1675
+
1676
+ You are an AI assistant tasked with analyzing a multi-agent conversation history when solving a real world problem.
1677
+ The problem is: {problem}.
1678
+ The Answer for the problem is: {ground truth}.
1679
+ Identify which agent made an error, at which step, and explain the reason for the error.
1680
+ Here’s the conversation: {failure log}
1681
+ Based on this conversation, please predict the following:
1682
+ 1. The name of the agent who made a mistake that should be directly responsible for the wrong solution to the real
1683
+ world problem. If there are no agents that make obvious mistakes, decide one single agent in your mind. Directly
1684
+ output the name of the Expert.
1685
+ 2. In which step the mistake agent first made mistake. For example, in a conversation structured as follows:
1686
+ {
1687
+ ”agent a”: ”xx”,
1688
+ ”agent b”: ”xxxx”,
1689
+ ”agent c”: ”xxxxx”,
1690
+ ”agent a”: ”xxxxxxx”
1691
+ },
1692
+ each entry represents a ’step’ where an agent provides input. The ’x’ symbolizes the speech of each agent. If the
1693
+ mistake is in agent c’s speech, the step number is 2. If the second speech by ’agent a’ contains the mistake, the step
1694
+ number is 3, and so on. Please determine the step number where the first mistake occurred.
1695
+ 3. The reason for your prediction. Please answer in the format:
1696
+ Agent Name: (Your prediction)
1697
+ Step Number: (Your prediction)
1698
+ Reason for Mistake: (Your reason)
1699
+
1700
+ 16
1701
+
1702
+ G.2. Prompts of Binary Search
1703
+
1704
+ Title Suppressed Due to Excessive Size
1705
+
1706
+ You are an AI assistant tasked with analyzing a segment of a multi-agent conversation. Multiple agents are
1707
+ collaborating to address a user query, with the goal of resolving the query through their collective dialogue.
1708
+ Your primary task is to identify location of the most critical mistake, and determine the single step in the conversation
1709
+ where this error occurs, ultimately leading to the failure in resolving the user’s query.
1710
+ The problem to address is as follows: {problem}.
1711
+ Review the following conversation range
1712
+ {range description}: {sliced log}.
1713
+ Based on your analysis, predict whether the error is more likely to be located in the upper or lower half of the segment.
1714
+ lower half is defined as the range {lower half range} and upper half is defined as the range {upper half range}.
1715
+ Please simply output either ’upper half’ or ’lower half’.
1716
+ You should not output anything else.
1717
+
1718
+ You are an AI assistant tasked with analyzing a segment of a multi-agent conversation. Multiple agents are
1719
+ collaborating to address a user query, with the goal of resolving the query through their collective dialogue.
1720
+ Your primary task is to identify location of the most critical mistake, and determine the single step in the conversation
1721
+ where this error occurs, ultimately leading to the failure in resolving the user’s query.
1722
+ The problem to address is as follows: {problem}.
1723
+ The Answer for the problem is: {ground truth}.
1724
+ Review the following conversation range
1725
+ {range description}: {sliced log}.
1726
+ Based on your analysis, predict whether the error is more likely to be located in the upper or lower half of the segment.
1727
+ lower half is defined as the range {lower half range} and upper half is defined as the range {upper half range}.
1728
+ Please simply output either ’upper half’ or ’lower half’.
1729
+ You should not output anything else.
1730
+
1731
+ G.3. Prompts of Step-by-Step
1732
+
1733
+ You are an AI assistant tasked with evaluating the correctness of each step in an ongoing multi-agent conversation
1734
+ aimed at solving a real-world problem.
1735
+ The problem being addressed is: {problem}.
1736
+ Here is the conversation history up to the current step: {failure log}.
1737
+ Your task is to determine whether the most recent agent’s action contains an error that could hinder the problem-
1738
+ solving process. Please respond with ’Yes’ or ’No’ and provide a clear explanation for your judgment.
1739
+ Note: Please avoid being overly critical in your evaluation.
1740
+ Attention: Respond in the format:
1741
+ 1. Yes/No. 2. Reason for the judgment.
1742
+
1743
+ You are an AI assistant tasked with evaluating the correctness of each step in an ongoing multi-agent conversation
1744
+ aimed at solving a real-world problem.
1745
+ The problem being addressed is: {problem}.
1746
+ Here is the conversation history up to the current step: {failure log}.
1747
+ The Answer for the problem is: {ground truth}.
1748
+ Your task is to determine whether the most recent agent’s action contains an error that could hinder the problem-
1749
+ solving process. Please respond with ’Yes’ or ’No’ and provide a clear explanation for your judgment.
1750
+ Note: Please avoid being overly critical in your evaluation.
1751
+ Attention: Respond in the format:
1752
+ 1. Yes/No. 2. Reason for the judgment.
1753
+
1754
+ 17
1755
+