mcpbr 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,558 @@
1
+ """Failure analysis module for evaluation results.
2
+
3
+ Provides detailed analysis of evaluation failures to understand common failure
4
+ modes, categorize errors, extract patterns, and generate actionable reports.
5
+ """
6
+
7
+ from collections import Counter
8
+ from enum import StrEnum
9
+
10
+
11
+ class FailureCategory(StrEnum):
12
+ """Categories for classifying evaluation failures."""
13
+
14
+ TIMEOUT = "timeout"
15
+ NO_PATCH = "no_patch"
16
+ WRONG_ANSWER = "wrong_answer"
17
+ RUNTIME_ERROR = "runtime_error"
18
+ COMPILATION_ERROR = "compilation_error"
19
+ TEST_FAILURE = "test_failure"
20
+ TOOL_ERROR = "tool_error"
21
+ ENVIRONMENT_ERROR = "environment_error"
22
+ UNKNOWN = "unknown"
23
+
24
+
25
def categorize_failure(result: dict) -> str:
    """Classify a single failed result into a FailureCategory value.

    Inspects the status, error text, patch contents, and tool statistics of
    *result* and returns the first matching category, checked in order of
    specificity: timeout, environment, tool, compilation, runtime, missing
    patch, test failure, then wrong answer.

    Args:
        result: A result dictionary with keys like ``resolved``, ``error``,
            ``patch``, ``status``, ``tool_calls``, ``tool_usage``,
            ``tool_failures``.

    Returns:
        The failure category string from FailureCategory.
    """
    # A resolved result is not a failure at all.
    if result.get("resolved"):
        return FailureCategory.UNKNOWN

    status = str(result.get("status", "")).lower()
    error = str(result.get("error", "")).lower()

    # Timeouts are signalled by the status field or the error text.
    if status == "timeout" or "timeout" in error or "timed out" in error:
        return FailureCategory.TIMEOUT

    # Keyword tables matched against the lowercased error text, in priority order.
    env_keywords = (
        "docker",
        "container",
        "permission denied",
        "access denied",
        "network",
        "connection refused",
        "connection reset",
        "environment",
        "no space left",
        "disk quota",
    )
    tool_keywords = ("mcp", "tool call", "tool error", "tool failed", "server error")
    compile_keywords = (
        "compilation",
        "compile error",
        "syntax error",
        "syntaxerror",
        "indentation",
        "importerror",
        "modulenotfounderror",
        "nameerror",
    )
    runtime_keywords = (
        "runtime",
        "traceback",
        "exception",
        "attributeerror",
        "typeerror",
        "valueerror",
        "keyerror",
        "indexerror",
        "zerodivisionerror",
        "recursionerror",
        "memoryerror",
        "overflowerror",
    )

    if any(kw in error for kw in env_keywords):
        return FailureCategory.ENVIRONMENT_ERROR

    if any(kw in error for kw in tool_keywords):
        return FailureCategory.TOOL_ERROR

    # A majority of tool calls failing is itself a strong tool-error signal,
    # even when the error text does not mention tools.
    tool_failures = result.get("tool_failures", {})
    tool_usage = result.get("tool_usage", {})
    if tool_failures:
        failed_calls = sum(tool_failures.values())
        made_calls = sum(tool_usage.values()) if tool_usage else 0
        if made_calls > 0 and failed_calls / made_calls > 0.5:
            return FailureCategory.TOOL_ERROR

    if any(kw in error for kw in compile_keywords):
        return FailureCategory.COMPILATION_ERROR

    if any(kw in error for kw in runtime_keywords):
        return FailureCategory.RUNTIME_ERROR

    # Missing or blank patch: the agent produced nothing to apply.
    patch = result.get("patch")
    if not patch or (isinstance(patch, str) and not patch.strip()):
        return FailureCategory.NO_PATCH

    test_keywords = (
        "test failed",
        "test failure",
        "assertion",
        "assertionerror",
        "fail_to_pass",
        "pass_to_pass",
    )
    if any(kw in error for kw in test_keywords):
        return FailureCategory.TEST_FAILURE

    # A real, non-blank patch that matched none of the checks above: the
    # agent answered, just incorrectly.
    if isinstance(patch, str) and patch.strip():
        return FailureCategory.WRONG_ANSWER

    return FailureCategory.UNKNOWN
151
+
152
+
153
def extract_failure_patterns(results: list[dict]) -> dict:
    """Identify recurring patterns among failed results.

    Scans every unresolved result for repeated error-message prefixes,
    frequently failing tools, the distribution of tool-call counts, and
    token-usage outliers.

    Args:
        results: List of result dictionaries.

    Returns:
        Dictionary with pattern analysis including:
            - common_errors: Most frequent error message prefixes
            - tool_failure_patterns: Tools that fail most frequently
            - failure_by_tool_count: Distribution of failures by number of tool calls
            - high_token_failures: Failures with above-average token usage
            - zero_tool_failures: Failures where no tools were called
    """

    def _total_tokens(record: dict) -> int:
        # Flat keys win; each missing field falls back to the nested
        # ``tokens`` dict per field.
        t_in = record.get("tokens_input", 0)
        t_out = record.get("tokens_output", 0)
        nested = record.get("tokens", {})
        if isinstance(nested, dict):
            t_in = t_in or nested.get("input", 0)
            t_out = t_out or nested.get("output", 0)
        return t_in + t_out

    failures = [r for r in results if not r.get("resolved")]

    if not failures:
        return {
            "common_errors": [],
            "tool_failure_patterns": {},
            "failure_by_tool_count": {},
            "high_token_failures": 0,
            "zero_tool_failures": 0,
        }

    # Fingerprint error messages by their first 80 characters.
    error_prefixes: Counter[str] = Counter(
        rec.get("error", "")[:80].strip() for rec in failures if rec.get("error", "")
    )

    # Aggregate per-tool failure counts across all failed results.
    tool_fail_counts: Counter[str] = Counter()
    for rec in failures:
        tool_fail_counts.update(rec.get("tool_failures", {}))

    # Distribution of failures by how many tool calls each task made.
    call_distribution: Counter[int] = Counter()
    no_tool_count = 0
    for rec in failures:
        calls = rec.get("tool_calls", 0)
        # Derive the call count from tool_usage when tool_calls is unset.
        if not calls and rec.get("tool_usage"):
            calls = sum(rec["tool_usage"].values())
        call_distribution[calls] += 1
        if calls == 0:
            no_tool_count += 1

    # Average token spend across ALL results (successes included) sets the bar
    # for "high" usage.
    totals = [_total_tokens(rec) for rec in results]
    mean_tokens = sum(totals) / len(totals) if totals else 0

    heavy_failures = sum(
        1 for rec in failures if mean_tokens > 0 and _total_tokens(rec) > mean_tokens
    )

    return {
        "common_errors": error_prefixes.most_common(10),
        "tool_failure_patterns": dict(tool_fail_counts.most_common(10)),
        "failure_by_tool_count": dict(sorted(call_distribution.items())),
        "high_token_failures": heavy_failures,
        "zero_tool_failures": no_tool_count,
    }
242
+
243
+
244
def generate_failure_report(results: list[dict]) -> dict:
    """Build the full failure-analysis report for a set of results.

    Combines per-result categorization (categorize_failure), cross-result
    pattern mining (extract_failure_patterns), per-benchmark failure rates,
    tool reliability statistics, and generated recommendations.

    Args:
        results: List of result dictionaries.

    Returns:
        Dictionary with:
            - total_results: Total number of results analyzed
            - total_failures: Number of failed results
            - failure_rate: Overall failure rate (0.0 to 1.0)
            - category_distribution: Count of failures per category
            - category_percentages: Percentage of failures per category
            - common_error_messages: Most frequent error messages
            - failure_by_benchmark: Failure rates grouped by benchmark/instance prefix
            - tool_failure_breakdown: Tool-related failure details
            - patterns: Output from extract_failure_patterns
            - recommendations: List of actionable recommendations
    """
    total = len(results)
    failed = [r for r in results if not r.get("resolved")]
    n_failed = len(failed)
    failure_rate = n_failed / total if total > 0 else 0.0

    # How failures split across categories, as counts and as fractions.
    categories: Counter[str] = Counter(categorize_failure(r) for r in failed)
    category_percentages = {
        cat: (count / n_failed if n_failed > 0 else 0.0)
        for cat, count in categories.items()
    }

    # Full-text error messages, deduplicated by exact match.
    errors: Counter[str] = Counter()
    for rec in failed:
        msg = rec.get("error", "")
        if msg:
            errors[msg] += 1

    # Group totals/failures by the benchmark prefix of each instance id
    # (e.g. "django/django" out of "django/django-12345").
    per_benchmark: dict[str, dict[str, int]] = {}
    for rec in results:
        instance_id = rec.get("instance_id", "")
        pieces = instance_id.rsplit("-", 1)
        key = pieces[0] if len(pieces) > 1 else instance_id
        bucket = per_benchmark.setdefault(key, {"total": 0, "failures": 0})
        bucket["total"] += 1
        if not rec.get("resolved"):
            bucket["failures"] += 1

    # Attach a failure rate per benchmark and sort by it, worst first.
    failure_by_benchmark = {
        key: {
            "total": c["total"],
            "failures": c["failures"],
            "failure_rate": c["failures"] / c["total"] if c["total"] > 0 else 0.0,
        }
        for key, c in per_benchmark.items()
    }
    failure_by_benchmark = dict(
        sorted(failure_by_benchmark.items(), key=lambda kv: kv[1]["failure_rate"], reverse=True)
    )

    # Aggregate tool calls and tool failures observed in failed tasks only.
    calls_by_tool: Counter[str] = Counter()
    fails_by_tool: Counter[str] = Counter()
    for rec in failed:
        calls_by_tool.update(rec.get("tool_usage", {}))
        fails_by_tool.update(rec.get("tool_failures", {}))

    tool_failure_breakdown = {}
    for name in set(calls_by_tool) | set(fails_by_tool):
        n_calls = calls_by_tool.get(name, 0)
        n_fails = fails_by_tool.get(name, 0)
        tool_failure_breakdown[name] = {
            "calls": n_calls,
            "failures": n_fails,
            "failure_rate": n_fails / n_calls if n_calls > 0 else 0.0,
        }
    # Most-failing tools first.
    tool_failure_breakdown = dict(
        sorted(tool_failure_breakdown.items(), key=lambda kv: kv[1]["failures"], reverse=True)
    )

    patterns = extract_failure_patterns(results)

    recommendations = _generate_recommendations(
        category_counter=categories,
        total_failures=n_failed,
        patterns=patterns,
        tool_failure_breakdown=tool_failure_breakdown,
    )

    return {
        "total_results": total,
        "total_failures": n_failed,
        "failure_rate": failure_rate,
        "category_distribution": dict(categories.most_common()),
        "category_percentages": category_percentages,
        "common_error_messages": errors.most_common(10),
        "failure_by_benchmark": failure_by_benchmark,
        "tool_failure_breakdown": tool_failure_breakdown,
        "patterns": patterns,
        "recommendations": recommendations,
    }
365
+
366
+
367
+ def _generate_recommendations(
368
+ category_counter: Counter,
369
+ total_failures: int,
370
+ patterns: dict,
371
+ tool_failure_breakdown: dict,
372
+ ) -> list[str]:
373
+ """Generate actionable recommendations based on failure analysis.
374
+
375
+ Args:
376
+ category_counter: Counter of failure categories.
377
+ total_failures: Total number of failures.
378
+ patterns: Extracted failure patterns.
379
+ tool_failure_breakdown: Tool failure details.
380
+
381
+ Returns:
382
+ List of recommendation strings.
383
+ """
384
+ recommendations = []
385
+
386
+ if total_failures == 0:
387
+ return ["All tasks resolved successfully. No improvements needed."]
388
+
389
+ # Timeout recommendations
390
+ timeout_count = category_counter.get(FailureCategory.TIMEOUT, 0)
391
+ if timeout_count > 0:
392
+ pct = timeout_count / total_failures * 100
393
+ recommendations.append(
394
+ f"Timeouts account for {pct:.0f}% of failures ({timeout_count} tasks). "
395
+ "Consider increasing the timeout limit or optimizing agent efficiency."
396
+ )
397
+
398
+ # No patch recommendations
399
+ no_patch_count = category_counter.get(FailureCategory.NO_PATCH, 0)
400
+ if no_patch_count > 0:
401
+ pct = no_patch_count / total_failures * 100
402
+ recommendations.append(
403
+ f"No patch generated for {pct:.0f}% of failures ({no_patch_count} tasks). "
404
+ "The agent may need better prompting or more context about the codebase."
405
+ )
406
+
407
+ # Tool error recommendations
408
+ tool_error_count = category_counter.get(FailureCategory.TOOL_ERROR, 0)
409
+ if tool_error_count > 0:
410
+ pct = tool_error_count / total_failures * 100
411
+ recommendations.append(
412
+ f"Tool errors account for {pct:.0f}% of failures ({tool_error_count} tasks). "
413
+ "Check MCP server stability and tool reliability."
414
+ )
415
+
416
+ # Environment error recommendations
417
+ env_error_count = category_counter.get(FailureCategory.ENVIRONMENT_ERROR, 0)
418
+ if env_error_count > 0:
419
+ pct = env_error_count / total_failures * 100
420
+ recommendations.append(
421
+ f"Environment errors account for {pct:.0f}% of failures ({env_error_count} tasks). "
422
+ "Review Docker configuration and resource allocation."
423
+ )
424
+
425
+ # Wrong answer recommendations
426
+ wrong_answer_count = category_counter.get(FailureCategory.WRONG_ANSWER, 0)
427
+ if wrong_answer_count > 0:
428
+ pct = wrong_answer_count / total_failures * 100
429
+ recommendations.append(
430
+ f"Wrong answers account for {pct:.0f}% of failures ({wrong_answer_count} tasks). "
431
+ "The agent generates patches but they do not pass tests. "
432
+ "Consider improving the agent's debugging and test validation capabilities."
433
+ )
434
+
435
+ # Compilation error recommendations
436
+ compile_count = category_counter.get(FailureCategory.COMPILATION_ERROR, 0)
437
+ if compile_count > 0:
438
+ pct = compile_count / total_failures * 100
439
+ recommendations.append(
440
+ f"Compilation errors account for {pct:.0f}% of failures ({compile_count} tasks). "
441
+ "The agent is generating syntactically invalid code. "
442
+ "Consider adding a syntax check step before submitting patches."
443
+ )
444
+
445
+ # Zero tool usage pattern
446
+ zero_tool_count = patterns.get("zero_tool_failures", 0)
447
+ if zero_tool_count > 0:
448
+ recommendations.append(
449
+ f"{zero_tool_count} failed tasks had zero tool calls. "
450
+ "The agent may not be discovering or using available tools."
451
+ )
452
+
453
+ # High-failure tools
454
+ for tool_name, stats in tool_failure_breakdown.items():
455
+ if stats["failure_rate"] > 0.3 and stats["calls"] >= 5:
456
+ recommendations.append(
457
+ f"Tool '{tool_name}' has a {stats['failure_rate']:.0%} failure rate "
458
+ f"across {stats['calls']} calls in failed tasks. "
459
+ "Investigate this tool's reliability."
460
+ )
461
+
462
+ return recommendations
463
+
464
+
465
def format_failure_report(report: dict) -> str:
    """Render a failure-analysis report dict as human-readable text.

    Sections with no data (empty categories, no recommendations, etc.) are
    omitted entirely.

    Args:
        report: Report dictionary from generate_failure_report.

    Returns:
        Formatted multi-line string suitable for console output or logging.
    """
    rule = "=" * 60
    divider = "-" * 40
    out: list[str] = [rule, "FAILURE ANALYSIS REPORT", rule, ""]

    # Overview section is always present.
    out.append("OVERVIEW")
    out.append(divider)
    out.append(f"Total Results: {report['total_results']}")
    out.append(f"Total Failures: {report['total_failures']}")
    out.append(f"Failure Rate: {report['failure_rate']:.1%}")
    out.append("")

    categories = report.get("category_distribution", {})
    if categories:
        out.append("FAILURE CATEGORIES")
        out.append(divider)
        for name, count in categories.items():
            share = report["category_percentages"].get(name, 0.0)
            out.append(f" {name:<25} {count:>4} ({share:.1%})")
        out.append("")

    top_errors = report.get("common_error_messages", [])
    if top_errors:
        out.append("COMMON ERROR MESSAGES")
        out.append(divider)
        for message, count in top_errors[:5]:
            # Truncate long messages to keep the report readable.
            shown = message[:70] + "..." if len(message) > 70 else message
            out.append(f" [{count}x] {shown}")
        out.append("")

    by_benchmark = report.get("failure_by_benchmark", {})
    if by_benchmark:
        out.append("FAILURE RATE BY BENCHMARK")
        out.append(divider)
        for prefix, stats in list(by_benchmark.items())[:10]:
            out.append(
                f" {prefix:<35} {stats['failures']}/{stats['total']} ({stats['failure_rate']:.1%})"
            )
        out.append("")

    # Only tools that actually failed at least once are worth listing.
    breakdown = report.get("tool_failure_breakdown", {})
    broken_tools = {name: s for name, s in breakdown.items() if s["failures"] > 0}
    if broken_tools:
        out.append("TOOL FAILURES (in failed tasks)")
        out.append(divider)
        for name, s in list(broken_tools.items())[:10]:
            out.append(
                f" {name:<25} {s['failures']}/{s['calls']} failed "
                f"({s['failure_rate']:.1%})"
            )
        out.append("")

    pattern_info = report.get("patterns", {})
    no_tools = pattern_info.get("zero_tool_failures", 0)
    heavy = pattern_info.get("high_token_failures", 0)
    if no_tools > 0 or heavy > 0:
        out.append("PATTERNS")
        out.append(divider)
        if no_tools > 0:
            out.append(f" Tasks with zero tool calls: {no_tools}")
        if heavy > 0:
            out.append(f" Failures with high token usage: {heavy}")
        out.append("")

    advice = report.get("recommendations", [])
    if advice:
        out.append("RECOMMENDATIONS")
        out.append(divider)
        for idx, tip in enumerate(advice, 1):
            out.append(f" {idx}. {tip}")
        out.append("")

    out.append(rule)

    return "\n".join(out)