mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/graceful_degradation.py ADDED
@@ -0,0 +1,277 @@
+"""Graceful degradation for benchmark evaluation.
+
+Provides fault-tolerant execution of benchmark tasks with failure isolation,
+classification, checkpointing, and configurable error handling policies.
+"""
+
+import asyncio
+import json
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+
+class FailureType(Enum):
+    """Classification of task failure types."""
+
+    TRANSIENT = "transient"
+    PERMANENT = "permanent"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class TaskFailure:
+    """Record of a single task failure.
+
+    Attributes:
+        task_id: Identifier of the failed task.
+        error: Error message describing the failure.
+        failure_type: Classification of the failure.
+        timestamp: ISO 8601 timestamp of when the failure occurred.
+        retryable: Whether the task could be retried.
+    """
+
+    task_id: str
+    error: str
+    failure_type: FailureType
+    timestamp: str
+    retryable: bool = True
+
+
+@dataclass
+class ExecutionCheckpoint:
+    """Checkpoint of execution state for crash recovery and resumption.
+
+    Tracks which tasks have completed, failed, or been skipped during
+    an evaluation run. Can be serialized to/from JSON for persistence.
+
+    Attributes:
+        completed_tasks: List of task IDs that completed successfully.
+        failed_tasks: List of TaskFailure records for failed tasks.
+        skipped_tasks: List of task IDs that were skipped.
+    """
+
+    completed_tasks: list[str] = field(default_factory=list)
+    failed_tasks: list[TaskFailure] = field(default_factory=list)
+    skipped_tasks: list[str] = field(default_factory=list)
+
+    def save(self, path: Path) -> None:
+        """Save checkpoint to a JSON file.
+
+        Args:
+            path: File path to write the checkpoint to.
+        """
+        data = {
+            "completed": self.completed_tasks,
+            "failed": [
+                {
+                    "task_id": f.task_id,
+                    "error": f.error,
+                    "type": f.failure_type.value,
+                    "timestamp": f.timestamp,
+                    "retryable": f.retryable,
+                }
+                for f in self.failed_tasks
+            ],
+            "skipped": self.skipped_tasks,
+        }
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(json.dumps(data, indent=2))
+
+    @classmethod
+    def load(cls, path: Path) -> "ExecutionCheckpoint":
+        """Load checkpoint from a JSON file.
+
+        Args:
+            path: File path to read the checkpoint from.
+
+        Returns:
+            ExecutionCheckpoint populated from the file.
+        """
+        data = json.loads(path.read_text())
+        return cls(
+            completed_tasks=data["completed"],
+            failed_tasks=[
+                TaskFailure(
+                    task_id=f["task_id"],
+                    error=f["error"],
+                    failure_type=FailureType(f["type"]),
+                    timestamp=f["timestamp"],
+                    retryable=f.get("retryable", True),
+                )
+                for f in data["failed"]
+            ],
+            skipped_tasks=data["skipped"],
+        )
+
+
+# Exception types considered transient (may succeed on retry)
+_TRANSIENT_ERRORS = (
+    TimeoutError,
+    asyncio.TimeoutError,
+    ConnectionError,
+    ConnectionResetError,
+    ConnectionRefusedError,
+    ConnectionAbortedError,
+    OSError,
+    IOError,
+)
+
+# Exception types considered permanent (will not succeed on retry)
+_PERMANENT_ERRORS = (
+    ValueError,
+    TypeError,
+    KeyError,
+    IndexError,
+    AttributeError,
+    NotImplementedError,
+    SyntaxError,
+    ImportError,
+)
+
+
+def classify_failure(error: Exception) -> FailureType:
+    """Classify an error as transient, permanent, or unknown.
+
+    Transient errors are those that may succeed on retry (timeouts,
+    connection issues, resource exhaustion). Permanent errors are
+    programming or configuration errors that will not resolve on retry.
+
+    Args:
+        error: The exception to classify.
+
+    Returns:
+        FailureType indicating the classification.
+    """
+    if isinstance(error, _TRANSIENT_ERRORS):
+        return FailureType.TRANSIENT
+    if isinstance(error, _PERMANENT_ERRORS):
+        return FailureType.PERMANENT
+    return FailureType.UNKNOWN
+
+
+class GracefulExecutor:
+    """Executor that provides graceful degradation for benchmark tasks.
+
+    Isolates task failures so that one failing task does not prevent
+    other tasks from executing. Supports configurable error policies
+    including continue-on-error and max-failure thresholds.
+
+    Args:
+        continue_on_error: If True, continue executing tasks after failures.
+            If False, stop on the first failure.
+        max_failures: Maximum number of failures before stopping execution.
+            None means no limit (continue until all tasks are processed).
+        checkpoint_dir: Directory to save execution checkpoints for crash recovery.
+            None means no checkpointing.
+    """
+
+    def __init__(
+        self,
+        continue_on_error: bool = True,
+        max_failures: int | None = None,
+        checkpoint_dir: Path | None = None,
+    ) -> None:
+        """Initialize GracefulExecutor.
+
+        Args:
+            continue_on_error: Whether to continue after task failures.
+            max_failures: Maximum failures before halting. None for unlimited.
+            checkpoint_dir: Directory for saving checkpoint files.
+        """
+        self.continue_on_error = continue_on_error
+        self.max_failures = max_failures
+        self.checkpoint_dir = checkpoint_dir
+        self.checkpoint = ExecutionCheckpoint()
+
+    async def execute_task(self, task_id: str, coro: Any) -> Any | None:
+        """Execute a single task with failure isolation.
+
+        Wraps the coroutine execution in error handling that records
+        failures without propagating them (when continue_on_error is True).
+
+        Args:
+            task_id: Identifier for the task being executed.
+            coro: Awaitable coroutine to execute.
+
+        Returns:
+            The result of the coroutine, or None if the task failed.
+        """
+        try:
+            result = await coro
+            self.checkpoint.completed_tasks.append(task_id)
+            self._save_checkpoint()
+            return result
+        except Exception as e:
+            failure_type = classify_failure(e)
+            failure = TaskFailure(
+                task_id=task_id,
+                error=str(e),
+                failure_type=failure_type,
+                timestamp=datetime.now(timezone.utc).isoformat(),
+                retryable=failure_type == FailureType.TRANSIENT,
+            )
+            self.checkpoint.failed_tasks.append(failure)
+            self._save_checkpoint()
+            return None
+
+    def should_continue(self) -> bool:
+        """Determine whether execution should continue.
+
+        Considers the continue_on_error flag and the max_failures threshold.
+
+        Returns:
+            True if execution should continue, False if it should stop.
+        """
+        failure_count = len(self.checkpoint.failed_tasks)
+
+        # If any failure occurred and continue_on_error is False, stop
+        if not self.continue_on_error and failure_count > 0:
+            return False
+
+        # If max_failures is set and we've reached it, stop
+        if self.max_failures is not None and failure_count >= self.max_failures:
+            return False
+
+        return True
+
+    def get_partial_report(self) -> dict[str, Any]:
+        """Generate a report of execution progress including partial results.
+
+        Returns:
+            Dictionary with execution statistics and failure details.
+        """
+        completed_count = len(self.checkpoint.completed_tasks)
+        failed_count = len(self.checkpoint.failed_tasks)
+        skipped_count = len(self.checkpoint.skipped_tasks)
+        total_tasks = completed_count + failed_count + skipped_count
+
+        success_rate = completed_count / total_tasks if total_tasks > 0 else 0.0
+
+        failures = [
+            {
+                "task_id": f.task_id,
+                "error": f.error,
+                "failure_type": f.failure_type.value,
+                "timestamp": f.timestamp,
+                "retryable": f.retryable,
+            }
+            for f in self.checkpoint.failed_tasks
+        ]
+
+        return {
+            "total_tasks": total_tasks,
+            "completed_count": completed_count,
+            "failed_count": failed_count,
+            "skipped_count": skipped_count,
+            "success_rate": success_rate,
+            "failures": failures,
+        }
+
+    def _save_checkpoint(self) -> None:
+        """Save checkpoint to disk if checkpoint_dir is configured."""
+        if self.checkpoint_dir is not None:
+            checkpoint_path = self.checkpoint_dir / "checkpoint.json"
+            self.checkpoint.save(checkpoint_path)
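
For orientation, the following is a minimal usage sketch of the new graceful-degradation API as it appears in this diff. It uses only names defined above (GracefulExecutor, execute_task, should_continue, get_partial_report); the run_task coroutine and the task IDs are hypothetical placeholders, not part of mcpbr.

import asyncio
from pathlib import Path

from mcpbr.graceful_degradation import GracefulExecutor


async def run_task(task_id: str) -> dict:
    # Hypothetical per-task coroutine; stands in for the real benchmark work.
    await asyncio.sleep(0)
    return {"task_id": task_id, "resolved": True}


async def main() -> None:
    executor = GracefulExecutor(
        continue_on_error=True,
        max_failures=5,
        checkpoint_dir=Path("./checkpoints"),  # checkpoint.json written after each task
    )
    for task_id in ["task-1", "task-2", "task-3"]:  # hypothetical task IDs
        if not executor.should_continue():
            break
        await executor.execute_task(task_id, run_task(task_id))
    report = executor.get_partial_report()
    print(f"{report['completed_count']}/{report['total_tasks']} tasks completed")


asyncio.run(main())

Failures raised inside run_task would be recorded in the checkpoint rather than propagated, so a later run could reload checkpoint.json via ExecutionCheckpoint.load() and skip already-completed task IDs.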
mcpbr/harness.py CHANGED
@@ -418,6 +418,7 @@ async def _run_mcp_evaluation(
 
     start_time = time.time()
    env: TaskEnvironment | None = None
+    agent_result: AgentResult | None = None
    try:
        # Track Docker environment creation time
        docker_start = time.time()
@@ -480,10 +481,15 @@
            return result
 
    except asyncio.TimeoutError:
-        # Note: The agent harness should have captured partial statistics in the AgentResult
-        # before raising TimeoutError, but this is a fallback for unexpected timeout locations
        end_time = time.time()
        runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the timeout
+        # (timeout may have occurred during evaluation, not during agent solve)
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["status"] = "timeout"
+            result["error"] = "Evaluation timed out after agent completed"
+            return result
        cost = calculate_cost(config.model, 0, 0)
        return {
            "resolved": False,
@@ -499,6 +505,11 @@
    except Exception as e:
        end_time = time.time()
        runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the error
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["error"] = str(e)
+            return result
        cost = calculate_cost(config.model, 0, 0)
        return {
            "resolved": False,
@@ -562,6 +573,7 @@ async def _run_baseline_evaluation(
 
    start_time = time.time()
    env: TaskEnvironment | None = None
+    agent_result: AgentResult | None = None
    try:
        # Track Docker environment creation time
        docker_start = time.time()
@@ -622,10 +634,15 @@
            return result
 
    except asyncio.TimeoutError:
-        # Note: The agent harness should have captured partial statistics in the AgentResult
-        # before raising TimeoutError, but this is a fallback for unexpected timeout locations
        end_time = time.time()
        runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the timeout
+        # (timeout may have occurred during evaluation, not during agent solve)
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["status"] = "timeout"
+            result["error"] = "Evaluation timed out after agent completed"
+            return result
        cost = calculate_cost(config.model, 0, 0)
        return {
            "resolved": False,
@@ -641,6 +658,11 @@
    except Exception as e:
        end_time = time.time()
        runtime_seconds = end_time - start_time
+        # Preserve agent metrics if the agent completed before the error
+        if agent_result is not None:
+            result = agent_result_to_dict(agent_result, None, config.model, runtime_seconds)
+            result["error"] = str(e)
+            return result
        cost = calculate_cost(config.model, 0, 0)
        return {
            "resolved": False,
@@ -1182,6 +1204,18 @@ async def run_evaluation(
            progress.stop()
    finally:
        await docker_manager.cleanup_all()
+        # Force-shutdown the default executor to prevent asyncio.run() from
+        # hanging during cleanup. Docker SDK background threads (urllib3
+        # connection pool) may linger after client.close(), causing
+        # executor.shutdown(wait=True) to block indefinitely.
+        try:
+            loop = asyncio.get_running_loop()
+            executor = getattr(loop, "_default_executor", None)
+            if executor is not None:
+                executor.shutdown(wait=False, cancel_futures=True)
+            loop._default_executor = None
+        except RuntimeError as exc:
+            console.print(f"[yellow]Default executor shutdown skipped: {exc}[/yellow]")
 
    # Check if we're in comparison mode
    if config.comparison_mode:
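
The executor-shutdown workaround in the last hunk can be reproduced outside the harness. The sketch below, which uses only the standard library, shows the same pattern; like the diff itself, it relies on the private _default_executor attribute of the event loop, so it is a pragmatic workaround rather than a documented API.

import asyncio
import time


def _blocking_call() -> None:
    # Stand-in for a Docker SDK call that runs in the loop's default thread pool.
    time.sleep(0.1)


async def main() -> None:
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _blocking_call)  # populates the default executor
    # Same cleanup pattern as the harness: drop the default executor without
    # waiting on its worker threads, so asyncio.run() cannot hang at exit.
    executor = getattr(loop, "_default_executor", None)
    if executor is not None:
        executor.shutdown(wait=False, cancel_futures=True)
        loop._default_executor = None


asyncio.run(main())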
mcpbr/languages.py ADDED
@@ -0,0 +1,228 @@
+"""Multi-language support for code generation benchmarks.
+
+This module provides:
+- Language enum defining supported programming languages.
+- LanguageConfig dataclass with per-language Docker, run, compile, and test settings.
+- detect_language() to identify the language from a filename or code snippet.
+- get_language_config() to retrieve configuration for a given language.
+- get_supported_languages() to list all supported language names.
+- CrossLanguageMetrics for comparing benchmark performance across languages.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+
+class Language(Enum):
+    """Supported programming languages for code generation benchmarks."""
+
+    PYTHON = "python"
+    JAVASCRIPT = "javascript"
+    TYPESCRIPT = "typescript"
+    JAVA = "java"
+    GO = "go"
+
+
+@dataclass
+class LanguageConfig:
+    """Configuration for running and testing code in a specific language.
+
+    Attributes:
+        name: Human-readable language name.
+        language: The Language enum member.
+        file_extension: File extension including the dot (e.g., ".py").
+        docker_image: Docker image used to run code in this language.
+        run_command: Command template to run a file. Use {file} as placeholder.
+        test_framework: Name of the default test framework for this language.
+        compile_command: Optional command template to compile. None for interpreted languages.
+    """
+
+    name: str
+    language: Language
+    file_extension: str
+    docker_image: str
+    run_command: str
+    test_framework: str
+    compile_command: str | None = None
+
+
+LANGUAGE_CONFIGS: dict[Language, LanguageConfig] = {
+    Language.PYTHON: LanguageConfig(
+        name="Python",
+        language=Language.PYTHON,
+        file_extension=".py",
+        docker_image="python:3.11-slim",
+        run_command="python {file}",
+        test_framework="pytest",
+    ),
+    Language.JAVASCRIPT: LanguageConfig(
+        name="JavaScript",
+        language=Language.JAVASCRIPT,
+        file_extension=".js",
+        docker_image="node:20-slim",
+        run_command="node {file}",
+        test_framework="jest",
+    ),
+    Language.TYPESCRIPT: LanguageConfig(
+        name="TypeScript",
+        language=Language.TYPESCRIPT,
+        file_extension=".ts",
+        docker_image="node:20-slim",
+        run_command="npx ts-node {file}",
+        test_framework="jest",
+        compile_command="npx tsc {file}",
+    ),
+    Language.JAVA: LanguageConfig(
+        name="Java",
+        language=Language.JAVA,
+        file_extension=".java",
+        docker_image="eclipse-temurin:17-jdk-jammy",
+        run_command="java {file}",  # Requires Java 11+ single-file source execution
+        test_framework="junit",
+        compile_command="javac {file}",
+    ),
+    Language.GO: LanguageConfig(
+        name="Go",
+        language=Language.GO,
+        file_extension=".go",
+        docker_image="golang:1.21-alpine",
+        run_command="go run {file}",
+        test_framework="go test",
+        compile_command="go build {file}",
+    ),
+}
+
+# Map file extensions to languages for filename-based detection.
+_EXTENSION_MAP: dict[str, Language] = {
+    config.file_extension: lang for lang, config in LANGUAGE_CONFIGS.items()
+}
+
+# Ordered list of (pattern, language) tuples for code content detection.
+# More specific patterns come first to avoid false positives.
+_CODE_PATTERNS: list[tuple[re.Pattern[str], Language]] = [
+    # Go: package declaration is highly distinctive
+    (re.compile(r"^package\s+\w+", re.MULTILINE), Language.GO),
+    (re.compile(r"\bfunc\s+\w+\s*\("), Language.GO),
+    # Java: class declaration with access modifier
+    (re.compile(r"\bpublic\s+class\s+\w+"), Language.JAVA),
+    (re.compile(r"\bpublic\s+static\s+void\s+main"), Language.JAVA),
+    # TypeScript: type annotations on const/let/var, or interface keyword
+    (re.compile(r"\b(?:const|let|var)\s+\w+\s*:\s*\w+"), Language.TYPESCRIPT),
+    (re.compile(r"\binterface\s+\w+\s*\{"), Language.TYPESCRIPT),
+    # JavaScript: const/let/var without type annotations, require(), console.log
+    (re.compile(r"\brequire\s*\(\s*['\"]"), Language.JAVASCRIPT),
+    (re.compile(r"\bconsole\.log\s*\("), Language.JAVASCRIPT),
+    (re.compile(r"\b(?:const|let|var)\s+\w+\s*="), Language.JAVASCRIPT),
+    # Python: def/class with colon, import, print()
+    (re.compile(r"^def\s+\w+\s*\(.*\)\s*:", re.MULTILINE), Language.PYTHON),
+    (re.compile(r"^import\s+\w+", re.MULTILINE), Language.PYTHON),
+    (re.compile(r"\bprint\s*\("), Language.PYTHON),
+]
+
+
+def detect_language(code: str | None = None, filename: str | None = None) -> Language | None:
+    """Detect the programming language from a filename or code snippet.
+
+    Filename-based detection takes priority over code content analysis.
+
+    Args:
+        code: Source code string to analyze.
+        filename: Filename (with or without path) to check extension.
+
+    Returns:
+        The detected Language, or None if detection fails.
+    """
+    # Try filename-based detection first (higher confidence).
+    if filename:
+        _, ext = os.path.splitext(filename)
+        if ext in _EXTENSION_MAP:
+            return _EXTENSION_MAP[ext]
+
+    # Fall back to code content analysis.
+    if code:
+        for pattern, language in _CODE_PATTERNS:
+            if pattern.search(code):
+                return language
+
+    return None
+
+
+def get_language_config(language: Language) -> LanguageConfig:
+    """Get the configuration for a given language.
+
+    Args:
+        language: A Language enum member.
+
+    Returns:
+        The LanguageConfig for the specified language.
+    """
+    return LANGUAGE_CONFIGS[language]
+
+
+def get_supported_languages() -> list[str]:
+    """Return a list of all supported language name strings.
+
+    Returns:
+        List of language value strings (e.g., ["python", "javascript", ...]).
+    """
+    return [lang.value for lang in Language]
+
+
+@dataclass
+class CrossLanguageMetrics:
+    """Aggregated benchmark metrics across multiple programming languages.
+
+    Attributes:
+        language_scores: Mapping of language name to its pass rate (resolved ratio).
+        best_language: The language with the highest pass rate.
+        worst_language: The language with the lowest pass rate.
+        average_score: The mean pass rate across all languages.
+    """
+
+    language_scores: dict[str, float]
+    best_language: str
+    worst_language: str
+    average_score: float
+
+    @classmethod
+    def from_results(cls, results: dict[str, list[dict[str, Any]]]) -> CrossLanguageMetrics:
+        """Compute cross-language metrics from per-language result lists.
+
+        Each result dict is expected to have a ``"resolved"`` boolean key.
+        The pass rate for a language is the fraction of results where
+        ``resolved`` is ``True``.
+
+        Args:
+            results: Mapping of language name to list of result dicts.
+
+        Returns:
+            A CrossLanguageMetrics instance with computed scores.
+
+        Raises:
+            ValueError: If results is empty or any language has an empty result list.
+        """
+        if not results:
+            raise ValueError("results must not be empty")
+
+        language_scores: dict[str, float] = {}
+        for lang_name, lang_results in results.items():
+            if not lang_results:
+                raise ValueError(f"Result list for language '{lang_name}' must not be empty")
+            resolved_count = sum(1 for r in lang_results if r.get("resolved", False))
+            language_scores[lang_name] = resolved_count / len(lang_results)
+
+        best_language = max(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+        worst_language = min(language_scores, key=language_scores.get)  # type: ignore[arg-type]
+        average_score = sum(language_scores.values()) / len(language_scores)
+
+        return cls(
+            language_scores=language_scores,
+            best_language=best_language,
+            worst_language=worst_language,
+            average_score=average_score,
+        )
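
A short usage sketch for the new languages module, using only the functions and classes added above; the code snippet, filename, and result data are illustrative, not taken from a real run.

from mcpbr.languages import CrossLanguageMetrics, detect_language, get_language_config

# Filename-based detection takes priority over content analysis.
lang = detect_language(code="print('hi')", filename="solution.go")
assert lang is not None and lang.value == "go"

# Per-language run settings come from LANGUAGE_CONFIGS via get_language_config().
config = get_language_config(lang)
print(config.docker_image, config.run_command)  # golang:1.21-alpine go run {file}

# Cross-language comparison over per-language result lists (illustrative data).
metrics = CrossLanguageMetrics.from_results(
    {
        "python": [{"resolved": True}, {"resolved": False}],
        "go": [{"resolved": True}, {"resolved": True}],
    }
)
print(metrics.best_language, round(metrics.average_score, 2))  # go 0.75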