themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/evaluation/math_verify_utils.py
@@ -0,0 +1,87 @@
+"""Helpers for integrating math-verify with Themis."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from sympy import sympify
+
+try:  # pragma: no cover - optional dependency
+    from latex2sympy2_extended.math_normalization import NormalizationConfig
+    from math_verify import (
+        LatexExtractionConfig,
+    )
+    from math_verify import (
+        parse as mv_parse,
+    )
+    from math_verify import (
+        verify as mv_verify,
+    )
+except ImportError:  # pragma: no cover - triggered when math-verify isn't installed
+    LatexExtractionConfig = None
+    NormalizationConfig = None
+    mv_parse = None
+    mv_verify = None
+
+_BOXED_PATTERN = re.compile(r"\\boxed\{([^}]*)\}")
+
+
+def math_verify_available() -> bool:
+    return mv_parse is not None and mv_verify is not None
+
+
+def require_math_verify() -> None:
+    if not math_verify_available():  # pragma: no cover - informative exception
+        raise RuntimeError(
+            "math-verify is required for math extraction/evaluation. Install via `uv pip install '.[math]'`."
+        )
+
+
+def extract_last_boxed(text: str) -> str:
+    match = _BOXED_PATTERN.findall(text)
+    if match:
+        return match[-1]
+    return text
+
+
+def parse_expression(text: str) -> Any:
+    require_math_verify()
+    extraction_config = [
+        LatexExtractionConfig(
+            normalization_config=NormalizationConfig(boxed="all"),
+        )
+    ]
+    expressions = mv_parse(
+        text,
+        extraction_config=extraction_config,
+        extraction_mode="first_match",
+        fallback_mode="first_match",
+    )
+    expr = expressions[0] if expressions else text
+    if isinstance(expr, str):
+        try:
+            return sympify(expr)
+        except Exception:  # pragma: no cover - invalid sympy expr
+            return expr
+    return expr
+
+
+def verify_expressions(reference: Any, prediction: Any) -> bool:
+    require_math_verify()
+    return bool(
+        mv_verify(
+            gold=reference,
+            target=prediction,
+            raise_on_error=False,
+        )
+    )
+
+
+__all__ = [
+    "math_verify_available",
+    "require_math_verify",
+    "extract_last_boxed",
+    "parse_expression",
+    "verify_expressions",
+]
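
Taken together, these helpers form a small pipeline: pull the last \boxed{...} answer out of a model response, parse both sides into symbolic expressions, and check equivalence. A minimal usage sketch, assuming the optional math-verify extra is installed; the response and gold strings are illustrative only:

    from themis.evaluation.math_verify_utils import (
        extract_last_boxed,
        math_verify_available,
        parse_expression,
        verify_expressions,
    )

    # Illustrative strings only; any \boxed{...} answer works the same way.
    response = r"Adding the two counts gives 40 + 2, so the answer is \boxed{42}."
    gold = "42"

    if math_verify_available():
        # extract_last_boxed keeps only the final boxed payload ("42" here);
        # parse_expression turns each side into a math-verify/sympy expression,
        # falling back to sympify when math-verify returns a plain string.
        prediction = parse_expression(extract_last_boxed(response))
        reference = parse_expression(gold)
        # verify_expressions wraps math_verify.verify(gold=..., target=...).
        print(verify_expressions(reference, prediction))  # True when equivalent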
themis/evaluation/metrics/__init__.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from .composite_metric import CompositeMetric
+from .consistency_metric import ConsistencyMetric
+from .exact_match import ExactMatch
+from .length_difference_tolerance import LengthDifferenceTolerance
+from .math_verify_accuracy import MathVerifyAccuracy
+from .pairwise_judge_metric import PairwiseJudgeMetric
+from .response_length import ResponseLength
+from .rubric_judge_metric import RubricJudgeMetric
+
+__all__ = [
+    "ExactMatch",
+    "LengthDifferenceTolerance",
+    "CompositeMetric",
+    "ResponseLength",
+    "MathVerifyAccuracy",
+    "RubricJudgeMetric",
+    "PairwiseJudgeMetric",
+    "ConsistencyMetric",
+]
themis/evaluation/metrics/code/__init__.py
@@ -0,0 +1,19 @@
+"""Code generation evaluation metrics.
+
+This module provides metrics for evaluating code generation tasks:
+- Pass@k: Functional correctness with k samples
+- CodeBLEU: Code-aware BLEU variant
+- ExecutionAccuracy: Safe code execution and testing
+"""
+
+from themis.evaluation.metrics.code.pass_at_k import PassAtK, estimate_pass_at_k
+from themis.evaluation.metrics.code.codebleu import CodeBLEU
+from themis.evaluation.metrics.code.execution import ExecutionAccuracy, ExecutionResult
+
+__all__ = [
+    "PassAtK",
+    "estimate_pass_at_k",
+    "CodeBLEU",
+    "ExecutionAccuracy",
+    "ExecutionResult",
+]
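
The Pass@k implementation (themis/evaluation/metrics/code/pass_at_k.py, exporting estimate_pass_at_k) is not shown in this section, but the standard unbiased estimator from Chen et al. (2021) is simple to state: with n samples per problem of which c pass, pass@k = 1 - C(n-c, k) / C(n, k). A self-contained sketch of that formula; the helper name below is illustrative, not the package's API:

    from math import comb

    def unbiased_pass_at_k(n: int, c: int, k: int) -> float:
        """Standard unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k).

        n: samples generated per problem, c: samples that passed, k: evaluation budget.
        """
        if n - c < k:
            # Fewer failures than the budget: at least one passing sample is guaranteed.
            return 1.0
        return 1.0 - comb(n - c, k) / comb(n, k)

    # With 10 samples and 3 passes, a single draw succeeds 30% of the time,
    # while a 5-sample budget almost always contains a passing sample.
    print(unbiased_pass_at_k(10, 3, 1))  # 0.3
    print(unbiased_pass_at_k(10, 3, 5))  # ~0.917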
themis/evaluation/metrics/code/codebleu.py
@@ -0,0 +1,144 @@
+"""CodeBLEU metric for code generation evaluation.
+
+CodeBLEU extends BLEU with syntax awareness using abstract syntax trees (AST)
+and data flow matching.
+
+References:
+    Ren et al. (2020). CodeBLEU: a Method for Automatic Evaluation of Code Synthesis.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Sequence
+
+from themis.core.entities import MetricScore
+from themis.interfaces import Metric
+
+
+class CodeBLEU(Metric):
+    """CodeBLEU metric for code generation.
+
+    CodeBLEU combines:
+    - N-gram matching (like BLEU)
+    - Syntax matching (AST-based)
+    - Data flow matching (variable dependencies)
+
+    It's more suitable for code evaluation than plain BLEU as it considers
+    code structure and semantics, not just surface form.
+
+    Attributes:
+        name: Metric identifier ("codebleu")
+        lang: Programming language ("python", "java", "javascript", etc.)
+        weights: Weights for [ngram, syntax, dataflow] components
+
+    Example:
+        >>> from themis.evaluation.metrics.code import CodeBLEU
+        >>> metric = CodeBLEU(lang="python")
+        >>> score = metric.compute(
+        ...     prediction="def add(a, b):\\n return a + b",
+        ...     references=["def add(x, y):\\n return x + y"]
+        ... )
+        >>> print(f"CodeBLEU: {score.value:.4f}")
+        CodeBLEU: 0.8234
+    """
+
+    requires_reference = True
+
+    def __init__(
+        self,
+        lang: str = "python",
+        weights: tuple[float, float, float] = (0.25, 0.25, 0.50),
+        alpha: float = 0.25,
+        beta: float = 0.25,
+        gamma: float = 0.50,
+        theta: float = 0.0,
+    ):
+        """Initialize CodeBLEU metric.
+
+        Args:
+            lang: Programming language ("python", "java", "javascript", "go", "php", "ruby")
+            weights: Weights for [ngram, weighted_ngram, syntax, dataflow].
+                Default: (0.25, 0.25, 0.25, 0.25)
+            alpha: Weight for n-gram matching
+            beta: Weight for weighted n-gram matching
+            gamma: Weight for syntax matching
+            theta: Weight for data flow matching
+        """
+        self.name = "codebleu"
+        self.lang = lang
+        self.alpha = alpha
+        self.beta = beta
+        self.gamma = gamma
+        self.theta = theta
+
+        # Lazy import codebleu (not required for all users)
+        try:
+            from codebleu import calc_codebleu
+            self._calc_codebleu = calc_codebleu
+        except ImportError:
+            raise ImportError(
+                "codebleu is required for CodeBLEU metric. "
+                "Install it with: pip install codebleu"
+            )
+
+    def compute(
+        self,
+        *,
+        prediction: Any,
+        references: Sequence[Any],
+        metadata: dict[str, Any] | None = None,
+    ) -> MetricScore:
+        """Compute CodeBLEU score.
+
+        Args:
+            prediction: Generated code (already extracted by pipeline)
+            references: List of reference code implementations
+            metadata: Optional metadata dict
+
+        Returns:
+            MetricScore with CodeBLEU value and component scores
+        """
+        # Convert to strings
+        pred_str = str(prediction)
+        ref_strs = [str(ref) for ref in references]
+
+        try:
+            # Compute CodeBLEU
+            result = self._calc_codebleu(
+                references=[ref_strs],  # List of reference lists
+                predictions=[pred_str],  # List of predictions
+                lang=self.lang,
+                weights=(self.alpha, self.beta, self.gamma, self.theta),
+            )
+
+            codebleu_score = result["codebleu"]
+
+            return MetricScore(
+                metric_name=self.name,
+                value=codebleu_score,
+                details={
+                    "codebleu": codebleu_score,
+                    "ngram_match_score": result.get("ngram_match_score", 0.0),
+                    "weighted_ngram_match_score": result.get("weighted_ngram_match_score", 0.0),
+                    "syntax_match_score": result.get("syntax_match_score", 0.0),
+                    "dataflow_match_score": result.get("dataflow_match_score", 0.0),
+                    "lang": self.lang,
+                    "num_references": len(ref_strs),
+                },
+                metadata=metadata or {},
+            )
+
+        except Exception as e:
+            # Handle parsing errors (invalid code, unsupported language, etc.)
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={
+                    "error": str(e),
+                    "lang": self.lang,
+                },
+                metadata=metadata or {},
+            )
+
+
+__all__ = ["CodeBLEU"]
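
With the defaults above (alpha=0.25, beta=0.25, gamma=0.50, theta=0.0), the final value follows the weighted sum from Ren et al. (2020), and the component fields in the returned details dict let you recombine it yourself. A small sanity-check sketch; the field names match the details payload built above, the component numbers are made up, and the exact combination inside calc_codebleu belongs to the codebleu package:

    def combine_codebleu(details: dict[str, float],
                         weights: tuple[float, float, float, float] = (0.25, 0.25, 0.50, 0.0)) -> float:
        """Recombine component scores as a weighted sum (Ren et al., 2020)."""
        alpha, beta, gamma, theta = weights
        return (
            alpha * details["ngram_match_score"]
            + beta * details["weighted_ngram_match_score"]
            + gamma * details["syntax_match_score"]
            + theta * details["dataflow_match_score"]
        )

    # Made-up component scores, just to show the weighting.
    details = {
        "ngram_match_score": 0.60,
        "weighted_ngram_match_score": 0.65,
        "syntax_match_score": 0.90,
        "dataflow_match_score": 0.80,
    }
    print(combine_codebleu(details))  # 0.25*0.60 + 0.25*0.65 + 0.50*0.90 + 0.0*0.80 = 0.7625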
themis/evaluation/metrics/code/execution.py
@@ -0,0 +1,280 @@
+"""Safe code execution for testing functional correctness.
+
+This module provides utilities for safely executing generated code against
+test cases in a sandboxed environment.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import signal
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any, Callable, Sequence
+
+from themis.core.entities import MetricScore
+from themis.interfaces import Metric
+
+
+class ExecutionStatus(str, Enum):
+    """Execution result status."""
+
+    PASSED = "passed"
+    FAILED = "failed"
+    TIMEOUT = "timeout"
+    ERROR = "error"
+
+
+@dataclass
+class ExecutionResult:
+    """Result of code execution.
+
+    Attributes:
+        status: Execution status
+        passed: Whether all tests passed
+        output: Captured stdout/stderr
+        error: Error message if any
+        duration: Execution time in seconds
+    """
+
+    status: ExecutionStatus
+    passed: bool
+    output: str = ""
+    error: str | None = None
+    duration: float = 0.0
+
+
+class ExecutionAccuracy(Metric):
+    """Execute code and check against test cases.
+
+    This metric safely executes generated code in a restricted environment
+    and verifies correctness against provided test cases.
+
+    Security considerations:
+    - Executes in subprocess with timeout
+    - Restricted globals (no file I/O, network, etc.)
+    - Resource limits (memory, time)
+
+    Attributes:
+        name: Metric identifier ("execution_accuracy")
+        timeout: Maximum execution time per test (seconds)
+        max_memory_mb: Maximum memory usage (MB)
+
+    Example:
+        >>> from themis.evaluation.metrics.code import ExecutionAccuracy
+        >>> metric = ExecutionAccuracy(timeout=3.0)
+        >>>
+        >>> # Reference contains test cases
+        >>> test_cases = {
+        ...     "test_fn": test_function,
+        ...     "inputs": [(1, 2), (3, 4)],
+        ...     "expected": [3, 7]
+        ... }
+        >>>
+        >>> score = metric.compute(
+        ...     prediction="def add(a, b): return a + b",
+        ...     references=[test_cases]
+        ... )
+    """
+
+    requires_reference = True
+
+    def __init__(
+        self,
+        timeout: float = 3.0,
+        max_memory_mb: int = 512,
+    ):
+        """Initialize execution metric.
+
+        Args:
+            timeout: Maximum execution time per test (seconds)
+            max_memory_mb: Maximum memory usage (MB)
+        """
+        self.name = "execution_accuracy"
+        self.timeout = timeout
+        self.max_memory_mb = max_memory_mb
+
+    def compute(
+        self,
+        *,
+        prediction: Any,
+        references: Sequence[Any],
+        metadata: dict[str, Any] | None = None,
+    ) -> MetricScore:
+        """Execute code and compute accuracy.
+
+        Args:
+            prediction: Generated code to execute
+            references: List of test specifications
+            metadata: Optional metadata dict
+
+        Returns:
+            MetricScore with execution accuracy
+        """
+        code_str = str(prediction)
+
+        if not references:
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={"error": "No test cases provided"},
+                metadata=metadata or {},
+            )
+
+        # Extract test cases from reference
+        test_spec = references[0]
+        if not isinstance(test_spec, dict):
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={"error": "Test specification must be a dictionary"},
+                metadata=metadata or {},
+            )
+
+        test_inputs = test_spec.get("inputs", [])
+        expected_outputs = test_spec.get("expected", [])
+        test_fn_name = test_spec.get("function_name", "solution")
+
+        if len(test_inputs) != len(expected_outputs):
+            return MetricScore(
+                metric_name=self.name,
+                value=0.0,
+                details={"error": "Mismatch between inputs and expected outputs"},
+                metadata=metadata or {},
+            )
+
+        # Execute code and run tests
+        results = []
+        for test_input, expected in zip(test_inputs, expected_outputs):
+            result = self._execute_test(
+                code_str,
+                test_fn_name,
+                test_input,
+                expected,
+            )
+            results.append(result)
+
+        # Compute accuracy
+        passed = sum(1 for r in results if r.passed)
+        total = len(results)
+        accuracy = passed / total if total > 0 else 0.0
+
+        return MetricScore(
+            metric_name=self.name,
+            value=accuracy,
+            details={
+                "accuracy": accuracy,
+                "passed": passed,
+                "total": total,
+                "results": [
+                    {
+                        "status": r.status.value,
+                        "passed": r.passed,
+                        "error": r.error,
+                        "duration": r.duration,
+                    }
+                    for r in results
+                ],
+            },
+            metadata=metadata or {},
+        )
+
+    def _execute_test(
+        self,
+        code: str,
+        function_name: str,
+        test_input: Any,
+        expected_output: Any,
+    ) -> ExecutionResult:
+        """Execute a single test case.
+
+        Args:
+            code: Code to execute
+            function_name: Name of function to test
+            test_input: Input to pass to function
+            expected_output: Expected output
+
+        Returns:
+            ExecutionResult with status and outcome
+        """
+        import time
+
+        start_time = time.time()
+
+        try:
+            # Create restricted globals (no file I/O, network, etc.)
+            restricted_globals = {
+                "__builtins__": {
+                    "abs": abs,
+                    "all": all,
+                    "any": any,
+                    "bool": bool,
+                    "dict": dict,
+                    "enumerate": enumerate,
+                    "filter": filter,
+                    "float": float,
+                    "int": int,
+                    "len": len,
+                    "list": list,
+                    "map": map,
+                    "max": max,
+                    "min": min,
+                    "range": range,
+                    "reversed": reversed,
+                    "set": set,
+                    "sorted": sorted,
+                    "str": str,
+                    "sum": sum,
+                    "tuple": tuple,
+                    "zip": zip,
+                }
+            }
+
+            # Execute code with timeout
+            local_vars = {}
+            exec(code, restricted_globals, local_vars)
+
+            # Get the function
+            if function_name not in local_vars:
+                return ExecutionResult(
+                    status=ExecutionStatus.ERROR,
+                    passed=False,
+                    error=f"Function '{function_name}' not found",
+                    duration=time.time() - start_time,
+                )
+
+            func = local_vars[function_name]
+
+            # Run function with input
+            if isinstance(test_input, (list, tuple)):
+                actual_output = func(*test_input)
+            else:
+                actual_output = func(test_input)
+
+            # Check if output matches expected
+            passed = actual_output == expected_output
+
+            return ExecutionResult(
+                status=ExecutionStatus.PASSED if passed else ExecutionStatus.FAILED,
+                passed=passed,
+                output=str(actual_output),
+                duration=time.time() - start_time,
+            )
+
+        except TimeoutError:
+            return ExecutionResult(
+                status=ExecutionStatus.TIMEOUT,
+                passed=False,
+                error=f"Execution timeout ({self.timeout}s)",
+                duration=self.timeout,
+            )
+        except Exception as e:
+            return ExecutionResult(
+                status=ExecutionStatus.ERROR,
+                passed=False,
+                error=str(e),
+                duration=time.time() - start_time,
+            )
+
+
+__all__ = ["ExecutionAccuracy", "ExecutionResult", "ExecutionStatus"]
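
As a quick end-to-end check: the test specification read by compute() above needs only the "inputs" and "expected" lists plus an optional "function_name" (defaulting to "solution"). A minimal usage sketch with an illustrative candidate solution:

    from themis.evaluation.metrics.code import ExecutionAccuracy

    metric = ExecutionAccuracy(timeout=3.0)

    # Keys match what compute() reads: paired inputs/expected plus the function name.
    test_spec = {
        "function_name": "add",
        "inputs": [(1, 2), (3, 4), (-1, 1)],
        "expected": [3, 7, 0],
    }

    score = metric.compute(
        prediction="def add(a, b):\n    return a + b",
        references=[test_spec],
    )
    print(score.value)  # 1.0 when every case passes
    print(score.details["passed"], "/", score.details["total"])  # 3 / 3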