eval-framework 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +177 -0
  5. eval_framework/context/eval.py +121 -0
  6. eval_framework/context/local.py +78 -0
  7. eval_framework/evaluation_generator.py +234 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +432 -0
  16. eval_framework/llm/base.py +180 -0
  17. eval_framework/llm/huggingface.py +418 -0
  18. eval_framework/llm/mistral.py +88 -0
  19. eval_framework/llm/models.py +28 -0
  20. eval_framework/llm/openai.py +400 -0
  21. eval_framework/llm/vllm.py +554 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +166 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/aidanbench.py +28 -0
  29. eval_framework/metrics/completion/bleu.py +76 -0
  30. eval_framework/metrics/completion/chrf.py +62 -0
  31. eval_framework/metrics/completion/code_assertion.py +44 -0
  32. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  33. eval_framework/metrics/completion/comet.py +56 -0
  34. eval_framework/metrics/completion/concordance_index.py +38 -0
  35. eval_framework/metrics/completion/csv_format.py +102 -0
  36. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  37. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  38. eval_framework/metrics/completion/f1.py +42 -0
  39. eval_framework/metrics/completion/format_checker.py +56 -0
  40. eval_framework/metrics/completion/grid_difference.py +77 -0
  41. eval_framework/metrics/completion/ifeval.py +73 -0
  42. eval_framework/metrics/completion/json_format.py +179 -0
  43. eval_framework/metrics/completion/language_checker.py +74 -0
  44. eval_framework/metrics/completion/length_control.py +83 -0
  45. eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
  46. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  47. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  48. eval_framework/metrics/completion/repetition.py +88 -0
  49. eval_framework/metrics/completion/rouge_1.py +35 -0
  50. eval_framework/metrics/completion/rouge_2.py +45 -0
  51. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  52. eval_framework/metrics/completion/rouge_l.py +52 -0
  53. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  54. eval_framework/metrics/completion/ter.py +67 -0
  55. eval_framework/metrics/completion/text_counter.py +182 -0
  56. eval_framework/metrics/efficiency/__init__.py +0 -0
  57. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  58. eval_framework/metrics/llm/__init__.py +0 -0
  59. eval_framework/metrics/llm/base.py +34 -0
  60. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  61. eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
  62. eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
  63. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  64. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  65. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  66. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  67. eval_framework/metrics/llm/graders/language.py +56 -0
  68. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  69. eval_framework/metrics/llm/graders/models.py +74 -0
  70. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  71. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  72. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  73. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  74. eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
  75. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  76. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  77. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  78. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  79. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  80. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
  81. eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
  82. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  83. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  84. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  85. eval_framework/metrics/llm/utils.py +20 -0
  86. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  87. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  88. eval_framework/metrics/loglikelihood/base.py +50 -0
  89. eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
  90. eval_framework/metrics/loglikelihood/dcs.py +43 -0
  91. eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
  92. eval_framework/metrics/loglikelihood/ternary.py +42 -0
  93. eval_framework/py.typed +0 -0
  94. eval_framework/response_generator.py +351 -0
  95. eval_framework/result_processors/__init__.py +0 -0
  96. eval_framework/result_processors/base.py +88 -0
  97. eval_framework/result_processors/hf_uploader.py +75 -0
  98. eval_framework/result_processors/result_processor.py +129 -0
  99. eval_framework/result_processors/wandb_uploader.py +137 -0
  100. eval_framework/run.py +369 -0
  101. eval_framework/run_direct.py +42 -0
  102. eval_framework/shared/types.py +227 -0
  103. eval_framework/tasks/__init__.py +6 -0
  104. eval_framework/tasks/base.py +392 -0
  105. eval_framework/tasks/benchmarks/__init__.py +0 -0
  106. eval_framework/tasks/benchmarks/aidanbench.py +211 -0
  107. eval_framework/tasks/benchmarks/arc.py +70 -0
  108. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  109. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  110. eval_framework/tasks/benchmarks/belebele.py +60 -0
  111. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  112. eval_framework/tasks/benchmarks/casehold.py +47 -0
  113. eval_framework/tasks/benchmarks/chembench.py +85 -0
  114. eval_framework/tasks/benchmarks/copa.py +64 -0
  115. eval_framework/tasks/benchmarks/duc.py +91 -0
  116. eval_framework/tasks/benchmarks/flores200.py +133 -0
  117. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  118. eval_framework/tasks/benchmarks/gpqa.py +201 -0
  119. eval_framework/tasks/benchmarks/gsm8k.py +150 -0
  120. eval_framework/tasks/benchmarks/hellaswag.py +69 -0
  121. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  122. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  123. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  124. eval_framework/tasks/benchmarks/include.py +119 -0
  125. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  126. eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
  127. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  128. eval_framework/tasks/benchmarks/mmlu.py +215 -0
  129. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  130. eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
  131. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  132. eval_framework/tasks/benchmarks/openbookqa.py +85 -0
  133. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  134. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  135. eval_framework/tasks/benchmarks/piqa.py +64 -0
  136. eval_framework/tasks/benchmarks/quality.py +56 -0
  137. eval_framework/tasks/benchmarks/sciq.py +110 -0
  138. eval_framework/tasks/benchmarks/sphyr.py +79 -0
  139. eval_framework/tasks/benchmarks/squad.py +211 -0
  140. eval_framework/tasks/benchmarks/struct_eval.py +116 -0
  141. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  142. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  143. eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
  144. eval_framework/tasks/benchmarks/winogender.py +64 -0
  145. eval_framework/tasks/benchmarks/winogrande.py +69 -0
  146. eval_framework/tasks/benchmarks/winox.py +57 -0
  147. eval_framework/tasks/benchmarks/wmt.py +160 -0
  148. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  149. eval_framework/tasks/eval_config.py +136 -0
  150. eval_framework/tasks/perturbation.py +83 -0
  151. eval_framework/tasks/registry.py +186 -0
  152. eval_framework/tasks/task_loader.py +81 -0
  153. eval_framework/tasks/task_names.py +324 -0
  154. eval_framework/tasks/utils.py +584 -0
  155. eval_framework/utils/constants.py +9 -0
  156. eval_framework/utils/file_ops.py +245 -0
  157. eval_framework/utils/generate_task_docs.py +244 -0
  158. eval_framework/utils/helpers.py +32 -0
  159. eval_framework/utils/logging.py +62 -0
  160. eval_framework/utils/packaging.py +52 -0
  161. eval_framework/utils/tqdm_handler.py +14 -0
  162. eval_framework-0.2.7.dist-info/METADATA +548 -0
  163. eval_framework-0.2.7.dist-info/RECORD +170 -0
  164. eval_framework-0.2.7.dist-info/WHEEL +4 -0
  165. eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
  166. template_formatting/README.md +83 -0
  167. template_formatting/__init__.py +0 -0
  168. template_formatting/formatter.py +537 -0
  169. template_formatting/mistral_formatter.py +159 -0
  170. template_formatting/py.typed +0 -0
@@ -0,0 +1,74 @@
1
+ from eval_framework.exceptions import LogicError
2
+ from eval_framework.metrics.base import BaseMetric, MetricResult
3
+ from eval_framework.metrics.llm.graders.language import AVAILABLE_LANGUAGES
4
+ from eval_framework.shared.types import Completion
5
+
6
+
7
class LanguageChecker(BaseMetric[Completion]):
    """Checks that the completion language equals the language given as ground truth."""

    NAME = "Language Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score 1.0 when the completion language equals ``response.ground_truth``, else 0.0.

        :param response: completion to score; its ground truth must be an entry of
            ``AVAILABLE_LANGUAGES``
        :return: a single-element list; ``value`` is ``None`` when the response carries an error
        :raises LogicError: if the ground truth is missing or not an available language
        """
        upstream_error = response.error
        if upstream_error is not None:
            # Nothing to score: pass the generation error through.
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=upstream_error)]

        expected_language = response.ground_truth
        if expected_language is None:
            raise LogicError("Language detection needs ground_truth.")
        if expected_language not in AVAILABLE_LANGUAGES:
            raise LogicError("Checking for unknown or unavailable language.")

        detected_language = response.get_completion_language()
        languages_match = detected_language == expected_language
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(languages_match),
                higher_is_better=True,
                error=response.error,
            )
        ]
23
+
24
+
25
class GermanCompletionChecker(BaseMetric[Completion]):
    """Checks that the raw completion language is reported as ``"de"``."""

    NAME = "German Completion Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score 1.0 when ``response.get_raw_completion_language()`` returns ``"de"``, else 0.0.

        :param response: completion to score
        :return: a single-element list; ``value`` is ``None`` when the response carries an error
        """
        upstream_error = response.error
        if upstream_error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=upstream_error)]

        detected_language = response.get_raw_completion_language()
        is_german = detected_language == "de"
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(is_german),
                higher_is_better=True,
                error=response.error,
            )
        ]
35
+
36
+
37
class LanguageConsistencyChecker(BaseMetric[Completion]):
    """Checks that the completion language equals the instruction language."""

    NAME = "Language Consistency"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score 1.0 when completion and instruction languages are equal, else 0.0.

        :param response: completion to score
        :return: a single-element list, or an empty list when both languages are the empty
            string; ``value`` is ``None`` when the response carries an error
        """
        upstream_error = response.error
        if upstream_error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=upstream_error)]

        produced_language = response.get_completion_language()
        requested_language = response.get_instruction_language()

        if produced_language == requested_language and requested_language == "":
            # No language information could be determined
            return []

        languages_match = produced_language == requested_language
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(languages_match),
                higher_is_better=True,
                error=response.error,
            )
        ]
51
+
52
+
53
class LanguageRawConsistencyChecker(BaseMetric[Completion]):
    """Checks that the raw completion language equals the instruction language."""

    NAME = "Language Consistency Raw"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score 1.0 when raw-completion and instruction languages are equal, else 0.0.

        :param response: completion to score
        :return: a single-element list, or an empty list when both languages are the empty
            string; ``value`` is ``None`` when the response carries an error
        """
        upstream_error = response.error
        if upstream_error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=upstream_error)]

        produced_language = response.get_raw_completion_language()
        requested_language = response.get_instruction_language()

        if produced_language == requested_language and requested_language == "":
            # No language information could be determined
            return []

        languages_match = produced_language == requested_language
        return [
            MetricResult(metric_name=self.NAME, value=float(languages_match), higher_is_better=True, error=response.error)
        ]
@@ -0,0 +1,83 @@
1
+ import json
2
+ from enum import Enum
3
+
4
+ from eval_framework.metrics.base import BaseMetric, MetricResult
5
+ from eval_framework.metrics.completion.text_counter import ParagraphCounter, SentenceCounter, WordCounter
6
+ from eval_framework.shared.types import Completion
7
+
8
+
9
class LengthRequirementUnit(Enum):
    """Unit in which a length requirement is counted."""

    # The values are the strings accepted in the "unit" field of the ground truth.
    WORDS = "words"
    SENTENCES = "sentences"
    PARAGRAPHS = "paragraphs"
13
+
14
+
15
class LengthRequirementType(Enum):
    """Kind of bound a length requirement places on the count."""

    # The values are the strings accepted in the "type" field of the ground truth.
    MIN = "minimum"
    MAX = "maximum"
    TARGET = "target"
19
+
20
+
21
class LengthControl(BaseMetric[Completion]):
    """Measures how well a completion meets a length requirement encoded in the ground truth.

    The ground truth is parsed as a JSON object with the keys ``"unit"``
    (a ``LengthRequirementUnit`` value), ``"count"`` and ``"type"``
    (a ``LengthRequirementType`` value).
    """

    NAME = "length_control"

    def __init__(self, tolerance: float = 1 / 6) -> None:
        """
        :param tolerance: largest absolute normalized distance still accepted for a
            ``target`` requirement
        """
        super().__init__()
        self.tolerance = tolerance

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Count the completion in the requested unit and compare it with the expected count.

        :param response: completion whose ground truth holds the JSON length requirement
        :return: signed normalized distance, absolute normalized distance and a 0/1
            fulfilment flag; on a response error only the fulfilment result, with ``value=None``
        """
        fulfilment_metric = f"{self.NAME}/fulfills_length_requirement"

        if response.error is not None:
            return [
                MetricResult(
                    metric_name=fulfilment_metric,
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                )
            ]

        requirement = json.loads(str(response.ground_truth))
        text = response.completion.strip()

        # Enum lookup raises ValueError for an unknown unit string.
        unit = LengthRequirementUnit(requirement["unit"])
        if unit == LengthRequirementUnit.WORDS:
            count = WordCounter._count_words(text)
        elif unit == LengthRequirementUnit.SENTENCES:
            count = SentenceCounter._count_sentences(text)
        elif unit == LengthRequirementUnit.PARAGRAPHS:
            count = ParagraphCounter._count_paragraphs(text)
        else:
            raise NotImplementedError(f"LengthRequirementUnit {requirement['unit']} is not supported.")

        expected_count = int(requirement["count"])
        signed_distance = (count - expected_count) / float(expected_count)
        absolute_distance = abs(signed_distance)

        requirement_type = LengthRequirementType(requirement["type"])
        if requirement_type == LengthRequirementType.TARGET:
            fulfilled = absolute_distance <= self.tolerance
        elif requirement_type == LengthRequirementType.MIN:
            fulfilled = count >= expected_count
        elif requirement_type == LengthRequirementType.MAX:
            fulfilled = count <= expected_count
        else:
            raise NotImplementedError(f"LengthRequirementType {requirement['type']} is not supported.")

        signed_result = MetricResult(
            metric_name=f"{self.NAME}/normalized_distance_to_target",
            value=float(signed_distance),
            higher_is_better=False,
        )
        absolute_result = MetricResult(
            metric_name=f"{self.NAME}/absolute_normalized_distance_to_target",
            value=float(absolute_distance),
            higher_is_better=False,
        )
        fulfilment_result = MetricResult(
            metric_name=fulfilment_metric,
            value=float(fulfilled),
            higher_is_better=True,
            error=response.error,
        )
        return [signed_result, absolute_result, fulfilment_result]
@@ -0,0 +1,307 @@
1
+ import re
2
+ import signal
3
+ from collections.abc import Callable, Iterable
4
+ from typing import Any
5
+
6
+ from sympy import Basic, S, SympifyError, factor, simplify
7
+ from sympy.parsing.latex import parse_latex
8
+ from sympy.parsing.latex.errors import LaTeXParsingError
9
+
10
+ from eval_framework.metrics.base import BaseMetric, MetricResult
11
+ from eval_framework.shared.types import Completion
12
+
13
+
14
def timeout_handler(signum: Any, frame: Any) -> None:
    """Signal handler that interrupts the running code by raising ``TimeoutError``.

    :param signum: signal number passed by the signal machinery (unused)
    :param frame: interrupted stack frame (unused)
    :raises TimeoutError: always
    """
    raise TimeoutError
16
+
17
+
18
class MathReasoningCompletion(BaseMetric[Completion]):
    """Math Reasoning Completion (symbolic).

    Evaluates the correctness of the completion of a math reasoning task without
    correcting LaTeX expressions. Normalization occurs on the strings, only to
    remove formatting and units. The normalized completion is first compared
    symbolically (via SymPy) against every ground truth; if that does not
    succeed, a plain string comparison is used as fallback.
    """

    NAME = "Math Reasoning Completion (symbolic)"

    # Substitutions to apply to the final answer.
    # Each pair is (regex pattern, re.sub replacement template).
    SUBSTITUTIONS = [
        (r"\ban\b(?!\w)", ""),  # Remove "an" if not part of a word
        (r"\ba\b(?!\w)", ""),  # Remove "a" if not part of a word
        (r"\.\$", "$"),  # Replace ".$" with "$"
        (r"\\\$", ""),  # Remove "\$"
        (r"\\ ", ""),  # Remove "\ " (escaped space)
        (r"\s+", ""),  # Remove all spaces
        # NOTE(review): this also drops the backslash ("\mbox" -> "text"); confirm that is intended.
        (r"\\mbox", "text"),  # Replace "\mbox" with "text"
        (r",\\text\{and\}", ","),  # Replace ",\text{and}" with ","
        (r"\\text\{and\}", ","),  # Replace "\text{and}" with ","
        # The replacement is an re.sub template, so the backslash has to be escaped:
        # an unescaped "\text{}" is expanded to TAB + "ext{}".
        (r"\\text\{m\}", r"\\text{}"),  # Replace "\text{m}" with "\text{}"
    ]

    # Expressions to remove from the final answer
    # Most of these expressions omit units and formatting
    # which the ground truth does not have
    REMOVED_EXPRESSIONS_UNITS = [
        "square",
        "ways",
        "integers",
        "dollars",
        "mph",
        "inches",
        "ft",
        "hours",
        "km",
        "units",
        "\\ldots",
        "sue",
        "points",
        "feet",
        "minutes",
        "digits",
        "cents",
        "degrees",
        "cm",
        "gm",
        "pounds",
        "meters",
        "meals",
        "edges",
        "students",
        "childrentickets",
        "multiples",
    ]

    REMOVED_EXPRESSIONS_FORMAT = [
        "\\text{s}",
        "\\text{.}",
        "\\text{\ns}",
        "\\text{}^2",
        "\\text{}^3",
        "\\text{\n}",
        "\\text{}",
        r"\mathrm{th}",
        r"^\circ",
        r"^{\circ}",
        r"\;",
        r",\!",
        "{,}",
        '"',
        "\\dots",
    ]

    @staticmethod
    def _start_timeout(timeout: int) -> Any:
        """
        Install the SIGALRM timeout handler and arm the alarm
        :param timeout: time limit in seconds
        :return: the handler that was installed before, to be passed to _stop_timeout
        """
        previous_handler = signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(timeout)
        return previous_handler

    @staticmethod
    def _stop_timeout(previous_handler: Any) -> None:
        """
        Cancel a pending alarm and put the previous SIGALRM handler back
        :param previous_handler: value returned by _start_timeout
        """
        # Ensure we never leak a pending alarm into later code paths.
        signal.alarm(0)
        # signal.signal() reports None for handlers not installed from Python;
        # such a value cannot be passed back in, so it is left alone.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)

    def normalize_expression(self, final_answer: str) -> str:
        """
        Function to normalize LaTeX expressions
        :param final_answer: raw LaTeX expression
        :return: normalized LaTeX expression
        NOTE: Changed logic, because before the substitution randomly replaced characters in the string,
        i.e., turned "infty" into "iny" by removing "ft"
        """
        for before, after in self.SUBSTITUTIONS:
            final_answer = re.sub(before, after, final_answer)
        for expr in self.REMOVED_EXPRESSIONS_UNITS:
            # Safely remove units at the end, allowing optional space before the unit
            final_answer = re.sub(rf"(.*?)\s*({re.escape(expr)})$", r"\1", final_answer)
        for expr in self.REMOVED_EXPRESSIONS_FORMAT:
            # Safely remove formatting expressions
            final_answer = final_answer.replace(expr, "")
        # Keep only the first $...$ span, then unwrap common LaTeX wrappers.
        final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", r"$\3$", final_answer)
        final_answer = re.sub(r"(\\text\{)(.*?)(\})", r"\2", final_answer)
        final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", r"\2", final_answer)
        final_answer = re.sub(r"(\\overline\{)(.*?)(\})", r"\2", final_answer)
        final_answer = re.sub(r"(\\boxed\{)(.*)(\})", r"\2", final_answer)
        # Add the braces that shorthand such as "frac12" or "sqrt2" leaves out.
        final_answer = re.sub(r"(frac)([^{])(.)", r"frac{\2}{\3}", final_answer)
        final_answer = re.sub(r"(sqrt)([^{])", r"sqrt{\2}", final_answer)
        final_answer = final_answer.replace("$", "")
        # Only strip commas if it's a single numeric value with optional commas (like "1,000")
        if re.fullmatch(r"\d{1,3}(,\d{3})*", final_answer):
            final_answer = final_answer.replace(",", "")
        return final_answer

    def check_for_equation(self, final_answer: str) -> list:
        """
        Check if the final answer is an equation and split it into left hand side and right hand side
        :param final_answer: the expression to evaluate
        :return: list of the "="-separated parts, or a one-element list if there is no "="
        """
        if isinstance(final_answer, str) and "=" in final_answer:
            return final_answer.split("=")
        return [final_answer]

    def _safe_simplify_expression(self, expression: Basic, timeout: int = 10) -> Basic:
        """
        Factor and simplify an expression under a time limit
        :param expression: SymPy expression
        :param timeout: Time limit in seconds (default: 10 seconds).
        :return: simplified expression, or NaN if simplification failed or timed out
        """
        previous_handler = self._start_timeout(timeout)
        try:
            return simplify(factor(expression))
        except (SympifyError, TimeoutError):
            return S.NaN
        finally:
            self._stop_timeout(previous_handler)

    def _any_symb_correct(self, response_list: Iterable[Basic], ground_truth_list: Iterable[Basic]) -> bool:
        """
        Check if any of the responses are correct and return true at first match
        :param response_list: list of responses
        :param ground_truth_list: list of ground truths
        :return: True if any response is correct
        """
        tolerance = 1e-12
        for answer in response_list:
            for ground_truth in ground_truth_list:
                try:
                    difference = self._safe_simplify_expression(answer - ground_truth)
                    if abs(difference) < tolerance:
                        return True
                except (TypeError, ValueError):
                    # This pair cannot be compared (e.g. an equation, an unparsed
                    # string, NaN, or a difference that stays symbolic). Keep
                    # looking: another pair may still match.
                    continue
        return False

    def _apply_safely(self, func: Callable[[Basic], Basic], list_of_expressions: list[Basic]) -> None:
        """
        Apply func to every entry of a list in place; entries that hit the recursion limit become NaN
        :param func: function applied to each expression
        :param list_of_expressions: list of sympy expressions, modified in place
        """
        for i, expression in enumerate(list_of_expressions):
            try:
                list_of_expressions[i] = func(expression)
            except RecursionError:
                list_of_expressions[i] = S.NaN

    def calculate(self, response: Completion) -> list[MetricResult]:
        """
        Calculate the accuracy of the completion

        performs several verification and simplification steps
        to ensure that the completion is correct

        the completion may either be a latex or string response
        which sympy will parse, factor, and simplify

        :param response: Completion object
        :return: list of MetricResult
        """
        ground_truths = []
        INVALID_ANSWER = S.NaN
        timeout = 10
        # latex parse all ingested ground truth values for math reasoning
        for gt in response.ground_truth_list:
            previous_handler = self._start_timeout(timeout)
            try:
                # NOTE: parses f(x)=0,\quadf(x)=x-1,\quadf(x)=-x+1 to Eq(f(x), 0) ONLY
                ground_truths.append(parse_latex(gt))
            except Exception:
                # Unparseable (or timed out): keep the raw value; it can then only
                # match through the string fallback below.
                ground_truths.append(gt)
            finally:
                self._stop_timeout(previous_handler)

        normalized_response = self.normalize_expression(response.completion)
        response_list = self.check_for_equation(normalized_response)
        try:
            symb_is_correct = self._is_symbolically_equiv(response_list, ground_truths, INVALID_ANSWER)
        except Exception:
            symb_is_correct = False

        if symb_is_correct:
            is_correct = True
        else:
            # fall back to string comparison
            # ground truth can be list or str, we have str comparisons
            assert isinstance(response.ground_truth, str)
            is_correct = self._is_str_correct(normalized_response, response.ground_truth)

        return [
            MetricResult(metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error)
        ]

    def _any_str_correct(self, response_list: list, ground_truths: list) -> bool:
        """
        Check if any of the responses are correct and return true at first match
        :param response_list: list of responses
        :param ground_truths: list of ground truths
        :return: True if any response is correct
        """
        return any(
            self._is_str_correct(response, ground_truth)
            for response in response_list
            for ground_truth in ground_truths
        )

    def _is_str_correct(self, str1: str, str2: str) -> bool:
        """
        Check if two strings are equal, aligning them on their equal signs first
        :param str1: first string (the response)
        :param str2: second string (the ground truth)
        :return: True if the strings are equal
        """
        # None has to be handled before any string method is called.
        if str1 is None and str2 is None:
            return True
        if str1 is None or str2 is None:
            return False
        # if multiple equal signs in ground truth (str2)
        # slide the response (str1) over the ground truth (str2)
        # at the interval of every equal sign in the ground truth
        # and check if any of the responses match
        # this accounts for generations such as b = 1 with ground truth as x = b = 1
        if str1.count("=") < str2.count("="):
            return self._is_str_correct(str1, str2[str2.index("=") + 1 :])
        if str1.count("=") > str2.count("="):
            return self._is_str_correct(str1[str1.index("=") + 1 :], str2)
        return str1 == str2

    def _is_symbolically_equiv(
        self, response_list: list[str], ground_truths: list, default_invalid: Basic = S.NaN
    ) -> bool:
        """
        Parse and simplify responses and ground truths, then compare them symbolically
        Both lists are modified in place.
        :param response_list: list of responses
        :param ground_truths: list of ground truths
        :param default_invalid: default value for invalid expressions (unused, kept for compatibility)
        :return: True if any response matches any ground truth
        """
        try:
            self._apply_safely(parse_latex, response_list)
        except (LaTeXParsingError, SympifyError, TypeError):
            # The response is not valid LaTeX, so it cannot match symbolically.
            return False

        # map objects dont catch errors, so we use safe apply here
        self._apply_safely(self._safe_simplify_expression, ground_truths)
        self._apply_safely(self._safe_simplify_expression, response_list)

        # check if any of the simplified responses match any of the simplified ground truths
        try:
            return self._any_symb_correct(response_list, ground_truths)
        except ValueError:
            return False
@@ -0,0 +1,163 @@
1
+ import re
2
+ import unicodedata
3
+
4
+ from eval_framework.metrics.base import (
5
+ BaseMetric,
6
+ MetricResult,
7
+ )
8
+ from eval_framework.shared.types import Completion, Error, LanguageMetricContext, extract_context_metric
9
+
10
# Dictionary of "none" words in different languages.
# Keys are language codes; each value lists the accepted spellings for that language.
NONE_DICT: dict[str, list[str]] = {
    "en": ["none"],
    "ko": ["없음"],
    "pl": ["brak"],
    "zh": ["无"],
    "vi": ["Không có"],
    "ja": ["なし", "数字はありません"],
    "ta": ["ஏதுமில்லை"],
    "hu": ["nincs"],
    "fr": ["aucun"],
    "no": ["ingen"],
    "uk": ["немає", "Нема"],
    "ru": ["нет"],
    "de": ["Keine vorhanden"],
    "es": ["ninguno"],
    "sv": ["inga"],
    "fi": ["ei mikään"],
    "cs": ["žádné", "žádná"],
    "sr": ["nema"],
    "pt": ["nenhum"],
    "it": ["nessuno"],
    "fa": ["هیچ کدام"],
    "sw": ["hakuna"],
    "nl": ["geen"],
    "st": ["ha ho letho"],
    "hi": ["कोई नहीं"],
    "da": ["ingen"],
}
39
+
40
+
41
def clean_text(text: str) -> str:
    """Strip and lower-case ``text``, then drop zero-width non-joiners (U+200C) and spaces."""
    cleaned = text.strip().lower()
    for unwanted in ("\u200c", " "):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
44
+
45
+
46
class NIAHAccuracy(BaseMetric[Completion]):
    """Metric for Needle in a Haystack tasks"""

    NAME = "NIAHAccuracy"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score the completion against the expected needles.

        When the first ground truth is one of the "none" words in ``NONE_DICT``, the
        completion has to state that nothing was found; otherwise it has to contain
        exactly the expected numbers.

        :param response: completion carrying a ``LanguageMetricContext``
        :return: a single-element list; ``value`` is ``None`` when the response carries
            an error or when scoring itself raised (reported through ``error``)
        """
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, LanguageMetricContext)

        ground_truths = [gt for gt in response.ground_truth_list if gt is not None]

        try:
            # Extract task and language from metadata
            assert response.context is not None
            language = context.language

            # Get model's answer
            model_answer = response.completion

            # Determine which comparison function to use based on the task
            none_values = {word for words in NONE_DICT.values() for word in words}
            if ground_truths[0] in none_values:
                is_correct = self._compare_none(language, model_answer)
            else:
                is_correct = self._compare_numbers(language, ground_truths, model_answer)

            return [
                MetricResult(
                    metric_name=self.NAME, value=float(is_correct), higher_is_better=True, error=response.error
                )
            ]
        except Exception as e:
            # Scoring problems (e.g. no ground truth at all) are reported on the
            # result instead of aborting the evaluation.
            error = Error(error_class=e.__class__.__name__, message=str(e), traceback="")
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=error)]

    @staticmethod
    def _instruction_language(lang: str) -> str:
        """Return the second "-"-separated component of ``lang``, or ``lang`` itself if it has no "-"."""
        if "-" in lang:
            return lang.split("-")[1]
        return lang

    def _compare_numbers(self, lang: str, correct_answer: list[str], model_answer: str) -> bool:
        """Compare numbers for regular NIAH tasks

        :param lang: language code, optionally of the form "<x>-<instruction language>"
        :param correct_answer: expected numbers as strings
        :param model_answer: raw model completion
        :return: True if the multi-digit numbers in the answer are exactly the expected ones
        """
        inst_lang = self._instruction_language(lang)

        if not model_answer:
            return False

        processed_model_answer = unicodedata.normalize("NFKC", model_answer)

        none_words = NONE_DICT.get(inst_lang, ["none"])
        # Check if any word in none_words is present in the processed answer; if yes, auto-fail
        # NOTE(review): the answer is not lower-cased here, so this check is case-sensitive.
        for word in none_words:
            if word in processed_model_answer or clean_text(word) in processed_model_answer:
                return False

        # Extract all numeric substrings, drop single digits, and remove
        # duplicates while preserving the original order
        numeric_strings = list(
            dict.fromkeys(num for num in re.findall(r"\d+", processed_model_answer) if len(num) > 1)
        )

        # If no numerics are found after processing, return False
        if not numeric_strings:
            return False

        # Convert both sides to integers to ensure numeric comparison
        try:
            extracted_numbers = [int(num) for num in numeric_strings]
            correct_converted = [int(item) for item in correct_answer]
        except (TypeError, ValueError):
            return False

        # Check that the number of extracted numbers matches the length of correct_answers
        if len(extracted_numbers) != len(correct_converted):
            return False

        # Compare the extracted numbers with the correct answers
        return set(extracted_numbers) == set(correct_converted)

    def _compare_none(self, lang: str, model_answer: str) -> bool:
        """Compare for NIAH none tasks

        :param lang: language code, optionally of the form "<x>-<instruction language>"
        :param model_answer: raw model completion
        :return: True if the answer contains a "none" word and no multi-digit number
        """
        inst_lang = self._instruction_language(lang)

        processed_model_answer = clean_text(unicodedata.normalize("NFKC", model_answer))
        # Fall back to the English word for languages without an entry, exactly as
        # _compare_numbers does, instead of failing with a KeyError.
        none_words = [clean_text(word) for word in NONE_DICT.get(inst_lang, ["none"])]

        # Remove single digit numbers from the processed answer
        processed_model_answer = re.sub(r"\b\d\b", "", processed_model_answer)

        # If any multi-digit numbers are found, return False
        if re.findall(r"\d\d+", processed_model_answer):
            return False

        # Check if any of the words in none_words are present
        return any(word in processed_model_answer for word in none_words)
@@ -0,0 +1,27 @@
1
+ import re
2
+
3
+ from eval_framework.metrics.base import BaseMetric, MetricResult
4
+ from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
5
+
6
+
7
class PlaceholderCheckerMetricContext(BaseMetricContext):
    # Context consumed by PlaceholderChecker.
    # num_placeholders: number of bracketed placeholders the completion must reach or exceed.
    num_placeholders: int
9
+
10
+
11
class PlaceholderChecker(BaseMetric[Completion]):
    """Checks that the completion contains enough ``[...]`` placeholders."""

    NAME = "Placeholder Check"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score 1.0 when the completion has at least ``num_placeholders`` bracketed spans, else 0.0.

        :param response: completion carrying a ``PlaceholderCheckerMetricContext``
        :return: a single-element list; ``value`` is ``None`` when the response carries an error
        """
        upstream_error = response.error
        if upstream_error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=upstream_error)]

        context = extract_context_metric(response, PlaceholderCheckerMetricContext)

        assert context.num_placeholders is not None, "Expected 'num_placeholders' in context"
        assert isinstance(context.num_placeholders, int), (
            f"'num_placeholders' has incorrect type: {type(context.num_placeholders)}"
        )

        # Non-greedy match: every shortest "[...]" span counts as one placeholder.
        found_count = sum(1 for _ in re.finditer(r"\[.*?\]", response.completion))
        has_enough = found_count >= context.num_placeholders
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(has_enough),
                higher_is_better=True,
                error=response.error,
            )
        ]