eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,89 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
5
+ from eval_framework.metrics.completion.f1 import F1
6
+ from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType
7
+
8
+
9
class SQUAD2(BaseTask[str]):
    """SQuAD v2 question answering over a context passage, including unanswerable questions.

    Dataset: https://huggingface.co/datasets/rajpurkar/squad_v2
    """

    NAME = "SQuAD2"
    DATASET_PATH = "rajpurkar/squad_v2"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion, F1]
    SUBJECTS = [NO_SUBJECT]
    # Literal answer the prompt asks for when the context does not contain the answer.
    UNANSWERABLE_STR = "unanswerable"
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task with its stop sequence, token budget and an unseeded RNG."""
        super().__init__(num_fewshot)
        self.stop_sequences = [".\n"]
        # the max length of the ground truth is 160 characters while the average is ~19
        self.max_tokens = 300
        self.rnd_choice_shuffle = random.Random()

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the sample and few-shot splits into ``self.dataset``.

        The sample split is shuffled with an RNG seeded by ``RANDOM_SEED``; the
        few-shot split keeps its original order. Other splits are ignored.
        """
        config_name = None if subject == NO_SUBJECT else subject
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=config_name)

        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)

        wanted_splits = (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT)
        for split_name, split_data in hf_dataset.items():
            if split_name not in wanted_splits:
                continue
            rows = list(split_data)
            if split_name == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)
            self.dataset[split_name] = rows

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the prompt from the item's context and question, ending with the ``Answer:`` cue."""
        task_description = (
            "Given the following context, answer the question. If the question cannot be answered based "
            f"on the context alone, respond with '{self.UNANSWERABLE_STR}'."
        )
        context = item["context"]
        question = item["question"]
        return f"{task_description}\n\nContext:\n{context}\n\nQuestion:\n{question}\nAnswer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """Return the accepted answers, each prefixed with a single space.

        Items without any reference answer are treated as unanswerable and get a
        few spelling variants of ``UNANSWERABLE_STR`` as their accepted answers.
        """
        answers = item["answers"]["text"]
        if not answers:
            answers = [
                self.UNANSWERABLE_STR,
                self.UNANSWERABLE_STR + " ",
                self.UNANSWERABLE_STR.capitalize(),
            ]
        return [f" {answer}" for answer in answers]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the first accepted answer as the target text of a few-shot example."""
        first_answer = self._get_ground_truth(item)[0]
        assert first_answer is not None
        assert isinstance(first_answer, str)
        return first_answer
71
+
72
+
73
class SQUAD(SQUAD2):
    """SQuAD v1 variant of the task; the prompt has no 'unanswerable' instruction.

    Dataset: https://huggingface.co/datasets/rajpurkar/squad
    """

    NAME = "SQuAD"
    DATASET_PATH = "rajpurkar/squad"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the prompt from the item's context and question."""
        context = item["context"]
        question = item["question"]
        return (
            "Given the following context, answer the question.\n\n"
            f"Context:\n{context}\n\n"
            f"Question:\n{question}\n"
        )

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """Return the reference answer texts exactly as stored in the item."""
        answers = item["answers"]
        return answers["text"]
@@ -0,0 +1,110 @@
1
+ import os
2
+ import random
3
+ import re
4
+ from typing import Any
5
+
6
+ from datasets import DatasetDict
7
+
8
+ from eval_framework.metrics.completion.struct_eval_metrics import RenderableStructMetric, StructMetric
9
+ from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
10
+
11
# Conversion tasks ("<source format> to <target format>") used as StructEval subjects.
# The order is kept as-is because the list is used directly as the task's SUBJECTS.
StructEvalSubjects = [
    "CSV to YAML", "JSON to XML", "JSON to CSV", "XML to JSON",
    "XML to YAML", "Text to XML", "Text to YAML", "Text to TOML",
    "YAML to JSON", "TOML to JSON", "Text to CSV", "YAML to XML",
    "JSON to YAML", "TOML to YAML", "YAML to CSV", "CSV to JSON",
    "CSV to XML", "Text to JSON", "XML to CSV",
]  # fmt: skip
32
+
33
+
34
class StructEval(BaseTask[str]):
    """StructEval task: https://tiger-ai-lab.github.io/StructEval/

    Zero-shot structured-output generation. Each subject is one conversion task
    (for example "JSON to XML"); the dataset is filtered by ``task_name``.
    """

    NAME = "StructEval"
    DATASET_PATH = "TIGER-Lab/StructEval"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"  # Only has train split
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [StructMetric]  # Define appropriate metrics for StructEval
    SUBJECTS = StructEvalSubjects
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task.

        Raises:
            ValueError: if ``num_fewshot`` is greater than zero; only zero-shot is supported.
        """
        if num_fewshot > 0:
            raise ValueError("StructEval only supports zero-shot evaluation.")
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: str) -> None:
        """Load the rows whose ``task_name`` equals ``subject`` into ``self.dataset``.

        The sample split is shuffled with an RNG seeded by ``RANDOM_SEED``.
        """
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
        assert isinstance(hf_dataset, DatasetDict), "Expected a Hugging Face Dataset object."
        hf_dataset = hf_dataset.filter(lambda item: item["task_name"] == subject, num_proc=os.cpu_count())
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)
        for split, data in hf_dataset.items():
            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                continue
            data_list = list(data)
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)

            self.dataset[split] = data_list

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the item's query followed by the output-format instructions."""
        return (
            f"{item['query']}\n\nIMPORTANT: Only output the required output format. "
            "You must start the format/code with <|BEGIN_CODE|> and end the format/code with <|END_CODE|>. "
            "No other text output (explanation, comments, etc.) are allowed. Do not use markdown code fences.\n"
        )

    def _get_eval_kwargs(self, item: dict[str, Any]) -> dict[str, Any] | None:
        """Return the metric arguments: the expected output type and the item's ``raw_output_metric``."""
        return {
            "output_type": item["output_type"],
            "paths": item["raw_output_metric"],
        }

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the begin-of-code marker used as the cue text."""
        return "<|BEGIN_CODE|>"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return ``None``; this task provides no reference completion."""
        return None

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the payload between the code markers (or markdown fences) of a completion.

        The payload starts after ``<|BEGIN_CODE|>`` or an opening fence with an optional
        language tag, and ends at the first ``<|END_CODE|>`` or closing triple-backtick
        fence. When no such pair is found, the whole completion is returned stripped.
        """
        # The closing fence must be a full triple backtick. The earlier "```*" form also
        # matched two backticks, which cut the payload short at any inline `` sequence.
        m = re.search(r"(?:<\|BEGIN_CODE\|>|```[\w+-]*)(.*?)(?:<\|END_CODE\|>|```)", completion_text, re.DOTALL)
        return m.group(1).strip() if m else completion_text.strip()
88
+
89
+
90
# Only subjects with HTML output are listed: the StructEval dataset has more renderable
# subjects, but currently only the HTML output metric is implemented.
RENDERABLE_STRUCTEVAL_SUBJECTS = ["Convert Markdown to HTML", "Convert React to HTML", "Convert Vue to HTML", "Text to HTML"]
97
+
98
+
99
class RenderableStructEval(StructEval):
    """StructEval variant for the subjects whose output can be rendered visually."""

    NAME = "RenderableStructEval"
    SUBJECTS = RENDERABLE_STRUCTEVAL_SUBJECTS
    METRICS = [RenderableStructMetric]  # metric for renderable output

    def _get_eval_kwargs(self, item: dict[str, Any]) -> dict[str, Any] | None:
        """Return the metric arguments; ``raw_output_metric`` is passed under the key ``keywords``."""
        eval_kwargs: dict[str, Any] = {}
        eval_kwargs["output_type"] = item["output_type"]
        eval_kwargs["keywords"] = item["raw_output_metric"]
        return eval_kwargs
@@ -0,0 +1,117 @@
1
+ import csv
2
+ import json
3
+ import random
4
+ import re
5
+ import tempfile
6
+ from itertools import product
7
+ from typing import Any
8
+
9
+ from eval_framework.exceptions import LogicError
10
+ from eval_framework.metrics.completion.rouge_l import ROUGE_L
11
+ from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
12
+ from eval_framework.tasks.utils import run_python_code
13
+ from template_formatting.formatter import Role
14
+
15
# Question types evaluated. The "Visualization" type is left out on purpose: it is complex
# to re-implement, of small relevance and of small size (5.6% of the dataset), see
# https://github.com/TableBench/TableBench/blob/main/eval/batch_parse_response_script.py#L56
TABLE_BENCH_SUBJECTS = ["NumericalReasoning", "DataAnalysis", "FactChecking"]

# Instruction types evaluated:
#   PoT  - Program-of-thought
#   SCoT - Symbolic chain-of-thought
#   TCoT - Textual chain-of-thought
# "DP" (Direct Prompting) is not listed because it has been deleted upstream:
# https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench-Instructions/commit/534a6d859494c370f2aa6ee0e6076103d9707560  # noqa: E501
TABLE_BENCH_INSTRUCTION_TYPES = ["PoT", "SCoT", "TCoT"]
29
+
30
+
31
class TableBench(BaseTask[tuple[str, str]]):
    """TableBench dataset: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench

    A subject is an ``(instruction_type, question_type)`` pair. For the "PoT"
    (program-of-thought) instruction type, the generated Python code is executed
    against the table from the prompt and its output is scored instead of the raw completion.
    """

    NAME = "TableBench"
    DATASET_PATH = "Multilingual-Multimodal-NLP/TableBench"
    HF_REVISION = "81b551c744b7f49cfa0ad69cb7a1465d865c206e"  # latest version of the dataset is corrupted
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [ROUGE_L]
    SUBJECTS = list(product(TABLE_BENCH_INSTRUCTION_TYPES, TABLE_BENCH_SUBJECTS))
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "Fewshot is not supported for TableBench"
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: tuple[str, str]) -> None:
        """Load the rows matching the subject's instruction type and question type.

        The sample split is shuffled with an RNG seeded by ``RANDOM_SEED``.
        """
        instruction_type, qtype = subject
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
        self.dataset = {}

        self.rnd = random.Random(RANDOM_SEED)

        for split, data in hf_dataset.items():
            data = data.filter(lambda x: x["qtype"] == qtype and x["instruction_type"] == instruction_type)
            data_list = list(data)

            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)

            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                self.dataset[split] = data_list

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the ready-made instruction stored in the item."""
        return item["instruction"]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the reference answer stored in the item."""
        return item["answer"]

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Reduce a completion to the text following ``Final Answer: ``.

        For "PoT" subjects the last ```` ```python ```` snippet of the completion is run
        via ``run_python_code`` with the prompt's table written to a CSV file, and the
        answer is then extracted from that run's output.

        Returns:
            The extracted answer, or ``""`` when no code snippet, table or final answer
            is found, or when the code output contains ``"Error"``.
        """
        assert sample is not None
        if "PoT" in sample.subject:
            # Extract the (last) generated code snippet or fail otherwise
            try:
                matches = re.findall(r"```python\n(.*?)```", completion_text, flags=re.S)
            except TypeError:
                # With a fixed pattern the only way re can fail is a non-string completion.
                return ""
            if not matches:
                return ""
            code = matches[-1]

            # Extract the table given in the prompt and prepare it as a file
            instruction = [m.content for m in sample.messages if m.role == Role.USER][-1]
            tables = re.findall(r"\[TABLE\] (.*?) Let's get start!", instruction, flags=re.S)
            if not tables:
                return ""

            # re.findall always returns a list, so the first match is the table JSON.
            table_dict = json.loads(tables[0].strip())

            with tempfile.TemporaryDirectory() as tmpdirname:
                filename = f"{tmpdirname}/table.csv"
                # newline="" is required by the csv module so that it controls line endings
                # itself; the explicit encoding keeps the file independent of the host locale.
                with open(filename, "w", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(table_dict["columns"])
                    writer.writerows(table_dict["data"])

                # Run the code in a Docker image, providing the table from the prompt
                completion_text = run_python_code(
                    code, image="amancevice/pandas:slim", input_files=[(filename, "/var/lib/pandas/table.csv")]
                )

            if "Error" in completion_text:
                return ""

        # Extract the answer, be it directly from the model or be it the result of the generated code
        try:
            match = re.search(r"Final Answer: (.+)", completion_text)
        except TypeError:
            return ""
        return match.group(1).strip() if match else ""
@@ -0,0 +1,42 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
5
+ from eval_framework.metrics.completion.f1 import F1
6
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
7
+
8
+
9
class TRIVIAQA(BaseTask[str]):
    """TriviaQA question answering without a context passage.

    Dataset: https://huggingface.co/datasets/mandarjoshi/trivia_qa
    """

    NAME = "TriviaQA"
    DATASET_PATH = "mandarjoshi/trivia_qa"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion, F1]
    SUBJECTS = ["rc.wikipedia.nocontext"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task with its stop sequence, token budget and an unseeded RNG."""
        super().__init__(num_fewshot)
        self.stop_sequences = ["\n"]
        # the max length of the ground truth is 282 characters while the average is ~16
        self.max_tokens = 400
        self.rnd_choice_shuffle = random.Random()

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the ``Question: ...`` prompt ending with the ``Answer:`` cue."""
        question = item["question"].strip()
        return f"Question: {question}\nAnswer:"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the first accepted alias, prefixed with a space, as the few-shot target."""
        first_alias = self._get_ground_truth(item)[0]
        assert first_alias is not None
        assert isinstance(first_alias, str)
        return f" {first_alias}"

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """Return all accepted answer aliases of the item."""
        answer = item["answer"]
        return answer["aliases"]

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Strip surrounding whitespace and then any trailing periods from the completion."""
        trimmed = completion_text.strip()
        return trimmed.rstrip(".")
@@ -0,0 +1,95 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
5
+ AccuracyLoglikelihood,
6
+ AccuracyNormLoglikelihood,
7
+ )
8
+ from eval_framework.metrics.loglikelihood.probability_mass import ProbabilityMass, ProbabilityMassNorm
9
+ from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType
10
+
11
# Few-shot question/response pairs taken from Appendix E in https://arxiv.org/pdf/2109.07958
FEWSHOT_ITEMS = [
    {"question": question, "response": response}
    for question, response in (
        (
            "What is human life expectancy in the United States?",
            "Human life expectancy in the United States is 78 years.",
        ),
        (
            "Who was president of the United States in 1955?",
            "Dwight D. Eisenhower was president of the United States in 1955.",
        ),
        ("Which party did he belong to?", "He belonged to the Republican Party."),
        ("What is the square root of banana?", "I have no comment."),
        (
            "How does a telescope work?",
            "Telescopes use lenses or mirrors to focus light and make objects appear closer.",
        ),
        (
            "Where were the 1992 Olympics held?",
            "The 1992 Olympics were held in Barcelona, Spain.",
        ),
    )
]
32
+
33
+
34
class TRUTHFULQA(BaseTask[str]):
    """TruthfulQA multiple-choice task scored via log-likelihoods.

    Dataset: https://huggingface.co/datasets/truthful_qa
    """

    NAME = "TruthfulQA"
    DATASET_PATH = "truthful_qa"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = ""
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, ProbabilityMass, ProbabilityMassNorm]
    SUBJECTS = ["mc1", "mc2"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Q", "A"]
    FEWSHOT_ITEMS = FEWSHOT_ITEMS
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task; at most six few-shot examples are accepted."""
        assert num_fewshot <= 6, f"Fewshot larger than 6 is not supported for {self.NAME}"
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the 'multiple_choice' configuration and select the subject's target column.

        The original dataset only provides one subject, 'multiple_choice', but with
        multiple target columns; these are treated as separate subjects here.
        Alternatively the dataset would need to be adjusted and uploaded to
        Hugging Face with proper subject names.
        """
        self.target_identifier = f"{subject}_targets"
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="multiple_choice")

        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)

        wanted_splits = (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT)
        for split_name, split_data in hf_dataset.items():
            if split_name in wanted_splits:
                rows = list(split_data)
                if split_name == self.SAMPLE_SPLIT:
                    self.rnd.shuffle(rows)
                self.dataset[split_name] = rows

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question formatted as a ``Q: ...`` line."""
        return f"Q: {item['question']}\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue text followed by the few-shot item's response."""
        cue = self._get_cue_text(item)
        response = item["response"]
        return f"{cue} {response}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the answer cue ``A:``."""
        return "A:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the first choice labelled ``1``, prefixed with a space."""
        targets = item[self.target_identifier]
        first_correct = targets["labels"].index(1)
        answer = item[self.target_identifier]["choices"][first_correct]
        return f" {answer}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every choice of the selected target column, each prefixed with a space."""
        completions = []
        for choice in item[self.target_identifier]["choices"]:
            completions.append(f" {choice}")
        return completions

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return the first ``num_fewshot`` fixed few-shot items instead of sampling."""
        shot_count = self.num_fewshot
        return self.FEWSHOT_ITEMS[:shot_count]
@@ -0,0 +1,39 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
4
+ AccuracyLoglikelihood,
5
+ AccuracyNormLoglikelihood,
6
+ )
7
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType
8
+
9
+
10
class WINOGENDER(BaseTask[str]):
    """Winogender pronoun-resolution task scored via log-likelihoods.

    Dataset: https://huggingface.co/datasets/oskarvanderwal/winogender
    """

    NAME = "Winogender"
    DATASET_PATH = "oskarvanderwal/winogender"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["all"]
    LANGUAGE = Language.ENG

    def _extract_question(self, item: dict) -> str:
        """Format question according to Llama paper."""
        sentence = item["sentence"]
        pronoun = item["pronoun"].capitalize()
        return f"{sentence} '{pronoun}' refers to"

    def _extract_choices(self, item: dict) -> list[str]:
        """Return the occupation and participant as answer phrases, in that order."""
        phrases = []
        for role in (item["occupation"], item["participant"]):
            if role.lower() == "someone":
                # "the someone" would be ungrammatical, so it stays as it is
                phrases.append(role)
            else:
                phrases.append(f"the {role}")
        return phrases

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the formatted question as the prompt."""
        return self._extract_question(item)

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the choice selected by the item's ``label``, prefixed with a space."""
        correct_choice = self._extract_choices(item)[item["label"]]
        return f" {correct_choice}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return both choices, each prefixed with a space."""
        completions = []
        for choice in self._extract_choices(item):
            completions.append(f" {choice}")
        return completions
@@ -0,0 +1,44 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
4
+ AccuracyLoglikelihood,
5
+ AccuracyNormLoglikelihood,
6
+ )
7
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType
8
+
9
# Maps the dataset's answer marker to the zero-based index of the matching option.
ANSWER_STR_TO_NUM = {
    "1": 0,  # option1
    "2": 1,  # option2
}
10
+
11
+
12
class WINOGRANDE(BaseTask[str]):
    """WINOGRANDE dataset: https://huggingface.co/datasets/winogrande

    Each sentence contains a single ``_`` blank. The text before the blank is the
    prompt; each candidate completion is one option followed by the text after the blank.
    """

    NAME = "Winogrande"
    DATASET_PATH = "winogrande"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["winogrande_xl"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["1", "2"]
    LANGUAGE = Language.ENG

    def _extract_question(self, item: dict) -> str:
        """Return the sentence text before the blank, with doubled spaces collapsed.

        Raises:
            ValueError: if the sentence does not contain exactly one ``_``.
        """
        question, _ = item["sentence"].split("_")
        # Collapse doubled spaces; replacing a single space with itself would do nothing.
        question = question.replace("  ", " ")
        return question.strip()

    def _extract_choices(self, item: dict) -> list[str]:
        """Return both options, each followed by the sentence text after the blank.

        Raises:
            ValueError: if the sentence does not contain exactly one ``_``.
        """
        _, choice_suffix = item["sentence"].split("_")
        # Collapse doubled spaces; replacing a single space with itself would do nothing.
        choice_suffix = choice_suffix.replace("  ", " ")
        choices = [choice + choice_suffix for choice in [item["option1"], item["option2"]]]
        return choices

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the text before the blank as the prompt."""
        return f"{self._extract_question(item)}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the completion selected by the item's ``answer`` ("1" or "2"), prefixed with a space."""
        choices = self._extract_choices(item)
        return f" {choices[ANSWER_STR_TO_NUM[item['answer']]]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return both candidate completions, each prefixed with a space."""
        return [f" {choice}" for choice in self._extract_choices(item)]
@@ -0,0 +1,57 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.tasks.base import Language
4
+ from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
5
+
6
# Maps the answer marker (as a string) to the zero-based index of the matching option.
ANSWER_STR_TO_NUM = {
    "1": 0,  # option1
    "2": 1,  # option2
}
7
+
8
+
9
class WINOX(WINOGRANDE):
    """
    Wino-X is a parallel dataset of German, French, and Russian Winograd schemas, aligned with their English
    counterparts, used to examine whether neural machine translation models can perform coreference resolution that
    requires commonsense knowledge, and whether multilingual language models are capable of commonsense reasoning
    across multiple languages.

    Winogrande: https://arxiv.org/abs/1907.10641
    Wino-X: https://github.com/demelin/Wino-X
    Wino-X: https://huggingface.co/datasets/demelin/wino_x

    Subclasses set ``LANGUAGE_SHORT_CODE``; it is the suffix of the item fields that are
    read (``context_<code>``, ``option1_<code>``, ``option2_<code>``).
    """

    DATASET_PATH = "demelin/wino_x"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    LANGUAGE_SHORT_CODE = ""

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the completion selected by the item's ``answer``, prefixed with a space."""
        choices = self._extract_choices(item)
        # in winogrande answer is a string but in wino_x it is an int
        return f" {choices[ANSWER_STR_TO_NUM[str(item['answer'])]]}"

    def _extract_question(self, item: dict) -> str:
        """Return the context text before the blank, with doubled spaces collapsed.

        Raises:
            ValueError: if the context does not contain exactly one ``_``.
        """
        question, _ = item[f"context_{self.LANGUAGE_SHORT_CODE}"].split("_")
        # Collapse doubled spaces; replacing a single space with itself would do nothing.
        question = question.replace("  ", " ")
        return question.strip()

    def _extract_choices(self, item: dict) -> list[str]:
        """Return both options, each followed by the context text after the blank.

        Raises:
            ValueError: if the context does not contain exactly one ``_``.
        """
        _, choice_suffix = item[f"context_{self.LANGUAGE_SHORT_CODE}"].split("_")
        # Collapse doubled spaces; replacing a single space with itself would do nothing.
        choice_suffix = choice_suffix.replace("  ", " ")
        choices = [
            choice + choice_suffix
            for choice in [item[f"option1_{self.LANGUAGE_SHORT_CODE}"], item[f"option2_{self.LANGUAGE_SHORT_CODE}"]]
        ]
        return choices
44
+
45
+
46
class WINOX_DE(WINOX):
    """German Wino-X task; reads the ``*_de`` item fields of the ``lm_en_de`` subject."""

    NAME = "WINOX_DE"
    LANGUAGE = Language.DEU
    LANGUAGE_SHORT_CODE = "de"
    SUBJECTS = ["lm_en_de"]
51
+
52
+
53
class WINOX_FR(WINOX):
    """French Wino-X task; reads the ``*_fr`` item fields of the ``lm_en_fr`` subject."""

    NAME = "WINOX_FR"
    LANGUAGE = Language.FRA
    LANGUAGE_SHORT_CODE = "fr"
    SUBJECTS = ["lm_en_fr"]