eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,60 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
4
+ AccuracyLoglikelihood,
5
+ AccuracyNormLoglikelihood,
6
+ )
7
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType
8
+ from eval_framework.tasks.utils import get_n_letters
9
+
10
+
11
class BELEBELE(BaseTask[str]):
    """BELEBELE dataset: https://huggingface.co/datasets/facebook/belebele

    Multiple-choice task with four answer options per question, scored via the
    log-likelihood of each answer key.
    """

    NAME = "BELEBELE"
    DATASET_PATH = "facebook/belebele"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["eng_Latn"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task and the four answer keys used to label the options."""
        super().__init__(num_fewshot)

        self.keys = get_n_letters(4)
        # Maps the 1-based answer number (as a string) to its answer key.
        self.num_to_letter = {}
        for position, letter in enumerate(self.keys, start=1):
            self.num_to_letter[str(position)] = letter

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed preamble placed in front of the prompt."""
        return "The following are multiple choice questions (with answers)."

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Render the passage, the question and one line per answer option."""
        passage = item["flores_passage"].strip()
        question = item["question"].strip()
        answers = (item["mc_answer1"], item["mc_answer2"], item["mc_answer3"], item["mc_answer4"])
        option_lines = [f"{key}. {answer}\n" for key, answer in zip(self.keys, answers)]
        options = "".join(option_lines)
        return f"{passage}\n\nQuestion: {question}\n{options}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue followed by the correct answer key for a few-shot example."""
        answer = self._get_ground_truth(item)
        assert answer is not None
        cue = self._get_cue_text(item)
        return f"{cue}{answer}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the text that introduces the model's answer."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the correct answer key, prefixed with a single space."""
        # `correct_answer_num` is 1-based; `self.keys` is 0-based.
        index = int(item["correct_answer_num"]) - 1
        return f" {self.keys[index]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every answer key, each prefixed with a single space."""
        completions = []
        for key in self.keys:
            completions.append(f" {key}")
        return completions
@@ -0,0 +1,155 @@
1
+ import random
2
+ import re
3
+ from typing import Any
4
+
5
+ from eval_framework.metrics.completion.code_execution_pass_at_one import (
6
+ CodeExecutionPassAtOne,
7
+ CodeExecutionPassAtOneContext,
8
+ )
9
+ from eval_framework.tasks.base import (
10
+ RANDOM_SEED,
11
+ BaseTask,
12
+ Language,
13
+ ResponseType,
14
+ Sample,
15
+ SubjectType,
16
+ )
17
+ from eval_framework.tasks.utils import (
18
+ BIG_CODE_BENCH_PACKAGE_MAPPING,
19
+ CallableSerializer,
20
+ _parse_unittest_output,
21
+ unittest_merge_snippets,
22
+ )
23
+
24
# Instruction placed in front of every task description.
# Taken from https://arxiv.org/pdf/2406.15877 - Figure 14
PROMPT_INSTRUCTION = (
    "Please provide a self-contained Python script, "
    "without tests or example usage, "
    "that solves the following problem in a markdown code block:\n"
)


# Text that opens the model's response.
# Taken from https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/generate.py#L149
RESPONSE_PREFIX = (
    "Below is a Python script with a self-contained function "
    "that solves the problem and passes corresponding tests:\n"
)
34
+
35
+
36
class BigCodeBench(BaseTask[str]):
    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench

    Completion task whose generated code is scored by ``CodeExecutionPassAtOne``
    using the test code shipped with each item.
    """

    NAME = "BigCodeBench"
    DATASET_PATH = "bigcode/bigcodebench"
    SAMPLE_SPLIT = "v0.1.4"
    FEWSHOT_SPLIT = "v0.1.4"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeExecutionPassAtOne]
    SUBJECTS = ["original", "calibrated"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task; only zero-shot evaluation is accepted."""
        assert num_fewshot == 0, "Fewshot is not supported for BigCodeBench"
        # NOTE : this serializer should be the same class as initialized in the metric
        self.serializer = CallableSerializer()
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the dataset and keep the sample/few-shot splits.

        ``subject`` is not used for loading: the same data is loaded for every subject.
        The sample split is shuffled with a generator seeded by ``RANDOM_SEED``.
        """
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
        self.dataset = {}

        self.rnd = random.Random(RANDOM_SEED)

        wanted_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
        for split_name, split_data in hf_dataset.items():
            rows = list(split_data)

            if split_name == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)

            if split_name in wanted_splits:
                self.dataset[split_name] = rows

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction followed by the item's complete prompt."""
        task_description = item["complete_prompt"]
        return PROMPT_INSTRUCTION + task_description

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the response prefix, extended by the code prompt for the "calibrated" subject."""
        if item["subject"] == "calibrated":
            code_start = item["code_prompt"]
        else:
            code_start = ""
        return RESPONSE_PREFIX + code_start

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the canonical solution of the item."""
        # Not needed for evaluation, as it is test based given the generated code
        return item["canonical_solution"]

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return ``None``; no candidate completions are provided for this task."""
        return None

    def _get_context(self, item: dict[str, Any]) -> CodeExecutionPassAtOneContext:
        """Bundle what the code-execution metric needs for this item."""
        merge_fn = self.serializer.encode(unittest_merge_snippets)
        parse_fn = self.serializer.encode(_parse_unittest_output)
        return CodeExecutionPassAtOneContext(
            run_env="python:3.12",  # os.environ.get("DOCKER_CODE_EXECUTION"),
            code_prompt=item["code_prompt"],
            test_code=item["test"],
            snippet_merge_fn=merge_fn,
            output_parse_fn=parse_fn,
            package_downloads=BIG_CODE_BENCH_PACKAGE_MAPPING,
        )

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Turn the raw completion into the code that gets executed.

        For "calibrated" samples with a context, the completion continues the code prompt,
        so the prompt is put back in front of it. In every other case the code is extracted
        from the fenced block of the response.
        """
        if sample is None or sample.context is None or sample.subject != "calibrated":
            return extract_executable_code(completion_text)

        assert isinstance(sample.context, CodeExecutionPassAtOneContext), "Expected CodeExecutionPassAtOneContext"
        return sample.context.code_prompt + completion_text
99
+
100
+
101
class BigCodeBenchInstruct(BigCodeBench):
    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench

    Variant that builds the prompt from the item's ``instruct_prompt`` field.
    """

    NAME = "BigCodeBenchInstruct"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction followed by the item's instruct prompt."""
        task_description = item["instruct_prompt"]
        return PROMPT_INSTRUCTION + task_description
108
+
109
+
110
class BigCodeBenchHard(BigCodeBench):
    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard

    Same task as ``BigCodeBench`` but loaded from the ``bigcodebench-hard`` dataset path.
    """

    NAME = "BigCodeBenchHard"
    DATASET_PATH = "bigcode/bigcodebench-hard"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction followed by the item's complete prompt."""
        task_description = item["complete_prompt"]
        return PROMPT_INSTRUCTION + task_description
118
+
119
+
120
class BigCodeBenchHardInstruct(BigCodeBenchHard):
    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard

    Variant that builds the prompt from the item's ``instruct_prompt`` field.
    """

    NAME = "BigCodeBenchHardInstruct"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction followed by the item's instruct prompt."""
        task_description = item["instruct_prompt"]
        return PROMPT_INSTRUCTION + task_description
127
+
128
+
129
def extract_executable_code(llm_response: str) -> str:
    """Extract the code of the first relevant fenced block from an LLM response.

    Patterns are tried in order of priority and the first hit wins:

    1. a Python block nested inside a ``markdown`` block,
    2. a Python block,
    3. a ``markdown`` block,
    4. any fenced block.

    A Python block is recognised by the info strings ``python``, ``python3`` or ``py``,
    matched case-insensitively, so responses using e.g. "```py" or "```Python" no longer
    leak the language tag into the extracted code via the generic fallback.

    Args:
        llm_response: Raw text produced by the model.

    Returns:
        The stripped content of the first matching block, or ``llm_response`` unchanged
        when it contains no fenced block.
    """
    # `python3?\b` and `py\b` require the tag to end right there (so "pytest" is not treated
    # as a Python tag); the bare `python` alternative keeps the previous behaviour for
    # inputs where text follows "```python" directly.
    python_tag = r"(?:python3?\b|py\b|python)"
    patterns = (
        rf"```markdown.*?```{python_tag}\s*(.*?)\s*```",  # nested markdown+python
        rf"```{python_tag}\s*(.*?)\s*```",  # python code block
        r"```markdown\s*(.*?)\s*```",  # markdown-only code block
        r"```\s*(.*?)\s*```",  # generic code block as fallback
    )
    for pattern in patterns:
        match = re.search(pattern, llm_response, re.DOTALL | re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # If no code blocks found, return original response
    return llm_response
@@ -0,0 +1,47 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
5
+ AccuracyLoglikelihood,
6
+ AccuracyNormLoglikelihood,
7
+ )
8
+ from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType
9
+
10
+
11
class CASEHOLD(BaseTask[str]):
    """CaseHold task loaded from the ``lex_glue`` dataset (subject ``case_hold``).

    Each context contains a ``(<HOLDING>)`` placeholder; the candidate completions are the
    item's endings, each followed by the text that comes after the placeholder.
    """

    NAME = "CaseHold"
    DATASET_PATH = "lex_glue"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["case_hold"]
    LANGUAGE = Language.ENG

    def _load_dataset(self, subject: str) -> None:
        """Load the sample/few-shot splits, keeping items with exactly one placeholder.

        The sample split is shuffled with a generator seeded by ``RANDOM_SEED`` before
        the filter is applied.
        """
        config_name = None if subject == NO_SUBJECT else subject

        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=config_name)
        self.dataset = {}

        self.rnd = random.Random(RANDOM_SEED)

        wanted_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
        for split_name, split_data in hf_dataset.items():
            rows = list(split_data)

            if split_name == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)

            if split_name in wanted_splits:
                kept = []
                for row in rows:
                    if row["context"].count("(<HOLDING>)") == 1:
                        kept.append(row)
                self.dataset[split_name] = kept

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the part of the context in front of the placeholder."""
        before, _, _ = item["context"].partition("(<HOLDING>)")
        return before

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the labelled ending followed by the text after the placeholder."""
        pieces = item["context"].split("(<HOLDING>)", maxsplit=1)
        trailing_text = pieces[1]
        correct_ending = item["endings"][item["label"]]
        return f"{correct_ending}{trailing_text}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every ending followed by the text after the placeholder."""
        pieces = item["context"].split("(<HOLDING>)", maxsplit=1)
        trailing_text = pieces[1]
        completions = []
        for ending in item["endings"]:
            completions.append(f"{ending}{trailing_text}")
        return completions
@@ -0,0 +1,85 @@
1
+ import json
2
+ from typing import Any
3
+
4
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
5
+ AccuracyLoglikelihood,
6
+ AccuracyNormLoglikelihood,
7
+ )
8
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType
9
+ from eval_framework.tasks.utils import get_n_letters
10
+
11
# Subjects evaluated by the ChemBench task.
CHEMBENCH_SUBJECTS = list(
    (
        "analytical_chemistry",
        "chemical_preference",
        "general_chemistry",
        "inorganic_chemistry",
        "materials_science",
        "organic_chemistry",
        "physical_chemistry",
        "technical_chemistry",
        "toxicity_and_safety",
    )
)
22
+
23
+
24
class ChemBench(BaseTask[str]):
    """ChemBench dataset: https://huggingface.co/datasets/jablonkagroup/ChemBench

    Only multiple-choice items with exactly one correct option are evaluated; the options
    are labelled with answer keys and scored via log-likelihood.
    """

    NAME = "ChemBench"
    DATASET_PATH = "jablonkagroup/ChemBench"
    SAMPLE_SPLIT = "train"  # Only has train split
    FEWSHOT_SPLIT = "train"  # Only has train split
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = CHEMBENCH_SUBJECTS
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task; only zero-shot evaluation is accepted."""
        assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
        super().__init__(num_fewshot)

        self.keys = get_n_letters(16)

    def _load_dataset(self, subject: str) -> None:
        """Load the dataset, then drop every item that is not single-answer multiple choice."""
        super()._load_dataset(subject)

        def has_single_correct_option(item: dict[str, Any]) -> bool:
            # Keep only the multiple-choice options with 1 correct answer
            if item.get("metrics") != ["multiple_choice_grade"]:
                return False
            scores = json.loads(item["examples"][0]["target_scores"])
            hits = [pos for pos, score in enumerate(scores.values()) if score == 1.0]
            return len(hits) == 1

        for split_name in self.dataset.keys():
            self.dataset[split_name] = [
                item for item in self.dataset[split_name] if has_single_correct_option(item)
            ]

    def _get_subject_name(self, item: dict[str, Any]) -> str:
        """Return the subject with underscores replaced by spaces."""
        return item["subject"].replace("_", " ")

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed preamble placed in front of the prompt."""
        return (
            "The following is a question about chemistry. "
            "Please answer by responding with the letter of the correct answer."
        )

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Render the question followed by one line per answer option."""
        example = item["examples"][0]
        question = example["input"].strip()
        scores = json.loads(example["target_scores"])
        option_lines = [f"{key}. {option}\n" for key, option in zip(self.keys, scores)]
        options = "".join(option_lines)
        return f"Question: {question}\n{options}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue followed by the correct answer key."""
        answer = self._get_ground_truth(item)
        cue = self._get_cue_text(item)
        return f"{cue}{answer}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the text that introduces the model's answer."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the key of the single option scored 1.0, prefixed with a space."""
        scores = json.loads(item["examples"][0]["target_scores"])
        hits = [pos for pos, score in enumerate(scores.values()) if score == 1.0]
        assert len(hits) == 1, f"Expected exactly one correct answer, but got {len(hits)}"
        return f" {self.keys[hits[0]]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return one space-prefixed key per answer option of the item."""
        scores = json.loads(item["examples"][0]["target_scores"])
        option_count = len(scores)
        completions = []
        for key in self.keys[:option_count]:
            completions.append(f" {key}")
        return completions
@@ -0,0 +1,39 @@
1
+ from typing import Any
2
+
3
+ from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
4
+ AccuracyLoglikelihood,
5
+ AccuracyNormLoglikelihood,
6
+ )
7
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType
8
+
9
+
10
class COPA(BaseTask[str]):
    """COPA dataset: https://huggingface.co/datasets/aps/super_glue

    The premise is joined to each candidate with "because" (cause questions) or
    "therefore" (effect questions) and the two candidates are scored via log-likelihood.
    """

    NAME = "COPA"
    DATASET_PATH = "aps/super_glue"
    SAMPLE_SPLIT = "validation"  # 100 examples (same split as lm-eval)
    # SuperGLUE does not publish the labels of its test split (they are stored as -1), so
    # few-shot demonstrations taken from it would always present `choice2` as the answer.
    # Use the labelled train split instead.
    FEWSHOT_SPLIT = "train"  # 400 examples
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["copa"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["because", "therefore"]
    LANGUAGE = Language.ENG

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the premise without its final character, followed by the connector.

        Raises:
            KeyError: If ``item["question"]`` is neither "cause" nor "effect".
        """
        connector = {
            "cause": "because",
            "effect": "therefore",
        }[item["question"]]
        # The last character of the premise (its closing punctuation) is dropped so that the
        # connector continues the sentence.
        return item["premise"].strip()[:-1] + f" {connector} "

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the labelled choice (label 0 -> choice1, otherwise choice2), decapitalized."""
        correct_choice = item["choice1"] if item["label"] == 0 else item["choice2"]
        return f"{self.convert_choice(correct_choice)}"

    def convert_choice(self, choice: str) -> str:
        """Lower-case the first character of ``choice`` so it can continue a sentence.

        An empty string is returned unchanged instead of raising ``IndexError``.
        """
        return choice[:1].lower() + choice[1:]

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return both choices, decapitalized, in dataset order."""
        choices = [self.convert_choice(item["choice1"]), self.convert_choice(item["choice2"])]
        return choices
@@ -0,0 +1,91 @@
1
+ import random
2
+ import re
3
+ from abc import ABC
4
+ from typing import Any
5
+
6
+ from eval_framework.metrics.base import BaseMetric
7
+ from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
8
+ from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
9
+
10
+
11
class DUC(BaseTask[str], ABC):
    """https://huggingface.co/datasets/midas/duc2001

    Shared base for the DUC keyphrase tasks; subclasses provide the ground truth.
    """

    DATASET_PATH: str = "midas/duc2001"
    SAMPLE_SPLIT: str = "test"
    FEWSHOT_SPLIT: str = "test"
    RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
    METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
    SUBJECTS: list[str] = ["raw"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task with its stop sequence and generation budget."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["Text:"]
        self.max_tokens = 50  # longest keyphrase is less than 50 characters long

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at the first stop sequence and strip surrounding whitespace."""
        trimmed = completion_text
        for marker in self.stop_sequences:
            # `partition` leaves the text untouched when the marker is absent.
            trimmed = trimmed.partition(marker)[0]
        return trimmed.strip()

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Join the document tokens into text and append the keyphrase cue."""
        joined = " ".join(item["document"])
        # Remove the whitespace that joining puts in front of punctuation marks.
        document_text = re.sub(r"\s+([.,!?;:])", r"\1", joined)
        return f"Text: {document_text}\nKeyphrase:"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the first ground-truth keyphrase, prefixed with a space."""
        keyphrases = self._get_ground_truth(item)
        assert keyphrases is not None
        assert isinstance(keyphrases, list)
        first_keyphrase = keyphrases[0]
        return f" {first_keyphrase}"
46
+
47
+
48
class DUC_EXTRACTIVE(DUC):
    """DUC variant whose ground truth is the item's ``extractive_keyphrases``."""

    NAME = "DUC Extractive"
    SUBJECTS: list[str] = ["raw"]

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """Return the extractive keyphrases of the item."""
        keyphrases = item["extractive_keyphrases"]
        return keyphrases

    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the system prompt describing the extraction task."""
        role = "You are an AI model tasked with extracting keyphrases from a text document. "
        rule = "Keyphrases should capture main ideas or significant topics exactly as worded in the text."
        return role + rule
60
+
61
+
62
class DUC_ABSTRACTIVE(DUC):
    """DUC variant whose ground truth is the item's ``abstractive_keyphrases``."""

    NAME = "DUC Abstractive"
    SUBJECTS: list[str] = ["raw"]

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """Return the abstractive keyphrases of the item."""
        keyphrases = item["abstractive_keyphrases"]
        return keyphrases

    def _load_dataset(self, subject: str) -> None:
        """Load the sample/few-shot splits, keeping items that have abstractive keyphrases.

        The sample split is shuffled with a generator seeded by ``RANDOM_SEED``.
        """
        # not all samples have abstractive keyphrases
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
        self.dataset = {}

        wanted_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
        for split_name, split_data in hf_dataset.items():
            rows = [row for row in split_data if len(row["abstractive_keyphrases"]) > 0]

            if split_name == self.SAMPLE_SPLIT:
                self.rnd = random.Random(RANDOM_SEED)
                self.rnd.shuffle(rows)

            if split_name in wanted_splits:
                self.dataset[split_name] = rows

    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the system prompt describing the abstractive task."""
        role = "You are an AI model tasked with generating abstractive keyphrases "
        rule = "that capture the main ideas of the text without using exact wording."
        return role + rule

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed preamble placed in front of the prompt."""
        return "Paraphrase the following texts to improve clarity and relevance."
@@ -0,0 +1,62 @@
1
+ from typing import Any
2
+
3
+ import pycountry
4
+
5
+ from eval_framework.metrics.completion.bleu import BLEU
6
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
7
+
8
# Note: there are many more languages in the dataset, but we only consider these for now
FLORES_LANGUAGES = list(
    (
        "deu_Latn",
        "eng_Latn",
        "fin_Latn",
        "fra_Latn",
        "nld_Latn",
    )
)
15
+
16
+
17
class Flores200(BaseTask[str]):
    """FLoRes-200 dataset: https://huggingface.co/datasets/facebook/flores

    Translation task: every subject is a "<source>-<target>" pair built from
    ``FLORES_LANGUAGES`` and the model completes the target-language sentence.
    """

    NAME = "FLoRes-200"
    DATASET_PATH = "facebook/flores"
    SAMPLE_SPLIT = "devtest"
    FEWSHOT_SPLIT = "dev"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [BLEU]
    SUBJECTS = [f"{s}-{t}" for s in FLORES_LANGUAGES for t in FLORES_LANGUAGES if s != t]
    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
    LANGUAGE = {
        "deu_Latn": Language.DEU,
        "eng_Latn": Language.ENG,
        "fin_Latn": Language.FIN,
        "fra_Latn": Language.FRA,
        "nld_Latn": Language.NLD,
    }

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; generation stops at the first newline."""
        super().__init__(num_fewshot)

        self.stop_sequences = ["\n"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Render the source sentence and the cue for the target-language sentence.

        Language names are looked up with ``pycountry`` from the three-letter code in
        front of the underscore in each half of the subject.
        """
        source_key = item["subject"].split("-")[0]
        source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
        source = item[f"sentence_{source_key}"]
        instruction = f"{source_language} sentence: {source}\n"
        target_key = item["subject"].split("-")[1]
        target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name

        return f"{instruction}{target_language} sentence:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the sentence in the target language of the item's subject."""
        target_key = item["subject"].split("-")[1]
        return item[f"sentence_{target_key}"]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the ground-truth translation, prefixed with a space.

        The checks run on the raw ground truth: validating the already formatted string
        could never fail and would let a missing translation through as " None".
        """
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        assert isinstance(ground_truth, str)
        return f" {ground_truth}"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Strip surrounding whitespace from the completion."""
        return completion_text.strip()
@@ -0,0 +1,84 @@
1
+ import random
2
+ from itertools import product
3
+ from typing import Any
4
+
5
+ from eval_framework.metrics.completion.bleu import BLEU
6
+ from eval_framework.metrics.completion.chrf import CHRF
7
+ from eval_framework.metrics.completion.comet import COMET
8
+ from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
9
+ from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
10
+
11
# Maps each supported "<language code>_<script code>" config name to the English language
# name used in the translation prompt. Insertion order matters: it fixes the subject order.
LANG_MAP = dict(
    deu_Latn="German",
    eng_Latn="English",
    fra_Latn="French",
    ita_Latn="Italian",
    nld_Latn="Dutch",
    pol_Latn="Polish",
    rus_Cyrl="Russian",
    spa_Latn="Spanish",
    ukr_Cyrl="Ukrainian",
)
22
+
23
+
24
class FloresPlus(BaseTask[str]):
    """Flores-Plus dataset: https://huggingface.co/datasets/openlanguagedata/flores_plus

    Translation task: every subject is a "<source>-<target>" pair built from ``LANG_MAP``.
    """

    NAME = "Flores-Plus"
    DATASET_PATH = "openlanguagedata/flores_plus"
    SAMPLE_SPLIT = "dev"
    FEWSHOT_SPLIT = "devtest"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [BLEU, CHRF, COMET]
    SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
    LANGUAGE = {
        "deu_Latn": Language.DEU,
        "eng_Latn": Language.ENG,
        "fra_Latn": Language.FRA,
        "ita_Latn": Language.ITA,
        "nld_Latn": Language.NLD,
        "pol_Latn": Language.POL,
        "rus_Cyrl": Language.RUS,
        "spa_Latn": Language.SPA,
        "ukr_Cyrl": Language.UKR,
    }

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; generation stops at the first newline."""
        super().__init__(num_fewshot)
        self.stop_sequences = ["\n"]

    def _load_dataset(self, subject: str) -> None:
        """Load source and target language data and pair them up item by item.

        The two halves of ``subject`` select the dataset configs. Paired items must share
        the same ``id``. The sample split is shuffled with a generator seeded with 42.
        """
        subject_parts = subject.split("-")
        hf_dataset_src = self._load_hf_dataset(path=self.DATASET_PATH, name=subject_parts[0])
        hf_dataset_tgt = self._load_hf_dataset(path=self.DATASET_PATH, name=subject_parts[1])
        self.dataset = {}
        self.rnd = random.Random(42)

        for split_name in (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT):
            source_rows = hf_dataset_src[split_name]
            target_rows = hf_dataset_tgt[split_name]
            pairs = []
            for source_row, target_row in zip(source_rows, target_rows):
                assert source_row["id"] == target_row["id"]
                pair = {
                    "iso_source": f"{source_row['iso_639_3']}_{source_row['iso_15924']}",
                    "iso_target": f"{target_row['iso_639_3']}_{target_row['iso_15924']}",
                    "source": source_row["text"],
                    "target": target_row["text"],
                }
                pairs.append(pair)
            if split_name == self.SAMPLE_SPLIT:
                self.rnd.shuffle(pairs)
            self.dataset[split_name] = pairs

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the translation instruction followed by the source text."""
        language_name = LANG_MAP[item["iso_target"]]
        source_text = item["source"]
        return f"Translate the following text into {language_name}:\n{source_text}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the target-language text of the item."""
        reference = item["target"]
        return reference

    def _get_context(self, item: dict[str, Any]) -> BaseMetricContext | list[BaseMetricContext] | None:
        """Expose the raw source text to the metrics as an ``UntemplatedPrompt``."""
        source_text = item["source"]
        return UntemplatedPrompt(untemplated_prompt=source_text)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Strip surrounding whitespace from the completion."""
        return completion_text.strip()