eval_framework-0.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (170)
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +177 -0
  5. eval_framework/context/eval.py +121 -0
  6. eval_framework/context/local.py +78 -0
  7. eval_framework/evaluation_generator.py +234 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +432 -0
  16. eval_framework/llm/base.py +180 -0
  17. eval_framework/llm/huggingface.py +418 -0
  18. eval_framework/llm/mistral.py +88 -0
  19. eval_framework/llm/models.py +28 -0
  20. eval_framework/llm/openai.py +400 -0
  21. eval_framework/llm/vllm.py +554 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +166 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/aidanbench.py +28 -0
  29. eval_framework/metrics/completion/bleu.py +76 -0
  30. eval_framework/metrics/completion/chrf.py +62 -0
  31. eval_framework/metrics/completion/code_assertion.py +44 -0
  32. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  33. eval_framework/metrics/completion/comet.py +56 -0
  34. eval_framework/metrics/completion/concordance_index.py +38 -0
  35. eval_framework/metrics/completion/csv_format.py +102 -0
  36. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  37. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  38. eval_framework/metrics/completion/f1.py +42 -0
  39. eval_framework/metrics/completion/format_checker.py +56 -0
  40. eval_framework/metrics/completion/grid_difference.py +77 -0
  41. eval_framework/metrics/completion/ifeval.py +73 -0
  42. eval_framework/metrics/completion/json_format.py +179 -0
  43. eval_framework/metrics/completion/language_checker.py +74 -0
  44. eval_framework/metrics/completion/length_control.py +83 -0
  45. eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
  46. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  47. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  48. eval_framework/metrics/completion/repetition.py +88 -0
  49. eval_framework/metrics/completion/rouge_1.py +35 -0
  50. eval_framework/metrics/completion/rouge_2.py +45 -0
  51. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  52. eval_framework/metrics/completion/rouge_l.py +52 -0
  53. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  54. eval_framework/metrics/completion/ter.py +67 -0
  55. eval_framework/metrics/completion/text_counter.py +182 -0
  56. eval_framework/metrics/efficiency/__init__.py +0 -0
  57. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  58. eval_framework/metrics/llm/__init__.py +0 -0
  59. eval_framework/metrics/llm/base.py +34 -0
  60. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  61. eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
  62. eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
  63. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  64. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  65. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  66. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  67. eval_framework/metrics/llm/graders/language.py +56 -0
  68. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  69. eval_framework/metrics/llm/graders/models.py +74 -0
  70. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  71. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  72. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  73. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  74. eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
  75. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  76. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  77. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  78. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  79. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  80. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
  81. eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
  82. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  83. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  84. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  85. eval_framework/metrics/llm/utils.py +20 -0
  86. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  87. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  88. eval_framework/metrics/loglikelihood/base.py +50 -0
  89. eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
  90. eval_framework/metrics/loglikelihood/dcs.py +43 -0
  91. eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
  92. eval_framework/metrics/loglikelihood/ternary.py +42 -0
  93. eval_framework/py.typed +0 -0
  94. eval_framework/response_generator.py +351 -0
  95. eval_framework/result_processors/__init__.py +0 -0
  96. eval_framework/result_processors/base.py +88 -0
  97. eval_framework/result_processors/hf_uploader.py +75 -0
  98. eval_framework/result_processors/result_processor.py +129 -0
  99. eval_framework/result_processors/wandb_uploader.py +137 -0
  100. eval_framework/run.py +369 -0
  101. eval_framework/run_direct.py +42 -0
  102. eval_framework/shared/types.py +227 -0
  103. eval_framework/tasks/__init__.py +6 -0
  104. eval_framework/tasks/base.py +392 -0
  105. eval_framework/tasks/benchmarks/__init__.py +0 -0
  106. eval_framework/tasks/benchmarks/aidanbench.py +211 -0
  107. eval_framework/tasks/benchmarks/arc.py +70 -0
  108. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  109. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  110. eval_framework/tasks/benchmarks/belebele.py +60 -0
  111. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  112. eval_framework/tasks/benchmarks/casehold.py +47 -0
  113. eval_framework/tasks/benchmarks/chembench.py +85 -0
  114. eval_framework/tasks/benchmarks/copa.py +64 -0
  115. eval_framework/tasks/benchmarks/duc.py +91 -0
  116. eval_framework/tasks/benchmarks/flores200.py +133 -0
  117. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  118. eval_framework/tasks/benchmarks/gpqa.py +201 -0
  119. eval_framework/tasks/benchmarks/gsm8k.py +150 -0
  120. eval_framework/tasks/benchmarks/hellaswag.py +69 -0
  121. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  122. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  123. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  124. eval_framework/tasks/benchmarks/include.py +119 -0
  125. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  126. eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
  127. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  128. eval_framework/tasks/benchmarks/mmlu.py +215 -0
  129. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  130. eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
  131. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  132. eval_framework/tasks/benchmarks/openbookqa.py +85 -0
  133. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  134. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  135. eval_framework/tasks/benchmarks/piqa.py +64 -0
  136. eval_framework/tasks/benchmarks/quality.py +56 -0
  137. eval_framework/tasks/benchmarks/sciq.py +110 -0
  138. eval_framework/tasks/benchmarks/sphyr.py +79 -0
  139. eval_framework/tasks/benchmarks/squad.py +211 -0
  140. eval_framework/tasks/benchmarks/struct_eval.py +116 -0
  141. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  142. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  143. eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
  144. eval_framework/tasks/benchmarks/winogender.py +64 -0
  145. eval_framework/tasks/benchmarks/winogrande.py +69 -0
  146. eval_framework/tasks/benchmarks/winox.py +57 -0
  147. eval_framework/tasks/benchmarks/wmt.py +160 -0
  148. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  149. eval_framework/tasks/eval_config.py +136 -0
  150. eval_framework/tasks/perturbation.py +83 -0
  151. eval_framework/tasks/registry.py +186 -0
  152. eval_framework/tasks/task_loader.py +81 -0
  153. eval_framework/tasks/task_names.py +324 -0
  154. eval_framework/tasks/utils.py +584 -0
  155. eval_framework/utils/constants.py +9 -0
  156. eval_framework/utils/file_ops.py +245 -0
  157. eval_framework/utils/generate_task_docs.py +244 -0
  158. eval_framework/utils/helpers.py +32 -0
  159. eval_framework/utils/logging.py +62 -0
  160. eval_framework/utils/packaging.py +52 -0
  161. eval_framework/utils/tqdm_handler.py +14 -0
  162. eval_framework-0.2.7.dist-info/METADATA +548 -0
  163. eval_framework-0.2.7.dist-info/RECORD +170 -0
  164. eval_framework-0.2.7.dist-info/WHEEL +4 -0
  165. eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
  166. template_formatting/README.md +83 -0
  167. template_formatting/__init__.py +0 -0
  168. template_formatting/formatter.py +537 -0
  169. template_formatting/mistral_formatter.py +159 -0
  170. template_formatting/py.typed +0 -0

eval_framework/metrics/completion/text_counter.py
@@ -0,0 +1,182 @@
import re

import nltk

from eval_framework.metrics.base import (
    BaseMetric,
    MetricResult,
)
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric

ALPHABETS = "([A-Za-z])"
PREFIXES = "(Mr|St|Mrs|Ms|Dr|www)[.]"
SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
STARTERS = (
    r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
)
ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
WEBSITES = "[.](com|net|org|io|gov|edu|me)"
DIGITS = "([0-9])"
MULTIPLE_DOTS = r"\.{2,}"


class WordCounterMetricContext(BaseMetricContext):
    comparison: str
    word_count: int


class WordCounter(BaseMetric[Completion]):
    NAME = "Word Count"

    @staticmethod
    def _count_words(text: str) -> int:
        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
        tokens = tokenizer.tokenize(text)
        num_words = len(tokens)
        return num_words

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, WordCounterMetricContext)

        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"

        num_words = self._count_words(response.completion)
        if context.comparison == "less than":
            valid_word_count = num_words < context.word_count
        if context.comparison == "at least":
            valid_word_count = num_words >= context.word_count

        return [
            MetricResult(
                metric_name=self.NAME, value=float(valid_word_count), higher_is_better=True, error=response.error
            )
        ]


class SentenceCounterMetricContext(BaseMetricContext):
    comparison: str
    sentence_count: int


class SentenceCounter(BaseMetric[Completion]):
    NAME = "Sentence Count"

    @staticmethod
    def _count_sentences(text: str) -> int:
        # Note that nltk.tokenize.sent_tokenize would be a straightforward alternative but is also not ideal. Example:
        #
        # "Mr. Jones gave me $10,000.00... And then he left. Numbers 5...10. Numbers 5..10. Review: bad food,
        # bad service,..., so I'd miss it."
        #
        # this: ['Mr. Jones gave me $10,000.00...', 'And then he left.', 'Numbers 5...', '10.', 'Numbers 5..', '10.',
        # 'Review: bad food, bad service,...', ", so I'd miss it."].
        # nltk: ['Mr. Jones gave me $10,000.00... And then he left.', 'Numbers 5...10.',
        # "Numbers 5..10. Review: bad food, bad service,..., so I'd miss it."]

        text = f" {text} "
        text = text.replace("\n", " ")
        text = re.sub(PREFIXES, "\\1<prd>", text)
        text = re.sub(WEBSITES, "<prd>\\1", text)
        text = re.sub(DIGITS + "[.]" + DIGITS, "\\1<prd>\\2", text)
        text = re.sub(
            MULTIPLE_DOTS,
            lambda match: "<prd>" * len(match.group(0)) + "<stop>",
            text,
        )
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + ALPHABETS + "[.] ", " \\1<prd> ", text)
        text = re.sub(ACRONYMS + " " + STARTERS, "\\1<stop> \\2", text)
        text = re.sub(
            ALPHABETS + "[.]" + ALPHABETS + "[.]" + ALPHABETS + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>",
            text,
        )
        text = re.sub(ALPHABETS + "[.]" + ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + SUFFIXES + "[.] " + STARTERS, " \\1<stop> \\2", text)
        text = re.sub(" " + SUFFIXES + "[.]", " \\1<prd>", text)
        text = re.sub(" " + ALPHABETS + "[.]", " \\1<prd>", text)
        text = text.replace(".”", "”.")
        text = text.replace('."', '".')
        text = text.replace('!"', '"!')
        text = text.replace('?"', '"?')
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = [s.strip() for s in sentences]
        if sentences and not sentences[-1]:
            sentences = sentences[:-1]
        return len(sentences)

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, SentenceCounterMetricContext)

        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"

        num_sentences = self._count_sentences(response.completion)
        if context.comparison == "less than":
            valid_sentence_count = num_sentences < context.sentence_count
        elif context.comparison == "at least":
            valid_sentence_count = num_sentences >= context.sentence_count

        return [
            MetricResult(
                metric_name=self.NAME, value=float(valid_sentence_count), higher_is_better=True, error=response.error
            )
        ]


class ParagraphCounterMetricContext(BaseMetricContext):
    comparison: str
    paragraph_count: int


class ParagraphCounter(BaseMetric[Completion]):
    NAME = "Paragraph Count"

    @staticmethod
    def _count_paragraphs(text: str) -> int:
        paragraphs = re.split(r"\s?\n\n\s?", text)
        return len(paragraphs)

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        context = extract_context_metric(response, ParagraphCounterMetricContext)

        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"

        num_paragraphs = self._count_paragraphs(response.completion)
        if context.comparison == "less than":
            valid_paragraph_count = num_paragraphs < context.paragraph_count
        elif context.comparison == "at least":
            valid_paragraph_count = num_paragraphs >= context.paragraph_count

        return [
            MetricResult(
                metric_name=self.NAME, value=float(valid_paragraph_count), higher_is_better=True, error=response.error
            )
        ]


class ResponseToOriginalLengthRatio(BaseMetric[Completion]):
    NAME = "Response to Original Length Ratio"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        len_original = len(response.last_user_instruction)
        if len_original > 0:
            score = len(response.completion) / len_original
            return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=False, error=response.error)]
        else:
            return []
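
The counting helpers above are plain static methods, so they can be exercised in isolation without constructing a full Completion object. A minimal sketch, not part of the package, assuming eval-framework and nltk are installed:

    from eval_framework.metrics.completion.text_counter import SentenceCounter, WordCounter

    text = "Mr. Jones gave me $10,000.00... And then he left. Numbers 5...10."

    # RegexpTokenizer(r"\w+") counts word-like tokens, so "$10,000.00" contributes multiple tokens
    print(WordCounter._count_words(text))

    # Rule-based splitter that protects prefixes, acronyms, decimals and ellipses before splitting
    print(SentenceCounter._count_sentences(text))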

eval_framework/metrics/efficiency/bytes_per_sequence_position.py
@@ -0,0 +1,48 @@
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, Loglikelihood


class BytesLoglikelihood(BaseMetric[Loglikelihood]):
    NAME = "Bytes"

    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        if response.error or response.concat_compression is None:
            value = None
        else:
            value = response.concat_compression.num_bytes

        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]


class SequencePositionsLoglikelihood(BaseMetric[Loglikelihood]):
    NAME = "SequencePositions"

    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        if response.error or response.concat_compression is None:
            value = None
        else:
            value = response.concat_compression.num_tokens
        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]


class BytesCompletion(BaseMetric[Completion]):
    NAME = "Bytes"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error or response.concat_compression is None:
            value = None
        else:
            value = response.concat_compression.num_bytes

        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]


class SequencePositionsCompletion(BaseMetric[Completion]):
    NAME = "SequencePositions"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error or response.concat_compression is None:
            value = None
        else:
            value = response.concat_compression.num_tokens
        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]
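
These four metrics only report the raw counts stored on concat_compression; nothing in this file divides them. An illustrative calculation of the bytes-per-sequence-position figure the module name alludes to (not package-provided; the values are made up):

    # Hypothetical values as reported by the Bytes and SequencePositions metrics for one response
    num_bytes = 2048
    num_tokens = 512

    bytes_per_sequence_position = num_bytes / num_tokens  # 4.0 bytes per token position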

eval_framework/metrics/llm/base.py
@@ -0,0 +1,34 @@
import traceback

from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, Error


class BaseLLMJudgeMetric(BaseMetric[Completion]):
    def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None:
        self._llm_judge = llm_judge
        self._randomize_order = randomize_order

    def _create_metric_result(
        self,
        metric_name: str,
        higher_is_better: bool,
        value: float | None,
        llm_judge_prompt: str | None = None,
        llm_judge_response: str | None = None,
        code_execution_trace: str | None = None,
        error: Error | None = None,
    ) -> MetricResult:
        """Helper method to create MetricResult with consistent structure."""
        return MetricResult(
            metric_name=metric_name,
            value=value,
            higher_is_better=higher_is_better,
            llm_judge_prompt=llm_judge_prompt,
            llm_judge_response=llm_judge_response,
            code_execution_trace=code_execution_trace,
            error=Error(error_class=error.__class__.__name__, message=str(error), traceback=traceback.format_exc())
            if error
            else None,
        )
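
BaseLLMJudgeMetric defines no calculate() of its own; the judge metrics in this package subclass it and route results, including failures, through _create_metric_result. A hedged sketch of that pattern (the subclass is hypothetical and not part of the package; passing the raised exception to error= follows how the helper wraps it into an Error with a formatted traceback):

    class ExampleJudgeMetric(BaseLLMJudgeMetric):
        NAME = "Example Judge"

        def calculate(self, response: Completion) -> list[MetricResult]:
            try:
                graded_value = 1.0  # placeholder for a real call to self._llm_judge
                return [self._create_metric_result(self.NAME, True, graded_value)]
            except Exception as exc:
                # the helper records the exception's class name, message and traceback
                return [self._create_metric_result(self.NAME, True, None, error=exc)]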

eval_framework/metrics/llm/graders/chatbot_style_grader.py
@@ -0,0 +1,92 @@
from collections.abc import Mapping

from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output


class ChatbotStyleGradingOutput(GradingOutput):
    thought_process: str | None
    is_chatbot_style: bool | None


class ChatbotStyleGrader:
    COMPLETION_KEY = "completion"
    PROMPT_TEMPLATES = {
        Language("de"): PromptTemplate(
            system_prompt="""Deine Aufgabe ist es zu klassifizieren, ob eine von einem Textgenerator gelieferte Antwort dem Stile eines Chatbots entspricht.

Hier sind einige Schlüsselmerkmale einer Antwort im Stile eines Chatbots:
* Sie leitet den Hauptinhalt mit Phrasen wie "Natürlich, ich helfe Dir gerne", "Na klar!" oder "Selbstverständlich kann ich" ein.
* Sie endet mit Phrasen wie "Ich hoffe, ich konnte Dir weiterhelfen!"
* Sie stellt Nachfragen an den Benutzer.
* Sie neigt dazu, überaus wortreich zu sein.
* Sie enthält Gesprächs- und Unterhaltungsfloskeln.
* Sie enthält Text, der zum Verständnis des Inhalts nicht zwingend notwendig ist.
* Sie bewahrt einen überaus freundlichen Ton.

Beachte, dass die Erfüllung von nur einem dieser Merkmale ausreicht, um die Antwort als Chatbot-Stil zu klassifizieren.

Gebe deine Bewertung in folgendem JSON-Format:
{
"thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Antwort dem Chatbot-Stil folgt oder nicht),
"is_chatbot_style": bool
}""",  # noqa: E501
            user_prompt=f"""**Antwort des Textgenerators**
{{{COMPLETION_KEY}}}""",
        ),
        Language("en"): PromptTemplate(
            system_prompt="""Your task is to classify if a text generation model's response follows a chatbot-style format.

Here are some key characteristics of a chatbot-style response:
* It introduces the main content with phrases like "Certainly, here is", "Sure!" or "Of course."
* It ends with phrases such as "I hope this helps!"
* It asks follow-up questions.
* It tends to be verbose.
* It tends to contain fluff that is not necessary to understand the content.
* It maintains a friendly tone.

Note that even one of these characteristics is enough to classify the response as following a chatbot-style format.

You must provide your evaluation in the following JSON format:
{
"thought_process": str (Pay very close attention to the response and argue whether the response follows a chatbot-style or not in a few sentences),
"is_chatbot_style": bool
}""",  # noqa: E501
            user_prompt=f"""**Model Response**:
{{{COMPLETION_KEY}}}""",
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        self._grading_model = grading_model

        if not all(self.COMPLETION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()):
            raise ValueError(f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}'.")
        self._prompt_templates = prompt_templates

    def grade(self, completion: str, language: Language) -> ChatbotStyleGradingOutput:
        try:
            prompt_template = language.language_config(self._prompt_templates)
        except Exception as _:
            prompt_template = Language("en").language_config(self._prompt_templates)

        messages = prompt_template.to_messages(
            [],
            [
                (self.COMPLETION_KEY, completion),
            ],
        )
        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        loaded_json = parse_json_output(raw_completion.completion)

        return ChatbotStyleGradingOutput(
            thought_process=loaded_json.get("thought_process", None),
            is_chatbot_style=loaded_json.get("is_chatbot_style", None),
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
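
For reference, grade() expects the judge to reply with a JSON object and maps it onto the output model, with missing keys falling back to None. An illustrative instance (the values are made up; in practice grade() builds this from the judge's reply):

    # The judge is prompted to answer with JSON of the shape
    #   {"thought_process": "...", "is_chatbot_style": true}
    output = ChatbotStyleGradingOutput(
        thought_process="Opens with 'Sure!' and closes with 'I hope this helps!', so it reads like a chatbot.",
        is_chatbot_style=True,
        judge_prompt="<messages sent to the grading model>",
        judge_response='{"thought_process": "...", "is_chatbot_style": true}',
    )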

eval_framework/metrics/llm/graders/coherence_grader.py
@@ -0,0 +1,115 @@
import re
from collections.abc import Mapping

from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.metrics.llm.graders.models import (
    GradingOutput,
    PromptTemplate,
)


def _extract_xml_content(text: str, tag: str) -> str:
    pattern = f"<{tag}>(.*?)</{tag}>"
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1].strip() if matches else text


class CoherenceGradingOutput(GradingOutput):
    coherence_score: int


class CoherenceGrader:
    """
    Coherence grader taken from AidanBench: https://github.com/aidanmclaughlin/AidanBench/blob/main/benchmark/prompts.py
    """

    INSTRUCTION_KEY = "instruction"
    COMPLETION_KEY = "completion"
    PROMPT_TEMPLATES = {
        Language("en"): PromptTemplate(
            system_prompt="",  # noqa: E501
            user_prompt=f"""Your task is to evaluate the coherence and plausibility of an answer to a given question.

Question: <question>{{{INSTRUCTION_KEY}}}</question>
Answer: <answer>{{{COMPLETION_KEY}}}</answer>

Based on the following criteria, provide a Coherence and Plausibility Score on a scale of 0 - 100:

0-20: INCOHERENT/NONSENSICAL
- Answer is completely unrelated to the question
- Contains logical impossibilities or contradictions
- Makes claims that defy basic reality
- Shows no understanding of the question's context

21-40: SEVERELY FLAWED
- Major logical gaps or inconsistencies
- Significant misunderstanding of core concepts
- Contains partially relevant information but mostly incorrect
- May include some true statements but fails to form a coherent response

41-60: PARTIALLY COHERENT
- Shows basic understanding of the question
- Contains some valid points mixed with errors
- Logic is followable but may have weak connections
- Answer is relevant but may miss key aspects

61-80: MOSTLY COHERENT
- Demonstrates clear understanding of the question
- Logic is sound with minor gaps or inconsistencies
- Most claims are plausible and well-supported
- Forms a generally complete and relevant response

81-100: HIGHLY COHERENT
- Perfectly addresses the question
- Demonstrates complete logical consistency
- All claims are plausible and well-grounded
- Forms a comprehensive and precise response

IMPORTANT: Provide your final Coherence and Plausibility Score as a single integer between 0 and 100, enclosed in <coherence_score></coherence_score> XML tags. For example:
<coherence_score>75</coherence_score>

Do not include any additional text in your response.""",  # noqa: E501
        ),
    }

    def __init__(
        self,
        grading_model: BaseLLM,
        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
    ) -> None:
        self._grading_model = grading_model

        if not all(
            self.INSTRUCTION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()
        ) or not all(
            self.COMPLETION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()
        ):
            raise ValueError(
                f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
                "and '{self.INSTRUCTION_KEY}'."
            )
        self._prompt_templates = prompt_templates

    def grade(self, instruction: str, completion: str, language: Language) -> CoherenceGradingOutput:
        try:
            prompt_template = language.language_config(self._prompt_templates)
        except Exception as _:
            prompt_template = Language("en").language_config(self._prompt_templates)

        messages = prompt_template.to_messages(
            [],
            [
                (self.INSTRUCTION_KEY, instruction),
                (self.COMPLETION_KEY, completion),
            ],
        )

        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        coherence_score = int(_extract_xml_content(raw_completion.completion, "coherence_score"))

        return CoherenceGradingOutput(
            coherence_score=coherence_score,
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
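
Because _extract_xml_content takes the last matching tag and otherwise falls back to the raw text, a verbose judge reply still parses as long as the tags appear somewhere. A small sketch (not part of the package):

    raw_response = "Some reasoning first...\n<coherence_score>75</coherence_score>"
    score = int(_extract_xml_content(raw_response, "coherence_score"))  # -> 75

    # If the judge ignores the tag format entirely, the fallback returns the whole
    # response text and int() raises ValueError.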

eval_framework/metrics/llm/graders/comparison_grader.py
@@ -0,0 +1,198 @@
import random
from collections.abc import Mapping
from enum import Enum

from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.metrics.llm.graders.models import (
    GradingOutput,
    PromptTemplateWithParseMap,
    parse_json_output,
)
from eval_framework.metrics.llm.utils import order_answers_for_comparison


class MatchOutcome(str, Enum):
    A_WINS = "a_wins"
    DRAW = "draw"
    B_WINS = "b_wins"

    @property
    def payoff(self) -> tuple[float, float]:
        if self == self.A_WINS:
            return (1, 0)
        if self == self.DRAW:
            return (0.5, 0.5)
        return (0, 1)

    def flip(self) -> "MatchOutcome":
        """Flip the outcome (A_WINS <-> B_WINS, DRAW stays DRAW)."""
        if self == self.A_WINS:
            return MatchOutcome.B_WINS
        if self == self.B_WINS:
            return MatchOutcome.A_WINS
        return self  # DRAW stays DRAW

    @staticmethod
    def from_rank_literal(rank: int) -> "MatchOutcome":
        match rank:
            case 1:
                return MatchOutcome.A_WINS
            case 2:
                return MatchOutcome.B_WINS
            case 3:
                return MatchOutcome.DRAW
            case _:
                raise ValueError(f"Got unexpected rank {rank}")


class ComparisonGradingOutput(GradingOutput):
    reasoning: str | None
    outcome: MatchOutcome | None


class ComparisonGrader:
    INSTRUCTION_KEY = "instruction"
    ANSWER_1_KEY = "answer_1"
    ANSWER_2_KEY = "answer_2"
    REASONING_KEY = "explanation"
    BETTER_ANSWER_KEY = "better_answer"
    PROMPT_TEMPLATES = {
        Language("de"): PromptTemplateWithParseMap(
            system_prompt=f"""Beachte die gegebene Aufgabe und dazugehörigen Antworten. Entscheide, welche Antwort besser ist, Antwort 1 oder Antwort 2. Gebe anschließend "Antwort 1 ist besser", "Antwort 2 ist besser" oder "Beide gleich" aus.

Eine gute Antwort ist:
1. Inhaltlich korrekt.
2. Beachtet die Anforderungen der Aufgabe präzise.
3. Ist im Rahmen der Aufgabenstellung kreativ und nicht repetetiv.
4. In der Sprache der Aufgabe verfasst.

Gebe die Antwort im folgenden json-Format:
{{
"{REASONING_KEY}": str (Beschreibe in wenigen Sätzen (max. 5) die Unterschiede der beiden Antworten und begründe, warum eine der beiden Antworten besser ist oder warum die Antworten ähnlich gut sind.),
"{BETTER_ANSWER_KEY}": Literal["Antwort 1 ist besser", "Antwort 2 ist besser", "Beide gleich"]
}}""",  # noqa: E501
            user_prompt=f"""Aufgabe:
{{{INSTRUCTION_KEY}}}
---
Antwort 1:
{{{ANSWER_1_KEY}}}
---
Antwort 2:
{{{ANSWER_2_KEY}}}""",
            parse_map={
                "Antwort 1 ist besser": MatchOutcome.A_WINS,
                "Antwort 2 ist besser": MatchOutcome.B_WINS,
                "Beide gleich": MatchOutcome.DRAW,
            },
        ),
        Language("en"): PromptTemplateWithParseMap(
            system_prompt=f"""Note the given task and the corresponding answers. Decide which answer is better, answer 1 or answer 2. Then output "Answer 1 is better", "Answer 2 is better" or "Both equal".

A good answer is:
1. correct in content.
2. follows the requirements of the task precisely.
3. is creative and not repetitive in the context of the task.
4. written in the same language as the task.

Enter the answer in the following json format:
{{
"{REASONING_KEY}": str (Describe in a few sentences (max. 5) the differences between the two answers and give reasons why one of the two answers is better or why the answers are similarly good),
"{BETTER_ANSWER_KEY}": Literal["Answer 1 is better", "Answer 2 is better", "Both equal"]
}}""",  # noqa: E501
            user_prompt=f"""Task:
{{{INSTRUCTION_KEY}}}
---
Answer 1:
{{{ANSWER_1_KEY}}}
---
Answer 2:
{{{ANSWER_2_KEY}}}""",
            parse_map={
                "Answer 1 is better": MatchOutcome.A_WINS,
                "Answer 2 is better": MatchOutcome.B_WINS,
                "Both equal": MatchOutcome.DRAW,
            },
        ),
    }

    def __init__(
        self,
        grading_model: StructuredOutputChatModel,
        prompt_templates: Mapping[Language, PromptTemplateWithParseMap] = PROMPT_TEMPLATES,
    ) -> None:
        self._grading_model = grading_model

        if not all(
            self.INSTRUCTION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()
        ) or not all(self.ANSWER_1_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()):
            raise ValueError(
                f"At least one PromptTemplate invalid, must contain '{self.ANSWER_1_KEY}' and '{self.INSTRUCTION_KEY}'."
            )
        self._prompt_templates = prompt_templates

    def grade(
        self,
        instruction: str,
        completion_1: str,
        completion_2: str,
        language: Language,
        randomize_order: bool = False,
        seed: int | None = None,
    ) -> ComparisonGradingOutput:
        """Grade two completions by comparing them.

        Args:
            instruction: The instruction/task that was given.
            completion_1: The first completion (typically the candidate).
            completion_2: The second completion (typically the reference).
            language: The language for the grading prompts.
            randomize_order: If True, randomly swap the order of completions to eliminate
                position bias.
            seed: Optional random seed for reproducibility. If None and randomize_order
                is True, uses a random swap decision.

        Returns:
            ComparisonGradingOutput with the outcome corrected for any position swap,
            so outcome always reflects completion_1 vs completion_2 regardless of
            presentation order to the judge.
        """
        prompt_template = language.language_config(self._prompt_templates)

        # Determine whether to swap the order
        if randomize_order:
            rng = random.Random(seed)
            swap_order = rng.choice([True, False])
        else:
            swap_order = False

        # Apply the swap if needed
        actual_answer_1, actual_answer_2 = order_answers_for_comparison(completion_1, completion_2, swap_order)

        messages = prompt_template.to_messages(
            [],
            [
                (self.INSTRUCTION_KEY, instruction),
                (self.ANSWER_1_KEY, actual_answer_1),
                (self.ANSWER_2_KEY, actual_answer_2),
            ],
        )

        raw_completion = self._grading_model.generate_from_messages([messages])[0]
        loaded_json = parse_json_output(raw_completion.completion)

        # Get the raw outcome from the judge
        raw_outcome: MatchOutcome | None = prompt_template.parse_map.get(
            str(loaded_json.get(self.BETTER_ANSWER_KEY, None)), None
        )

        # Correct the outcome if we swapped the order
        # If swapped: "Answer 1 is better" means completion_2 is better (B_WINS from completion_1's perspective)
        final_outcome = raw_outcome.flip() if swap_order and raw_outcome is not None else raw_outcome

        return ComparisonGradingOutput(
            reasoning=loaded_json.get(self.REASONING_KEY, None),
            outcome=final_outcome,
            judge_prompt=raw_completion.prompt,
            judge_response=raw_completion.completion,
        )
+ )