eval-framework 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +177 -0
- eval_framework/context/eval.py +121 -0
- eval_framework/context/local.py +78 -0
- eval_framework/evaluation_generator.py +234 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +432 -0
- eval_framework/llm/base.py +180 -0
- eval_framework/llm/huggingface.py +418 -0
- eval_framework/llm/mistral.py +88 -0
- eval_framework/llm/models.py +28 -0
- eval_framework/llm/openai.py +400 -0
- eval_framework/llm/vllm.py +554 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +166 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/aidanbench.py +28 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +179 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +34 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/llm/utils.py +20 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/base.py +50 -0
- eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
- eval_framework/metrics/loglikelihood/dcs.py +43 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
- eval_framework/metrics/loglikelihood/ternary.py +42 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +351 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +88 -0
- eval_framework/result_processors/hf_uploader.py +75 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/result_processors/wandb_uploader.py +137 -0
- eval_framework/run.py +369 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +392 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/aidanbench.py +211 -0
- eval_framework/tasks/benchmarks/arc.py +70 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +64 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +133 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +201 -0
- eval_framework/tasks/benchmarks/gsm8k.py +150 -0
- eval_framework/tasks/benchmarks/hellaswag.py +69 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +215 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +85 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +64 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +110 -0
- eval_framework/tasks/benchmarks/sphyr.py +79 -0
- eval_framework/tasks/benchmarks/squad.py +211 -0
- eval_framework/tasks/benchmarks/struct_eval.py +116 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
- eval_framework/tasks/benchmarks/winogender.py +64 -0
- eval_framework/tasks/benchmarks/winogrande.py +69 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +136 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +81 -0
- eval_framework/tasks/task_names.py +324 -0
- eval_framework/tasks/utils.py +584 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/file_ops.py +245 -0
- eval_framework/utils/generate_task_docs.py +244 -0
- eval_framework/utils/helpers.py +32 -0
- eval_framework/utils/logging.py +62 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework/utils/tqdm_handler.py +14 -0
- eval_framework-0.2.7.dist-info/METADATA +548 -0
- eval_framework-0.2.7.dist-info/RECORD +170 -0
- eval_framework-0.2.7.dist-info/WHEEL +4 -0
- eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +537 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
@@ -0,0 +1,182 @@
+import re
+
+import nltk
+
+from eval_framework.metrics.base import (
+    BaseMetric,
+    MetricResult,
+)
+from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
+
+ALPHABETS = "([A-Za-z])"
+PREFIXES = "(Mr|St|Mrs|Ms|Dr|www)[.]"
+SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
+STARTERS = (
+    r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+)
+ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+WEBSITES = "[.](com|net|org|io|gov|edu|me)"
+DIGITS = "([0-9])"
+MULTIPLE_DOTS = r"\.{2,}"
+
+
+class WordCounterMetricContext(BaseMetricContext):
+    comparison: str
+    word_count: int
+
+
+class WordCounter(BaseMetric[Completion]):
+    NAME = "Word Count"
+
+    @staticmethod
+    def _count_words(text: str) -> int:
+        tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
+        tokens = tokenizer.tokenize(text)
+        num_words = len(tokens)
+        return num_words
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+        context = extract_context_metric(response, WordCounterMetricContext)
+
+        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"
+
+        num_words = self._count_words(response.completion)
+        if context.comparison == "less than":
+            valid_word_count = num_words < context.word_count
+        if context.comparison == "at least":
+            valid_word_count = num_words >= context.word_count
+
+        return [
+            MetricResult(
+                metric_name=self.NAME, value=float(valid_word_count), higher_is_better=True, error=response.error
+            )
+        ]
+
+
+class SentenceCounterMetricContext(BaseMetricContext):
+    comparison: str
+    sentence_count: int
+
+
+class SentenceCounter(BaseMetric[Completion]):
+    NAME = "Sentence Count"
+
+    @staticmethod
+    def _count_sentences(text: str) -> int:
+        # Note that nltk.tokenize.sent_tokenize would be a straightforward alternative but is also not ideal. Example:
+        #
+        # "Mr. Jones gave me $10,000.00... And then he left. Numbers 5...10. Numbers 5..10. Review: bad food,
+        # bad service,..., so I'd miss it."
+        #
+        # this: ['Mr. Jones gave me $10,000.00...', 'And then he left.', 'Numbers 5...', '10.', 'Numbers 5..', '10.',
+        #        'Review: bad food, bad service,...', ", so I'd miss it."].
+        # nltk: ['Mr. Jones gave me $10,000.00... And then he left.', 'Numbers 5...10.',
+        #        "Numbers 5..10. Review: bad food, bad service,..., so I'd miss it."]
+
+        text = f" {text} "
+        text = text.replace("\n", " ")
+        text = re.sub(PREFIXES, "\\1<prd>", text)
+        text = re.sub(WEBSITES, "<prd>\\1", text)
+        text = re.sub(DIGITS + "[.]" + DIGITS, "\\1<prd>\\2", text)
+        text = re.sub(
+            MULTIPLE_DOTS,
+            lambda match: "<prd>" * len(match.group(0)) + "<stop>",
+            text,
+        )
+        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+        text = re.sub(r"\s" + ALPHABETS + "[.] ", " \\1<prd> ", text)
+        text = re.sub(ACRONYMS + " " + STARTERS, "\\1<stop> \\2", text)
+        text = re.sub(
+            ALPHABETS + "[.]" + ALPHABETS + "[.]" + ALPHABETS + "[.]",
+            "\\1<prd>\\2<prd>\\3<prd>",
+            text,
+        )
+        text = re.sub(ALPHABETS + "[.]" + ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
+        text = re.sub(" " + SUFFIXES + "[.] " + STARTERS, " \\1<stop> \\2", text)
+        text = re.sub(" " + SUFFIXES + "[.]", " \\1<prd>", text)
+        text = re.sub(" " + ALPHABETS + "[.]", " \\1<prd>", text)
+        text = text.replace(".”", "”.")
+        text = text.replace('."', '".')
+        text = text.replace('!"', '"!')
+        text = text.replace('?"', '"?')
+        text = text.replace(".", ".<stop>")
+        text = text.replace("?", "?<stop>")
+        text = text.replace("!", "!<stop>")
+        text = text.replace("<prd>", ".")
+        sentences = text.split("<stop>")
+        sentences = [s.strip() for s in sentences]
+        if sentences and not sentences[-1]:
+            sentences = sentences[:-1]
+        return len(sentences)
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+        context = extract_context_metric(response, SentenceCounterMetricContext)
+
+        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"
+
+        num_sentences = self._count_sentences(response.completion)
+        if context.comparison == "less than":
+            valid_sentence_count = num_sentences < context.sentence_count
+        elif context.comparison == "at least":
+            valid_sentence_count = num_sentences >= context.sentence_count
+
+        return [
+            MetricResult(
+                metric_name=self.NAME, value=float(valid_sentence_count), higher_is_better=True, error=response.error
+            )
+        ]
+
+
+class ParagraphCounterMetricContext(BaseMetricContext):
+    comparison: str
+    paragraph_count: int
+
+
+class ParagraphCounter(BaseMetric[Completion]):
+    NAME = "Paragraph Count"
+
+    @staticmethod
+    def _count_paragraphs(text: str) -> int:
+        paragraphs = re.split(r"\s?\n\n\s?", text)
+        return len(paragraphs)
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+        context = extract_context_metric(response, ParagraphCounterMetricContext)
+
+        assert context.comparison in ["less than", "at least"], f"'comparison' is not valid: {context.comparison}"
+
+        num_paragraphs = self._count_paragraphs(response.completion)
+        if context.comparison == "less than":
+            valid_paragraph_count = num_paragraphs < context.paragraph_count
+        elif context.comparison == "at least":
+            valid_paragraph_count = num_paragraphs >= context.paragraph_count
+
+        return [
+            MetricResult(
+                metric_name=self.NAME, value=float(valid_paragraph_count), higher_is_better=True, error=response.error
+            )
+        ]
+
+
+class ResponseToOriginalLengthRatio(BaseMetric[Completion]):
+    NAME = "Response to Original Length Ratio"
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error is not None:
+            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
+
+        len_original = len(response.last_user_instruction)
+        if len_original > 0:
+            score = len(response.completion) / len_original
+            return [MetricResult(metric_name=self.NAME, value=score, higher_is_better=False, error=response.error)]
+        else:
+            return []
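Judging by its 182-line count in the file list above and the classes it defines, this hunk appears to correspond to eval_framework/metrics/completion/text_counter.py. The snippet below is not part of the package; it re-implements only the word-count rule (the nltk \w+ tokenizer plus the "less than" / "at least" comparison) so the check can be tried outside the framework's Completion/MetricResult plumbing. The helper names are illustrative.

# Minimal standalone sketch of WordCounter's counting rule (illustrative
# helper names; not code from the package itself).
import nltk


def count_words(text: str) -> int:
    # Same tokenization as WordCounter._count_words: one token per \w+ run.
    return len(nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(text))


def word_count_ok(text: str, comparison: str, word_count: int) -> bool:
    assert comparison in ["less than", "at least"], f"'comparison' is not valid: {comparison}"
    num_words = count_words(text)
    return num_words < word_count if comparison == "less than" else num_words >= word_count


print(count_words("Hello, world! This is a test."))    # 6
print(word_count_ok("Hello, world!", "less than", 5))  # True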
File without changes
@@ -0,0 +1,48 @@
+from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.shared.types import Completion, Loglikelihood
+
+
+class BytesLoglikelihood(BaseMetric[Loglikelihood]):
+    NAME = "Bytes"
+
+    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
+        if response.error or response.concat_compression is None:
+            value = None
+        else:
+            value = response.concat_compression.num_bytes
+
+        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]
+
+
+class SequencePositionsLoglikelihood(BaseMetric[Loglikelihood]):
+    NAME = "SequencePositions"
+
+    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
+        if response.error or response.concat_compression is None:
+            value = None
+        else:
+            value = response.concat_compression.num_tokens
+        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]
+
+
+class BytesCompletion(BaseMetric[Completion]):
+    NAME = "Bytes"
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error or response.concat_compression is None:
+            value = None
+        else:
+            value = response.concat_compression.num_bytes
+
+        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]
+
+
+class SequencePositionsCompletion(BaseMetric[Completion]):
+    NAME = "SequencePositions"
+
+    def calculate(self, response: Completion) -> list[MetricResult]:
+        if response.error or response.concat_compression is None:
+            value = None
+        else:
+            value = response.concat_compression.num_tokens
+        return [MetricResult(metric_name=self.NAME, value=value, higher_is_better=True, error=response.error)]
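This 48-line hunk matches eval_framework/metrics/efficiency/bytes_per_sequence_position.py in the file list. The four metrics only report raw num_bytes and num_tokens from concat_compression (or None when that data is missing); how the two are combined into a bytes-per-position figure is not shown in this file. The snippet below is a hypothetical illustration of the ratio implied by the module name, not the package's own aggregation code.

# Hypothetical ratio implied by the module name; the metrics above emit raw
# counts, and any division is assumed to happen downstream.
def bytes_per_sequence_position(num_bytes: int | None, num_tokens: int | None) -> float | None:
    # Mirror the None-propagation used by the metrics when data is missing.
    if num_bytes is None or not num_tokens:
        return None
    return num_bytes / num_tokens


print(bytes_per_sequence_position(1200, 300))  # 4.0
print(bytes_per_sequence_position(None, 300))  # None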
File without changes
@@ -0,0 +1,34 @@
+import traceback
+
+from eval_framework.llm.base import BaseLLM
+from eval_framework.metrics.base import BaseMetric, MetricResult
+from eval_framework.shared.types import Completion, Error
+
+
+class BaseLLMJudgeMetric(BaseMetric[Completion]):
+    def __init__(self, llm_judge: BaseLLM, randomize_order: bool = False) -> None:
+        self._llm_judge = llm_judge
+        self._randomize_order = randomize_order
+
+    def _create_metric_result(
+        self,
+        metric_name: str,
+        higher_is_better: bool,
+        value: float | None,
+        llm_judge_prompt: str | None = None,
+        llm_judge_response: str | None = None,
+        code_execution_trace: str | None = None,
+        error: Error | None = None,
+    ) -> MetricResult:
+        """Helper method to create MetricResult with consistent structure."""
+        return MetricResult(
+            metric_name=metric_name,
+            value=value,
+            higher_is_better=higher_is_better,
+            llm_judge_prompt=llm_judge_prompt,
+            llm_judge_response=llm_judge_response,
+            code_execution_trace=code_execution_trace,
+            error=Error(error_class=error.__class__.__name__, message=str(error), traceback=traceback.format_exc())
+            if error
+            else None,
+        )
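This 34-line hunk matches eval_framework/metrics/llm/base.py, presumably the shared base class for the llm_judge_* metrics in the same directory. Its only non-trivial behavior is wrapping a raised exception into the Error structure attached to a MetricResult. Below is a standalone sketch of that wrapping pattern, with a dataclass stand-in for the package's Error type (the real one lives in eval_framework.shared.types).

# Standalone sketch of the error-wrapping done in _create_metric_result;
# Error here is a stand-in, not the package's model.
import traceback
from dataclasses import dataclass


@dataclass
class Error:
    error_class: str
    message: str
    traceback: str


def wrap_error(error: Exception | None) -> Error | None:
    if error is None:
        return None
    return Error(error_class=error.__class__.__name__, message=str(error), traceback=traceback.format_exc())


try:
    raise ValueError("judge returned malformed JSON")
except ValueError as exc:
    print(wrap_error(exc).error_class)  # ValueError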
@@ -0,0 +1,92 @@
+from collections.abc import Mapping
+
+from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
+from eval_framework.metrics.llm.graders.language import Language
+from eval_framework.metrics.llm.graders.models import GradingOutput, PromptTemplate, parse_json_output
+
+
+class ChatbotStyleGradingOutput(GradingOutput):
+    thought_process: str | None
+    is_chatbot_style: bool | None
+
+
+class ChatbotStyleGrader:
+    COMPLETION_KEY = "completion"
+    PROMPT_TEMPLATES = {
+        Language("de"): PromptTemplate(
+            system_prompt="""Deine Aufgabe ist es zu klassifizieren, ob eine von einem Textgenerator gelieferte Antwort dem Stile eines Chatbots entspricht.
+
+Hier sind einige Schlüsselmerkmale einer Antwort im Stile eines Chatbots:
+* Sie leitet den Hauptinhalt mit Phrasen wie "Natürlich, ich helfe Dir gerne", "Na klar!" oder "Selbstverständlich kann ich" ein.
+* Sie endet mit Phrasen wie "Ich hoffe, ich konnte Dir weiterhelfen!"
+* Sie stellt Nachfragen an den Benutzer.
+* Sie neigt dazu, überaus wortreich zu sein.
+* Sie enthält Gesprächs- und Unterhaltungsfloskeln.
+* Sie enthält Text, der zum Verständnis des Inhalts nicht zwingend notwendig ist.
+* Sie bewahrt einen überaus freundlichen Ton.
+
+Beachte, dass die Erfüllung von nur einem dieser Merkmale ausreicht, um die Antwort als Chatbot-Stil zu klassifizieren.
+
+Gebe deine Bewertung in folgendem JSON-Format:
+{
+"thought_process": str (Achte sehr genau auf die Antwort und argumentiere in ein paar Sätzen, ob die Antwort dem Chatbot-Stil folgt oder nicht),
+"is_chatbot_style": bool
+}""",  # noqa: E501
+            user_prompt=f"""**Antwort des Textgenerators**
+{{{COMPLETION_KEY}}}""",
+        ),
+        Language("en"): PromptTemplate(
+            system_prompt="""Your task is to classify if a text generation model's response follows a chatbot-style format.
+
+Here are some key characteristics of a chatbot-style response:
+* It introduces the main content with phrases like "Certainly, here is", "Sure!" or "Of course."
+* It ends with phrases such as "I hope this helps!"
+* It asks follow-up questions.
+* It tends to be verbose.
+* It tends to contain fluff that is not necessary to understand the content.
+* It maintains a friendly tone.
+
+Note that even one of these characteristics is enough to classify the response as following a chatbot-style format.
+
+You must provide your evaluation in the following JSON format:
+{
+"thought_process": str (Pay very close attention to the response and argue whether the response follows a chatbot-style or not in a few sentences),
+"is_chatbot_style": bool
+}""",  # noqa: E501
+            user_prompt=f"""**Model Response**:
+{{{COMPLETION_KEY}}}""",
+        ),
+    }
+
+    def __init__(
+        self,
+        grading_model: StructuredOutputChatModel,
+        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
+    ) -> None:
+        self._grading_model = grading_model
+
+        if not all(self.COMPLETION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()):
+            raise ValueError(f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}'.")
+        self._prompt_templates = prompt_templates
+
+    def grade(self, completion: str, language: Language) -> ChatbotStyleGradingOutput:
+        try:
+            prompt_template = language.language_config(self._prompt_templates)
+        except Exception as _:
+            prompt_template = Language("en").language_config(self._prompt_templates)
+
+        messages = prompt_template.to_messages(
+            [],
+            [
+                (self.COMPLETION_KEY, completion),
+            ],
+        )
+        raw_completion = self._grading_model.generate_from_messages([messages])[0]
+        loaded_json = parse_json_output(raw_completion.completion)
+
+        return ChatbotStyleGradingOutput(
+            thought_process=loaded_json.get("thought_process", None),
+            is_chatbot_style=loaded_json.get("is_chatbot_style", None),
+            judge_prompt=raw_completion.prompt,
+            judge_response=raw_completion.completion,
+        )
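This 92-line hunk matches eval_framework/metrics/llm/graders/chatbot_style_grader.py. The grader renders the language-specific template, sends it to the grading model, and reads two fields out of the judge's JSON reply. The snippet below only shows the shape of that reply and a plain json.loads parse; the reply text is invented for illustration, and the package's own parse_json_output (in graders/models.py, not included in this excerpt) presumably copes with less tidy judge output.

# Shape of the JSON the grader expects back from the judge (illustrative
# reply; parsed with plain json rather than parse_json_output).
import json

raw_judge_reply = """{
  "thought_process": "The reply opens with 'Sure!' and closes with 'I hope this helps!', matching two of the listed characteristics.",
  "is_chatbot_style": true
}"""

loaded_json = json.loads(raw_judge_reply)
print(loaded_json.get("thought_process"))
print(loaded_json.get("is_chatbot_style"))  # True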
@@ -0,0 +1,115 @@
+import re
+from collections.abc import Mapping
+
+from eval_framework.llm.base import BaseLLM
+from eval_framework.metrics.llm.graders.language import Language
+from eval_framework.metrics.llm.graders.models import (
+    GradingOutput,
+    PromptTemplate,
+)
+
+
+def _extract_xml_content(text: str, tag: str) -> str:
+    pattern = f"<{tag}>(.*?)</{tag}>"
+    matches = re.findall(pattern, text, re.DOTALL)
+    return matches[-1].strip() if matches else text
+
+
+class CoherenceGradingOutput(GradingOutput):
+    coherence_score: int
+
+
+class CoherenceGrader:
+    """
+    Coherence grader taken from AidanBench: https://github.com/aidanmclaughlin/AidanBench/blob/main/benchmark/prompts.py
+    """
+
+    INSTRUCTION_KEY = "instruction"
+    COMPLETION_KEY = "completion"
+    PROMPT_TEMPLATES = {
+        Language("en"): PromptTemplate(
+            system_prompt="",  # noqa: E501
+            user_prompt=f"""Your task is to evaluate the coherence and plausibility of an answer to a given question.
+
+Question: <question>{{{INSTRUCTION_KEY}}}</question>
+Answer: <answer>{{{COMPLETION_KEY}}}</answer>
+
+Based on the following criteria, provide a Coherence and Plausibility Score on a scale of 0 - 100:
+
+0-20: INCOHERENT/NONSENSICAL
+- Answer is completely unrelated to the question
+- Contains logical impossibilities or contradictions
+- Makes claims that defy basic reality
+- Shows no understanding of the question's context
+
+21-40: SEVERELY FLAWED
+- Major logical gaps or inconsistencies
+- Significant misunderstanding of core concepts
+- Contains partially relevant information but mostly incorrect
+- May include some true statements but fails to form a coherent response
+
+41-60: PARTIALLY COHERENT
+- Shows basic understanding of the question
+- Contains some valid points mixed with errors
+- Logic is followable but may have weak connections
+- Answer is relevant but may miss key aspects
+
+61-80: MOSTLY COHERENT
+- Demonstrates clear understanding of the question
+- Logic is sound with minor gaps or inconsistencies
+- Most claims are plausible and well-supported
+- Forms a generally complete and relevant response
+
+81-100: HIGHLY COHERENT
+- Perfectly addresses the question
+- Demonstrates complete logical consistency
+- All claims are plausible and well-grounded
+- Forms a comprehensive and precise response
+
+IMPORTANT: Provide your final Coherence and Plausibility Score as a single integer between 0 and 100, enclosed in <coherence_score></coherence_score> XML tags. For example:
+<coherence_score>75</coherence_score>
+
+Do not include any additional text in your response.""",  # noqa: E501
+        ),
+    }
+
+    def __init__(
+        self,
+        grading_model: BaseLLM,
+        prompt_templates: Mapping[Language, PromptTemplate] = PROMPT_TEMPLATES,
+    ) -> None:
+        self._grading_model = grading_model
+
+        if not all(
+            self.INSTRUCTION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()
+        ) or not all(
+            self.COMPLETION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()
+        ):
+            raise ValueError(
+                f"At least one PromptTemplate is invalid, must contain '{self.COMPLETION_KEY}' "
+                "and '{self.INSTRUCTION_KEY}'."
+            )
+        self._prompt_templates = prompt_templates
+
+    def grade(self, instruction: str, completion: str, language: Language) -> CoherenceGradingOutput:
+        try:
+            prompt_template = language.language_config(self._prompt_templates)
+        except Exception as _:
+            prompt_template = Language("en").language_config(self._prompt_templates)
+
+        messages = prompt_template.to_messages(
+            [],
+            [
+                (self.INSTRUCTION_KEY, instruction),
+                (self.COMPLETION_KEY, completion),
+            ],
+        )
+
+        raw_completion = self._grading_model.generate_from_messages([messages])[0]
+        coherence_score = int(_extract_xml_content(raw_completion.completion, "coherence_score"))
+
+        return CoherenceGradingOutput(
+            coherence_score=coherence_score,
+            judge_prompt=raw_completion.prompt,
+            judge_response=raw_completion.completion,
+        )
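This 115-line hunk matches eval_framework/metrics/llm/graders/coherence_grader.py; as its docstring notes, the rubric is taken from AidanBench. The score is recovered from the judge's reply by taking the last <coherence_score>…</coherence_score> block. The snippet below is a standalone check of that extraction logic, mirroring _extract_xml_content rather than importing the package module.

# Re-implementation of the tag extraction for a quick local check (not the
# package's _extract_xml_content itself).
import re


def extract_xml_content(text: str, tag: str) -> str:
    # Last match wins, falling back to the whole text, as in _extract_xml_content.
    matches = re.findall(f"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return matches[-1].strip() if matches else text


judge_reply = "Reasoning omitted.\n<coherence_score>75</coherence_score>"
print(int(extract_xml_content(judge_reply, "coherence_score")))  # 75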
@@ -0,0 +1,198 @@
+import random
+from collections.abc import Mapping
+from enum import Enum
+
+from eval_framework.llm.base import BaseLLM as StructuredOutputChatModel
+from eval_framework.metrics.llm.graders.language import Language
+from eval_framework.metrics.llm.graders.models import (
+    GradingOutput,
+    PromptTemplateWithParseMap,
+    parse_json_output,
+)
+from eval_framework.metrics.llm.utils import order_answers_for_comparison
+
+
+class MatchOutcome(str, Enum):
+    A_WINS = "a_wins"
+    DRAW = "draw"
+    B_WINS = "b_wins"
+
+    @property
+    def payoff(self) -> tuple[float, float]:
+        if self == self.A_WINS:
+            return (1, 0)
+        if self == self.DRAW:
+            return (0.5, 0.5)
+        return (0, 1)
+
+    def flip(self) -> "MatchOutcome":
+        """Flip the outcome (A_WINS <-> B_WINS, DRAW stays DRAW)."""
+        if self == self.A_WINS:
+            return MatchOutcome.B_WINS
+        if self == self.B_WINS:
+            return MatchOutcome.A_WINS
+        return self  # DRAW stays DRAW
+
+    @staticmethod
+    def from_rank_literal(rank: int) -> "MatchOutcome":
+        match rank:
+            case 1:
+                return MatchOutcome.A_WINS
+            case 2:
+                return MatchOutcome.B_WINS
+            case 3:
+                return MatchOutcome.DRAW
+            case _:
+                raise ValueError(f"Got unexpected rank {rank}")
+
+
+class ComparisonGradingOutput(GradingOutput):
+    reasoning: str | None
+    outcome: MatchOutcome | None
+
+
+class ComparisonGrader:
+    INSTRUCTION_KEY = "instruction"
+    ANSWER_1_KEY = "answer_1"
+    ANSWER_2_KEY = "answer_2"
+    REASONING_KEY = "explanation"
+    BETTER_ANSWER_KEY = "better_answer"
+    PROMPT_TEMPLATES = {
+        Language("de"): PromptTemplateWithParseMap(
+            system_prompt=f"""Beachte die gegebene Aufgabe und dazugehörigen Antworten. Entscheide, welche Antwort besser ist, Antwort 1 oder Antwort 2. Gebe anschließend "Antwort 1 ist besser", "Antwort 2 ist besser" oder "Beide gleich" aus.
+
+Eine gute Antwort ist:
+1. Inhaltlich korrekt.
+2. Beachtet die Anforderungen der Aufgabe präzise.
+3. Ist im Rahmen der Aufgabenstellung kreativ und nicht repetetiv.
+4. In der Sprache der Aufgabe verfasst.
+
+Gebe die Antwort im folgenden json-Format:
+{{
+"{REASONING_KEY}": str (Beschreibe in wenigen Sätzen (max. 5) die Unterschiede der beiden Antworten und begründe, warum eine der beiden Antworten besser ist oder warum die Antworten ähnlich gut sind.),
+"{BETTER_ANSWER_KEY}": Literal["Antwort 1 ist besser", "Antwort 2 ist besser", "Beide gleich"]
+}}""",  # noqa: E501
+            user_prompt=f"""Aufgabe:
+{{{INSTRUCTION_KEY}}}
+---
+Antwort 1:
+{{{ANSWER_1_KEY}}}
+---
+Antwort 2:
+{{{ANSWER_2_KEY}}}""",
+            parse_map={
+                "Antwort 1 ist besser": MatchOutcome.A_WINS,
+                "Antwort 2 ist besser": MatchOutcome.B_WINS,
+                "Beide gleich": MatchOutcome.DRAW,
+            },
+        ),
+        Language("en"): PromptTemplateWithParseMap(
+            system_prompt=f"""Note the given task and the corresponding answers. Decide which answer is better, answer 1 or answer 2. Then output "Answer 1 is better", "Answer 2 is better" or "Both equal".
+
+A good answer is:
+1. correct in content.
+2. follows the requirements of the task precisely.
+3. is creative and not repetitive in the context of the task.
+4. written in the same language as the task.
+
+Enter the answer in the following json format:
+{{
+"{REASONING_KEY}": str (Describe in a few sentences (max. 5) the differences between the two answers and give reasons why one of the two answers is better or why the answers are similarly good),
+"{BETTER_ANSWER_KEY}": Literal["Answer 1 is better", "Answer 2 is better", "Both equal"]
+}}""",  # noqa: E501
+            user_prompt=f"""Task:
+{{{INSTRUCTION_KEY}}}
+---
+Answer 1:
+{{{ANSWER_1_KEY}}}
+---
+Answer 2:
+{{{ANSWER_2_KEY}}}""",
+            parse_map={
+                "Answer 1 is better": MatchOutcome.A_WINS,
+                "Answer 2 is better": MatchOutcome.B_WINS,
+                "Both equal": MatchOutcome.DRAW,
+            },
+        ),
+    }
+
+    def __init__(
+        self,
+        grading_model: StructuredOutputChatModel,
+        prompt_templates: Mapping[Language, PromptTemplateWithParseMap] = PROMPT_TEMPLATES,
+    ) -> None:
+        self._grading_model = grading_model
+
+        if not all(
+            self.INSTRUCTION_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()
+        ) or not all(self.ANSWER_1_KEY in prompt_template.user_prompt for prompt_template in prompt_templates.values()):
+            raise ValueError(
+                f"At least one PromptTemplate invalid, must contain '{self.ANSWER_1_KEY}' and '{self.INSTRUCTION_KEY}'."
+            )
+        self._prompt_templates = prompt_templates
+
+    def grade(
+        self,
+        instruction: str,
+        completion_1: str,
+        completion_2: str,
+        language: Language,
+        randomize_order: bool = False,
+        seed: int | None = None,
+    ) -> ComparisonGradingOutput:
+        """Grade two completions by comparing them.
+
+        Args:
+            instruction: The instruction/task that was given.
+            completion_1: The first completion (typically the candidate).
+            completion_2: The second completion (typically the reference).
+            language: The language for the grading prompts.
+            randomize_order: If True, randomly swap the order of completions to eliminate
+                position bias.
+            seed: Optional random seed for reproducibility. If None and randomize_order
+                is True, uses a random swap decision.
+
+        Returns:
+            ComparisonGradingOutput with the outcome corrected for any position swap,
+            so outcome always reflects completion_1 vs completion_2 regardless of
+            presentation order to the judge.
+        """
+        prompt_template = language.language_config(self._prompt_templates)
+
+        # Determine whether to swap the order
+        if randomize_order:
+            rng = random.Random(seed)
+            swap_order = rng.choice([True, False])
+        else:
+            swap_order = False
+
+        # Apply the swap if needed
+        actual_answer_1, actual_answer_2 = order_answers_for_comparison(completion_1, completion_2, swap_order)
+
+        messages = prompt_template.to_messages(
+            [],
+            [
+                (self.INSTRUCTION_KEY, instruction),
+                (self.ANSWER_1_KEY, actual_answer_1),
+                (self.ANSWER_2_KEY, actual_answer_2),
+            ],
+        )
+
+        raw_completion = self._grading_model.generate_from_messages([messages])[0]
+        loaded_json = parse_json_output(raw_completion.completion)
+
+        # Get the raw outcome from the judge
+        raw_outcome: MatchOutcome | None = prompt_template.parse_map.get(
+            str(loaded_json.get(self.BETTER_ANSWER_KEY, None)), None
+        )
+
+        # Correct the outcome if we swapped the order
+        # If swapped: "Answer 1 is better" means completion_2 is better (B_WINS from completion_1's perspective)
+        final_outcome = raw_outcome.flip() if swap_order and raw_outcome is not None else raw_outcome
+
+        return ComparisonGradingOutput(
+            reasoning=loaded_json.get(self.REASONING_KEY, None),
+            outcome=final_outcome,
+            judge_prompt=raw_completion.prompt,
+            judge_response=raw_completion.completion,
+        )
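This 198-line hunk matches eval_framework/metrics/llm/graders/comparison_grader.py. The part worth calling out is the position-bias handling in grade(): when randomize_order is set, the two completions may be swapped before being shown to the judge, and the judged outcome is flipped back afterwards so it always refers to completion_1 vs completion_2. Below is a minimal standalone sketch of that swap-and-flip correction, with MatchOutcome reduced to the pieces needed here.

# Standalone sketch of the swap-and-flip correction; not the package module.
import random
from enum import Enum


class MatchOutcome(str, Enum):
    A_WINS = "a_wins"
    DRAW = "draw"
    B_WINS = "b_wins"

    def flip(self) -> "MatchOutcome":
        if self == MatchOutcome.A_WINS:
            return MatchOutcome.B_WINS
        if self == MatchOutcome.B_WINS:
            return MatchOutcome.A_WINS
        return self  # DRAW stays DRAW


swap_order = random.Random(0).choice([True, False])  # what the grader decides for seed=0
judge_outcome = MatchOutcome.A_WINS                  # judge said "Answer 1 is better"
final_outcome = judge_outcome.flip() if swap_order else judge_outcome
print(swap_order, final_outcome)                     # flipped back iff the answers were swapped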