PyPI - eval-framework - Versions diffs - 0.5.0__tar.gz → 0.5.1__tar.gz - Mend

eval-framework 0.5.0.tar.gz → 0.5.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (193) hide show

{eval_framework-0.5.0 → eval_framework-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.5.0
+Version: 0.5.1
 Summary: Evaluation Framework
 Author: Aleph Alpha Research
 License:                                  Apache License
@@ -235,7 +235,7 @@ Requires-Dist: wandb>=0.27.2,<1
 Requires-Dist: boto3>=1.43.19,<2
 Requires-Dist: numpy>=2.2.6
 Requires-Dist: antlr4-python3-runtime==4.11.0
-Requires-Dist: scipy>=1.17.1,<2
+Requires-Dist: scipy>=1.18.0,<2
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'

{eval_framework-0.5.0 → eval_framework-0.5.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.5.0"
+version = "0.5.1"
 description = "Evaluation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -44,7 +44,7 @@ dependencies = [
   # is a dependency of sympy, but not explicitly listed in the requirements.txt
   # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
   "antlr4-python3-runtime==4.11.0",
-  "scipy>=1.17.1,<2",  # required for the aggregation of pass@k metrics
+  "scipy>=1.18.0,<2",  # required for the aggregation of pass@k metrics
 ]
 [project.optional-dependencies]
@@ -104,12 +104,12 @@ dev = [
   "pip-licenses>=5.5.5",
 ]
 flash-attn = [
-  "flash-attn>=2.8.3,<2.9",
+  "flash-attn>=2.8.3.post1,<2.9",
   "torch"
 ]
 [build-system]
-requires = ["uv_build>=0.11.22,<0.11.23"]
+requires = ["uv_build>=0.11.23,<0.11.24"]
 build-backend = "uv_build"
 [tool.uv.build-backend]

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/evaluation_generator.py RENAMED Viewed

@@ -18,7 +18,7 @@ from eval_framework.result_processors.base import Result, ResultProcessor
 from eval_framework.shared.types import Completion, Loglikelihood
 from eval_framework.tasks.base import ResponseType
 from eval_framework.tasks.eval_config import EvalConfig
-from eval_framework.tasks.registry import get_task
+from eval_framework.tasks.registry import registry
 from eval_framework.utils.constants import RED, RESET
 from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
@@ -36,13 +36,9 @@ class EvaluationGenerator:
         self.result_processor = result_processor
         self.save_intermediate_results = config.save_intermediate_results
-        task_class = get_task(config.task_name)
-        if hasattr(task_class, "TASK_STYLER"):
-            response_type = task_class.TASK_STYLER.response_type
-            task_metrics = list(task_class.TASK_STYLER.metrics)
-        else:
-            response_type = task_class.RESPONSE_TYPE
-            task_metrics = task_class.METRICS
+        eval_ = registry()[config.task_name]
+        response_type = eval_.response_type()
+        task_metrics = eval_.metrics()
         if response_type == ResponseType.COMPLETION:
             self.metrics = task_metrics + [BytesCompletion, SequencePositionsCompletion]
@@ -51,7 +47,7 @@ class EvaluationGenerator:
         else:
             raise NotImplementedError
-        self.task_name = task_class.NAME
+        self.task_name = eval_.task_class().NAME
     def _run_metric_calculators(self, responses: list[Completion | Loglikelihood]) -> list[Result]:
         results: list[Result] = self.result_processor.load_metrics_results()

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/response_generator.py RENAMED Viewed

@@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable
 from datetime import UTC, datetime
 from functools import partial
-from eval_framework.tasks.registry import get_task
+from eval_framework.tasks.registry import registry
 try:
     from determined._info import get_cluster_info
@@ -28,7 +28,6 @@ from eval_framework.shared.types import (
 )
 from eval_framework.tasks.base import Language, ResponseType, Sample
 from eval_framework.tasks.eval_config import EvalConfig
-from eval_framework.tasks.perturbation import create_perturbation_class
 from eval_framework.tasks.utils import raise_errors
 from eval_framework.utils.constants import RED, RESET
 from eval_framework.utils.tqdm_handler import get_disable_bar_flag, safe_tqdm_write
@@ -54,7 +53,6 @@ def map_language_to_value(
 class ResponseGenerator:
     def __init__(self, llm: BaseLLM, config: EvalConfig, result_processor: ResultsFileProcessor) -> None:
-        self.few_shot = config.num_fewshot
         self.task_name = config.task_name
         self.llm = llm
         self.config = config
@@ -62,20 +60,16 @@ class ResponseGenerator:
         self.num_samples = config.num_samples
         self.save_intermediate_results = config.save_intermediate_results
-        task_class = get_task(config.task_name)
         if config.perturbation_config is not None:
-            perturbation_task_class = create_perturbation_class(task_class, config.perturbation_config)
-            self.task = perturbation_task_class.with_overwrite(
-                self.few_shot,
-                custom_subjects=self.config.task_subjects,
-                custom_hf_revision=self.config.hf_revision,
+            self.task = registry()[config.task_name].create_perturbation(
+                config.perturbation_config,
+                config.num_fewshot,
+                config.task_subjects,
+                config.hf_revision,
             )
         else:
-            self.task = task_class.with_overwrite(
-                self.few_shot,
-                custom_subjects=self.config.task_subjects,
-                custom_hf_revision=self.config.hf_revision,
+            self.task = registry()[config.task_name].create(
+                config.num_fewshot, config.task_subjects, config.hf_revision
             )
         self.response_type = self.task.get_response_type()

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/gsm8k.py RENAMED Viewed

@@ -4,6 +4,7 @@ from typing import Any
 from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion, AccuracyCompletionOLMES
 from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.task_style import BPBStyle
 logger = logging.getLogger(__name__)
@@ -215,3 +216,32 @@ class GSM8K_OLMES(GSM8K):
     def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
         return self._clean_short_answer(completion_text)
+class GSM8KBPB(GSM8K_OLMES):
+    NAME = "GSM8KBPB"
+    TASK_STYLER = BPBStyle(cue_text="Answer:", leading_space_continuations=False)
+    # BPBStyle already adds "Answer:" as a separate assistant message. But the methods we inherit
+    # still put "Answer:" at the end of the question text and leave it out of the fewshot answer.
+    # So we override them here: remove "Answer:" from the question, and add it back in front of the
+    # fewshot answer. Without this, the question ends in "Answer:Answer:" and fewshot answers have
+    # no "Answer:" label at all.
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"Question: {item['question']}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        return f"Answer:{self.normalize_answer_str(item)}"
+    def _get_raw_question(self, item: dict[str, Any]) -> str:
+        return item["question"]
+    def _get_choices(self, item: dict[str, Any]) -> list[str]:
+        return [self.normalize_answer_str(item)]
+    def _get_correct_index(self, item: dict[str, Any]) -> int:
+        return 0
+    def _get_ground_truth(self, item: dict[str, Any]) -> str:
+        return self._get_choices(item)[0]

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/math_reasoning.py RENAMED Viewed

@@ -14,8 +14,8 @@ from eval_framework.metrics.completion.minerva_math_utils import (
     extract_answers,
     normalized_gold_from_solution,
 )
-from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
 from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
+from eval_framework.tasks.task_style import BPBStyle
 # Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB)
 MATH_SUBJECTS = [
@@ -612,44 +612,6 @@ class MATH500Minerva(MATHMinerva):
         super().__init__(num_fewshot)
-class MATHMinervaBPB(MATHReasoning):
-    """
-    MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the
-    gold answer string (bits-per-byte).
-    Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
-    """
-    NAME = "MATHMinervaBPB"
-    DATASET_PATH = "EleutherAI/hendrycks_math"
-    SAMPLE_SPLIT = "test"
-    FEWSHOT_SPLIT = "train"
-    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
-    METRICS = [BitsPerByteLoglikelihood]
-    SUBJECTS = MATH_SUBJECTS
-    LANGUAGE = Language.ENG
-    def _get_instruction_text(self, item: dict[str, Any]) -> str:
-        return "Problem:\n" + item["problem"] + "\n\n" + "Solution:"
-    def _get_cue_text(self, item: dict[str, Any]) -> str:
-        return ""
-    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
-        normalized = self._normalized_gold_from_solution(item["solution"])
-        if normalized is None:
-            return None
-        return " " + normalized
-    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
-        normalized = self._normalized_gold_from_solution(item["solution"])
-        if normalized is None:
-            return None
-        return [" " + normalized]
-    def _normalized_gold_from_solution(self, solution: str) -> str | None:
-        return normalized_gold_from_solution(solution)
 class MATHLvl5(MATH):
     NAME = "Math Lvl 5"
@@ -742,7 +704,7 @@ Answer:"""
 _OLMES_FEWSHOTS = [
-    ## https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
+    # https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
     {
         "problem": "Find the domain of the expression  $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
         "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
@@ -790,3 +752,35 @@ class MATHMinerva_OLMES(MATHMinerva):
     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
         return _OLMES_FEWSHOTS[: self.num_fewshot]
+class MATHMinervaBPB(MATHMinerva_OLMES):
+    NAME = "MATHMinervaBPB"
+    TASK_STYLER = BPBStyle(cue_text="Solution:")
+    # BPBStyle already adds "Solution:" as a separate assistant message. But the methods we inherit
+    # still put "Solution:" at the end of the question text and leave it out of the fewshot answer.
+    # So we override them here: remove "Solution:" from the question, and add it back in front of the
+    # fewshot answer. Without this, the question ends in "Solution:Solution:" and fewshot answers have
+    # no "Solution:" label at all.
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return "Problem:\n" + item["problem"] + "\n\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        return f"Solution: {item['solution']}"
+    def _get_choices(self, item: dict[str, Any]) -> list[str]:
+        answer = normalized_gold_from_solution(item["solution"])
+        template = f"\nFinal Answer: The final answer is {answer}. I hope it is correct."
+        return [item["solution"] + template]
+    def _get_correct_index(self, item: dict[str, Any]) -> int:
+        return 0
+    def _get_raw_question(self, item: dict[str, Any]) -> str:
+        return item["problem"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        return self._get_choices(item)[0]

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/squad.py RENAMED Viewed

@@ -244,6 +244,11 @@ class SQuAD2_MA(SQUAD2):
     METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = []
+        self.max_tokens = None
     def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
         return (
             "You are a helpful assistant and will answer the user's questions carefully, "

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/benchmarks/triviaqa.py RENAMED Viewed

@@ -52,6 +52,11 @@ class TriviaQA_MA(TRIVIAQA):
     METRICS = [AccuracyCompletion, F1, F1SquadNormalized]
     PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = []
+        self.max_tokens = None
     def _get_context_text(self, item: dict[str, Any]) -> str:
         return "\n\n".join(item["entity_pages"]["wiki_context"])

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/eval_config.py RENAMED Viewed

@@ -10,7 +10,7 @@ from eval_framework.llm.base import BaseLLM
 from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
 from eval_framework.tasks.base import BaseTask
 from eval_framework.tasks.perturbation import PerturbationConfig
-from eval_framework.tasks.registry import get_task, validate_task_name
+from eval_framework.tasks.registry import get_task, registry, validate_task_name
 from eval_framework.utils.constants import ROOT_DIR
 # Keys that don't impact actual evaluation results and should be excluded from config dumps for hashing purposes.
@@ -115,8 +115,7 @@ class EvalConfig(BaseConfig):
     @model_validator(mode="after")
     def validate_llm_judge_defined(self) -> "EvalConfig":
-        task = get_task(self.task_name)
-        task_metrics = task(num_fewshot=0).get_metrics()
+        task_metrics = registry()[self.task_name].metrics()
         for metric_class in task_metrics:
             if issubclass(metric_class, BaseLLMJudgeMetric):
                 assert self.llm_judge_class is not None, "The LLM Judge must be defined for this evaluation task."

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/registry.py RENAMED Viewed

@@ -3,15 +3,19 @@ import importlib
 import re
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterator, Sequence
-from typing import Any
+from typing import TYPE_CHECKING, Any
-from eval_framework.tasks.base import BaseTask
+from eval_framework.tasks.base import BaseTask, ResponseType
+from eval_framework.tasks.perturbation import PerturbationConfig, create_perturbation_class
 from eval_framework.utils.packaging import is_extra_installed, validate_package_extras
+if TYPE_CHECKING:
+    from eval_framework.metrics.base import BaseMetric
 __all__ = [
     "register_task",
     "register_lazy_task",
-    "BenchmarkFactory",
+    "EvalFactory",
     "Registry",
     "with_registry",
     "get_task",
@@ -22,13 +26,13 @@ __all__ = [
 ]
-class BenchmarkFactory(ABC):
-    """Produces a registered benchmark's task.
+class EvalFactory(ABC):
+    """Produces a registered benchmark's eval.
-    The registry stores one factory per benchmark. This allows the factory to be
-    constructed without constructing all benchmarks. Going via this ABC allows
+    The registry stores one factory per eval. This allows the factory to be
+    constructed without constructing all evals. Going via this ABC allows
     the factory instances to contain state specifically relevant to the
-    benchmark, as well as supporting different strategies for instantiating it.
+    eval, as well as supporting different strategies for instantiating it.
     E.g. eager vs lazy loading of the required dependencies.
     """
@@ -41,11 +45,33 @@ class BenchmarkFactory(ABC):
     def source_module(self) -> str:
         """Module the task class is defined in, resolvable without importing it."""
+    @abstractmethod
+    def response_type(self) -> ResponseType:
+        """The eval's response type"""
+    @abstractmethod
+    def metrics(self) -> list[type["BaseMetric"]]:
+        """The eval's metrics"""
+    @abstractmethod
+    def create(
+        self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None
+    ) -> BaseTask: ...
+    @abstractmethod
+    def create_perturbation(
+        self,
+        perturbation_config: PerturbationConfig,
+        num_fewshot: int,
+        custom_subjects: list[str] | None,
+        custom_hf_revision: str | None,
+    ) -> BaseTask: ...
-class _Lazy(BenchmarkFactory):
+class _Lazy(EvalFactory):
     """
-    Create benchmark from qualified class path; Delays importing modules until
-    benchmark is constructed.
+    Create eval from qualified class path; Delays importing modules until
+    eval is constructed.
     """
     def __init__(self, class_name: str, module: str, extras: Sequence[str] = ()) -> None:
@@ -73,8 +99,35 @@ class _Lazy(BenchmarkFactory):
             self._loaded = getattr(module, self._class_name)
         return self._loaded
+    def create(self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None) -> BaseTask:
+        return self.task_class().with_overwrite(
+            num_fewshot=num_fewshot, custom_subjects=custom_subjects, custom_hf_revision=custom_hf_revision
+        )
+    def create_perturbation(
+        self,
+        perturbation_config: PerturbationConfig,
+        num_fewshot: int,
+        custom_subjects: list[str] | None,
+        custom_hf_revision: str | None,
+    ) -> BaseTask:
+        perturbation_task_class = create_perturbation_class(self.task_class(), perturbation_config)
+        return perturbation_task_class.with_overwrite(
+            num_fewshot=num_fewshot,
+            custom_subjects=custom_subjects,
+            custom_hf_revision=custom_hf_revision,
+        )
-class _Eager(BenchmarkFactory):
+    def response_type(self) -> ResponseType:
+        """The eval's response type"""
+        return self.task_class().get_response_type()
+    def metrics(self) -> list[type["BaseMetric"]]:
+        """The eval's metrics"""
+        return self.task_class().get_metrics()
+class _Eager(EvalFactory):
     """Wraps an already-imported task class."""
     def __init__(self, task: type[BaseTask]) -> None:
@@ -87,6 +140,33 @@ class _Eager(BenchmarkFactory):
     def task_class(self) -> type[BaseTask]:
         return self._task
+    def create(self, num_fewshot: int, custom_subjects: list[str] | None, custom_hf_revision: str | None) -> BaseTask:
+        return self.task_class().with_overwrite(
+            num_fewshot=num_fewshot, custom_subjects=custom_subjects, custom_hf_revision=custom_hf_revision
+        )
+    def create_perturbation(
+        self,
+        perturbation_config: PerturbationConfig,
+        num_fewshot: int,
+        custom_subjects: list[str] | None,
+        custom_hf_revision: str | None,
+    ) -> BaseTask:
+        perturbation_task_class = create_perturbation_class(self.task_class(), perturbation_config)
+        return perturbation_task_class.with_overwrite(
+            num_fewshot=num_fewshot,
+            custom_subjects=custom_subjects,
+            custom_hf_revision=custom_hf_revision,
+        )
+    def response_type(self) -> ResponseType:
+        """The eval's response type"""
+        return self.task_class().get_response_type()
+    def metrics(self) -> list[type["BaseMetric"]]:
+        """The eval's metrics"""
+        return self.task_class().get_metrics()
 class Registry:
     """A registry for tasks with support for lazy loading.
@@ -97,7 +177,7 @@ class Registry:
     def __init__(self) -> None:
         # TODO: Lookup only with upper names
-        self._registry: dict[str, tuple[str, BenchmarkFactory]] = dict()
+        self._registry: dict[str, tuple[str, EvalFactory]] = dict()
     def __iter__(self) -> Iterator[str]:
         for name, _ in self._registry.values():
@@ -116,20 +196,20 @@ class Registry:
         task_key = self._task_key(name)
         return task_key in self._registry
-    def __getitem__(self, name: str, /) -> type[BaseTask]:
+    def __getitem__(self, name: str, /) -> EvalFactory:
         task_key = self._task_key(name)
         try:
             _, factory = self._registry[task_key]
         except KeyError:
-            raise KeyError(f"Task not found: {name}")
+            raise KeyError(f"Task not found: {name=} with task_key {task_key=}")
-        return factory.task_class()
+        return factory
     def add(self, task: type[BaseTask]) -> None:
         task_key = self._task_key(task.NAME)
         self._registry[task_key] = (task.NAME, _Eager(task))
-    def __setitem__(self, name: str, factory: BenchmarkFactory) -> None:
+    def __setitem__(self, name: str, factory: EvalFactory) -> None:
         task_key = self._task_key(name)
         if task_key in self._registry:
             raise ValueError(f"Cannot register duplicate task with key: {task_key}")
@@ -140,6 +220,10 @@ class Registry:
 _REGISTRY = Registry()
+def registry() -> Registry:
+    return _REGISTRY
 @contextlib.contextmanager
 def with_registry(registry: Registry) -> Generator[None, Any, None]:
     """Contextmanager to change the current registry."""
@@ -183,7 +267,7 @@ def get_task(name: str, /) -> type[BaseTask]:
     Note: This method will import any lazily registered task.
     """
-    return _REGISTRY[name]
+    return _REGISTRY[name].task_class()
 def register_task(task: type[BaseTask]) -> str:

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/tasks/task_names.py RENAMED Viewed

@@ -30,6 +30,8 @@ def register_all_tasks() -> None:
     register_lazy_task("eval_framework.tasks.benchmarks.goldenswag.GOLDENSWAG_IDK")
     register_lazy_task("eval_framework.tasks.benchmarks.gpqa.GPQA_OLMES")
     register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8K_OLMES")
+    register_lazy_task("eval_framework.tasks.benchmarks.gsm8k.GSM8KBPB")
+    register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.MATHMinervaBPB")
     register_lazy_task("eval_framework.tasks.benchmarks.math_reasoning.GSM8KReasoning")
     register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG")
     register_lazy_task("eval_framework.tasks.benchmarks.hellaswag.HELLASWAG_OLMES")

{eval_framework-0.5.0 → eval_framework-0.5.1}/src/eval_framework/utils/generate_task_docs.py RENAMED Viewed

@@ -6,7 +6,7 @@ from pathlib import Path
 import tqdm
-from eval_framework.tasks.registry import get_task, registered_task_names
+from eval_framework.tasks.registry import registered_task_names, registry
 from eval_framework.tasks.task_loader import load_extra_tasks
 from template_formatting.formatter import BaseFormatter, ConcatFormatter, Llama3Formatter
@@ -69,7 +69,8 @@ def generate_docs_for_task(
     output_docs_directory: Path, task_name: str, formatters: list[BaseFormatter], add_prompt_examples: bool
 ) -> None:
     """Generate documentation for a specific task."""
-    task_class = get_task(task_name)
+    eval_ = registry()[task_name]
+    task_class = eval_.task_class()
     try:
         num_fewshot = 1
@@ -98,16 +99,9 @@ def generate_docs_for_task(
             f.write(f"SAMPLE_SPLIT = {task.SAMPLE_SPLIT}".strip() + "\n")
         if hasattr(task, "FEWSHOT_SPLIT"):
             f.write(f"FEWSHOT_SPLIT = {task.FEWSHOT_SPLIT}".strip() + "\n")
-        if hasattr(task, "TASK_STYLER"):
-            f.write(f"RESPONSE_TYPE = {task.TASK_STYLER.response_type.name}".strip() + "\n")
-            metrics_list = [f"{m.__name__}" for m in task.TASK_STYLER.metrics]
-            f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
-        else:
-            if hasattr(task, "RESPONSE_TYPE"):
-                f.write(f"RESPONSE_TYPE = {task.RESPONSE_TYPE.name}".strip() + "\n")
-            if hasattr(task, "METRICS"):
-                metrics_list = [f"{m.__name__}" for m in task.METRICS]
-                f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
+        f.write(f"RESPONSE_TYPE = {eval_.response_type().name}".strip() + "\n")
+        metrics_list = [f"{m.__name__}" for m in eval_.metrics()]
+        f.write(f"METRICS = [{', '.join(metrics_list)}]".strip() + "\n")
         if hasattr(task, "SUBJECTS"):
             f.write(f"SUBJECTS = {repr(task.SUBJECTS)}".strip() + "\n")
         if hasattr(task, "LANGUAGE"):