eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import importlib.metadata
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import jsonlines
|
|
10
|
+
from pydantic import RootModel
|
|
11
|
+
|
|
12
|
+
from eval_framework.result_processors.base import Result, ResultProcessor
|
|
13
|
+
from eval_framework.shared.types import Completion, Loglikelihood
|
|
14
|
+
from eval_framework.tasks.eval_config import EvalConfig
|
|
15
|
+
|
|
16
|
+
MAIN = "eval_framework_results"
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ResultsFileProcessor(ResultProcessor):
    """Persist evaluation artifacts as JSON / JSON Lines files in one directory.

    Files handled below ``output_dir``:

    * ``metadata.json`` - run metadata
    * ``output.jsonl`` - one model response per line
    * ``results.jsonl`` - one metric result per line
    * ``aggregated_results.json`` - aggregated metric values
    """

    def __init__(self, output_dir: Path) -> None:
        """Store ``output_dir`` and create it (with parents) if it is missing."""
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def save_metadata(self, metadata: dict) -> None:
        """Write ``metadata`` to ``metadata.json``, replacing any previous file."""
        with open(self.output_dir / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    def load_metadata(self) -> dict:
        """Return the content of ``metadata.json``, or ``{}`` if it does not exist."""
        metadata_file = self.output_dir / "metadata.json"
        if not os.path.exists(metadata_file):
            logger.info("No metadata found.")
            return {}
        with open(metadata_file, encoding="utf-8") as f:
            return json.load(f)

    def save_responses(self, responses: list[Completion | Loglikelihood]) -> None:
        """Overwrite ``output.jsonl`` with all ``responses``, one per line."""
        responses_data = [response.model_dump(serialize_as_any=True) for response in responses]
        with jsonlines.open(self.output_dir / "output.jsonl", "w") as f:
            f.write_all(responses_data)

    def save_response(self, response: Completion | Loglikelihood) -> None:
        """Append a single ``response`` to ``output.jsonl``."""
        with jsonlines.open(self.output_dir / "output.jsonl", "a") as f:
            f.write(response.model_dump(serialize_as_any=True))

    def load_responses(self) -> list[Completion | Loglikelihood]:
        """Load the responses stored in ``output.jsonl``.

        Returns an empty list when the file is missing. When the file cannot be
        decoded, or contains duplicate ``(id, subject)`` pairs, it is renamed to
        ``output.jsonl.broken.<timestamp>`` and an empty list is returned. The
        duplicate check is skipped when the path contains ``"mtbench"``.
        """
        output_file = self.output_dir / "output.jsonl"
        broken_file = output_file.with_suffix(f".jsonl.broken.{datetime.now().strftime('%Y%m%d_%H%M%S')}")

        # Stays empty unless the whole file was read and validated successfully.
        responses: list[Completion | Loglikelihood] = []
        try:
            Item = RootModel[Loglikelihood | Completion]
            with jsonlines.open(output_file, "r") as f:
                responses = [Item.model_validate(x).root for x in f]
        except FileNotFoundError:
            logger.info("No saved completions found.")
        except (json.decoder.JSONDecodeError, jsonlines.jsonlines.InvalidLineError):
            # Logged as a warning: the user's file is moved away, which must not
            # go unnoticed under the default WARNING log threshold.
            logger.warning(
                f"Error decoding JSON, the file is corrupted. It will be renamed to {broken_file} and ignored"
            )
            output_file.rename(broken_file)

        ids_list = [(resp.id, resp.subject) for resp in responses]
        num_duplicates = len(ids_list) - len(set(ids_list))
        if num_duplicates != 0 and "mtbench" not in str(output_file):
            logger.warning(
                f"Error: {num_duplicates} duplicate response IDs found, the file is corrupted. "
                f"It will be renamed to {broken_file} and ignored"
            )
            output_file.rename(broken_file)
            responses = []

        return responses

    def save_metrics_results(self, results: list[Result]) -> None:
        """Overwrite ``results.jsonl`` with all ``results``, one per line."""
        result_data = [x.model_dump() for x in results]
        with jsonlines.open(self.output_dir / "results.jsonl", "w") as f:
            f.write_all(result_data)

    def save_metrics_result(self, result: Result) -> None:
        """Append a single ``result`` to ``results.jsonl``."""
        with jsonlines.open(self.output_dir / "results.jsonl", "a") as f:
            f.write(result.model_dump())

    def save_aggregated_results(self, results: dict[str, float | None]) -> None:
        """Write the aggregated ``results`` to ``aggregated_results.json``."""
        with open(self.output_dir / "aggregated_results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

    def load_metrics_results(self) -> list[Result]:
        """Load ``results.jsonl``; return an empty list when the file is missing.

        Unlike ``load_responses``, decoding errors are not caught here and
        propagate to the caller.
        """
        results_file = self.output_dir / "results.jsonl"
        try:
            with jsonlines.open(results_file, "r") as f:
                result_data = [Result.model_validate(x) for x in f]
            return result_data
        except FileNotFoundError:
            logger.info("No saved metrics found.")
            return []
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def generate_output_dir(llm_name: str, config: EvalConfig) -> Path:
    """Build the output directory path for one evaluation run.

    Layout::

        <config.output_dir>/<llm_name>/v<version>_<task_name>/<params>_<hash>[_<timestamp>]

    ``<params>`` lists the fewshot and sample counts (plus ``max_tokens`` when
    set), ``<hash>`` is the first five hex digits of the SHA-256 of the
    serialized config, and ``<timestamp>`` is only added when the ``DEBUG``
    environment variable equals "true" (case-insensitive).
    """
    # Version of the installed eval_framework distribution.
    version_str = f"v{importlib.metadata.version('eval_framework')}"

    # A value of None is rendered as the text "None" by the f-string.
    name_parts = [f"fewshot_{config.num_fewshot}", f"samples_{config.num_samples}"]
    if config.max_tokens is not None:
        name_parts.append(f"tokens_{config.max_tokens}")
    params_str = "__".join(name_parts)

    # Short fingerprint of the full config. NOTE(review): a stable hash relies
    # on model_json_dump() (defined elsewhere) being deterministic - confirm.
    serialized_config = config.model_json_dump()
    digest = hashlib.sha256(serialized_config.encode("utf-8")).hexdigest()
    dir_name = f"{params_str}_{digest[:5]}"

    # In debug mode each run gets its own timestamped directory.
    debug_enabled = os.environ.get("DEBUG", "FALSE").lower() == "true"
    if debug_enabled:
        dir_name += "_" + datetime.now().strftime("%Y%m%dT%H%M%S")

    return config.output_dir / llm_name / f"{version_str}_{config.task_name}" / dir_name
|
eval_framework/run.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import datetime
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from eval_framework.context.determined import DeterminedContext
|
|
9
|
+
except ImportError:
|
|
10
|
+
DeterminedContext = None # type: ignore
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
from eval_framework.context.local import LocalContext
|
|
14
|
+
from eval_framework.main import main
|
|
15
|
+
from eval_framework.tasks.task_loader import load_extra_tasks
|
|
16
|
+
from eval_framework.utils.logging import setup_logging
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Lookup table from the ``--context`` CLI choice to the context class to use.
# NOTE: the "determined" entry is ``None`` when the optional import of
# ``eval_framework.context.determined`` failed at module load time.
CONTEXT = dict(
    local=LocalContext,
    determined=DeterminedContext,
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_args() -> argparse.Namespace:
    """Parse the command line of the evaluation runner.

    Besides plain argparse parsing, two options are post-processed:

    * ``--llm-args`` - ``key=value`` pairs are collected into a dict. A key of
      the form ``outer.inner`` is stored as ``{"outer": {"inner": value}}``.
    * ``--judge-model-args`` - ``key=value`` pairs are collected into a flat dict.

    All values stay strings. Entries without ``=`` are skipped (with a warning).

    Returns:
        The parsed namespace, with ``llm_args`` and ``judge_model_args``
        replaced by the dicts described above.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--context",
        type=str,
        required=False,
        default="local",
        choices=["local", "determined"],
        help="The context in which the evaluation is run.",
    )
    parser.add_argument(
        "--models",
        type=Path,
        required=False,
        default=Path(__file__).parent / "llm" / "models.py",
        help="The path to the Python module file containing model classes.",
    )
    parser.add_argument(
        "--extra-task-modules",
        nargs="*",
        default=[],
        required=False,
        help="List of files and folders containing additional task definitions.",
    )
    parser.add_argument(
        "--llm-name",
        type=str,
        required=False,
        help=(
            "The class derived from `eval_framework.llm.base.BaseLLM` found in the "
            "models module to instantiate for evaluation."
        ),
    )
    parser.add_argument(
        "--llm-args",
        type=str,
        nargs="*",
        required=False,
        help="The arguments to pass to the LLM as key=value pairs.",
    )
    parser.add_argument(
        "--num-samples", type=int, required=False, help="The number of samples per subject to evaluate."
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        required=False,
        help="The maximum number of tokens to generate for each sample. Overwrites any task default value.",
    )
    parser.add_argument(
        "--num-fewshot", type=int, required=False, default=0, help="The number of fewshot examples to use."
    )
    parser.add_argument("--task-name", type=str, required=False, help="The name of the task to evaluate.")

    # Perturbation arguments
    parser.add_argument(
        "--perturbation-type",
        type=str,
        required=False,
        choices=[
            "editor",
            "permute",
            "replace",
            "delete",
            "uppercase",
        ],
        help=(
            "The type of perturbation to apply. Note that this may not make sense for some prompts, for example those "
            "containing math and code."
        ),
    )
    parser.add_argument(
        "--perturbation-probability",
        type=float,
        required=False,
        default=None,
        help="The probability of applying a perturbation to each word or character (between 0.0 and 1.0).",
    )
    parser.add_argument(
        "--perturbation-seed",
        type=int,
        required=False,
        default=42,
        help="Random seed controlling perturbations.",
    )
    parser.add_argument(
        "--task-subjects",
        type=str,
        nargs="+",
        required=False,
        help="The subjects of the task to evaluate. If empty, all subjects are evaluated. Subjects in the form of "
        "tuples can be specified in a comma-delimited way, possibly using wildcard * in some dimensions of a tuple, "
        'e.g. "DE_DE, *" or "FR_FR, astronomy".',
    )
    parser.add_argument(
        "--hf-revision",
        type=str,
        required=False,
        default=None,
        help="A tag name, a branch name, or commit hash for the task HF dataset.",
    )
    parser.add_argument(
        "--judge-models",
        type=Path,
        required=False,
        help="The path to the Python module file containing LLM judge model classes.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="outputs",
        required=False,
        help="The path for the evaluation outputs.",
    )
    parser.add_argument(
        "--hf-upload-repo",
        type=str,
        default="Aleph-Alpha/evaluation-results",
        required=False,
        help="Customizable path for the HuggingFace git repository where runs will be stored.",
    )
    parser.add_argument(
        "--hf-upload-dir",
        type=str,
        default="",
        required=False,
        help="Folder name for the HuggingFace git repository where runs will be stored.",
    )
    parser.add_argument(
        "--wandb-project",
        type=str,
        default=None,
        required=False,
        help="The name of the Weights & Biases project to log runs to. "
        "The environment variable WANDB_API_KEY must be set.",
    )
    parser.add_argument(
        "--wandb-entity",
        type=str,
        default=None,
        required=False,
        help="The name of the Weights & Biases entity to log runs to. Defaults to the user's default entity.",
    )
    parser.add_argument(
        "--wandb-run-id",
        type=str,
        default=None,
        required=False,
        help="The ID of an existing Weights & Biases run to resume. "
        "If not given, creates a new run. If given and exists, "
        "will continue the run but will overwrite the python command logged in wandb.",
    )
    parser.add_argument(
        "--description",
        type=str,
        required=False,
        help="Description of the run. This will be added to the metadata of the run to help with bookkeeping.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        required=False,
        help=(
            "Size of batch of samples to send to the LLM for evaluation in parallel. "
            "Use 1 for sequential running (default)."
        ),
    )
    # BooleanOptionalAction also registers --no-save-logs. With the previous
    # store_true + default=True combination the flag could never become False.
    parser.add_argument(
        "--save-logs",
        action=argparse.BooleanOptionalAction,
        default=True,
        required=False,
        help="Save logs to a file in the output directory. Enabled unless --no-save-logs is given.",
    )
    parser.add_argument(
        "--judge-model-name",
        type=str,
        required=False,
        help=(
            "The class derived from `eval_framework.llm.base.BaseLLM` found in the "
            "judge-models module to instantiate for LLM judge evaluation metrics."
        ),
    )
    parser.add_argument(
        "--judge-model-args",
        type=str,
        required=False,
        nargs="*",
        help=("The args of the judge model used."),
    )

    args = parser.parse_args()

    llm_args: dict[str, Any] = {}
    for arg in args.llm_args or []:
        if "=" not in arg:
            # Kept lenient (the entry is skipped), but no longer silent.
            logger.warning(f"Ignoring --llm-args entry without '=': {arg!r}")
            continue
        key, value = arg.split("=", 1)

        # Handle nested keys like "sampling_params.temperature=0.7"
        if "." in key:
            nested_key, sub_key = key.split(".", 1)
            nested = llm_args.setdefault(nested_key, {})
            if not isinstance(nested, dict):
                # e.g. "a=1 a.b=2": report a usage error instead of a TypeError.
                parser.error(
                    f"--llm-args: '{key}' conflicts with the non-nested value already given for '{nested_key}'."
                )
            nested[sub_key] = value
        else:
            llm_args[key] = value
    args.llm_args = llm_args

    judge_model_args: dict[str, Any] = {}
    for arg in args.judge_model_args or []:
        if "=" not in arg:
            logger.warning(f"Ignoring --judge-model-args entry without '=': {arg!r}")
            continue
        key, value = arg.split("=", 1)
        judge_model_args[key] = value
    args.judge_model_args = judge_model_args

    return args
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def run_with_kwargs(kwargs: dict) -> None:
    """Run one evaluation described by ``kwargs``.

    ``kwargs`` uses the same keys as the namespace returned by ``parse_args``.
    Logging is set up for the output directory, extra task modules are loaded,
    the context selected by ``kwargs["context"]`` is built and entered, and
    ``main`` is called with the LLM and config that the context provides.

    Keys that have a default on the command line (``extra_task_modules``,
    ``wandb_project``, ``wandb_entity``, ``wandb_run_id``, ``hf_upload_repo``)
    may be omitted; the same defaults are used here. ``kwargs`` is not modified.

    Raises:
        ImportError: if the selected context class could not be imported.
        ValueError: if the entered context has no configuration.
        KeyError: if a required key is missing from ``kwargs``.
    """
    # Setup logging for the output directory
    output_dir = kwargs.get("output_dir", "results")
    setup_logging(output_dir)

    now = datetime.datetime.now()
    logger.info(f"starting time: {now}")

    extra_task_modules = kwargs.get("extra_task_modules")
    if extra_task_modules:
        load_extra_tasks(extra_task_modules)

    # Read without pop() so the caller's dict can be reused afterwards.
    context_name = kwargs["context"]
    context_cls = CONTEXT[context_name]
    if context_cls is None:
        # The class is replaced by None when its optional import failed;
        # calling it would only produce "'NoneType' object is not callable".
        raise ImportError(
            f"The '{context_name}' context is not available because its module could not be imported."
        )

    context = context_cls(
        llm_name=kwargs["llm_name"],
        models_path=kwargs["models"],
        num_samples=kwargs["num_samples"],
        max_tokens=kwargs["max_tokens"],
        num_fewshot=kwargs["num_fewshot"],
        task_name=kwargs["task_name"],
        task_subjects=kwargs["task_subjects"],
        hf_revision=kwargs["hf_revision"],
        output_dir=kwargs["output_dir"],
        # The defaults below mirror the command-line defaults.
        wandb_project=kwargs.get("wandb_project"),
        wandb_entity=kwargs.get("wandb_entity"),
        wandb_run_id=kwargs.get("wandb_run_id"),
        hf_upload_dir=kwargs["hf_upload_dir"],
        hf_upload_repo=kwargs.get("hf_upload_repo", "Aleph-Alpha/evaluation-results"),
        llm_args=kwargs["llm_args"],
        judge_models_path=kwargs["judge_models"],
        judge_model_name=kwargs["judge_model_name"],
        judge_model_args=kwargs["judge_model_args"],
        batch_size=kwargs["batch_size"],
        description=kwargs["description"],
        perturbation_type=kwargs["perturbation_type"],
        perturbation_probability=kwargs["perturbation_probability"],
        perturbation_seed=kwargs["perturbation_seed"],
        # NOTE: "save_logs" is parsed on the command line but not forwarded.
    )

    with context as ctx:
        if ctx.config is None:
            raise ValueError(f"Context configuration is not set for '{type(ctx)}'.")

        main(
            llm=ctx.config.llm_class(**ctx.config.llm_args),
            config=ctx.config,
            should_preempt_callable=ctx.should_preempt,
            trial_id=ctx.get_trial_id(),
        )

    logger.info(f"time since start: {datetime.datetime.now() - now}")
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def run() -> None:
    """Command-line entry point: parse the arguments and run the evaluation."""
    cli_kwargs = vars(parse_args())
    run_with_kwargs(cli_kwargs)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# Enable execution via `python -m eval_framework.run`. Useful for
|
|
312
|
+
# debugging via `debugpy -m eval_framework.run`
|
|
313
|
+
if __name__ == "__main__":
|
|
314
|
+
run()
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from eval_framework.run import run_with_kwargs
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
# Local debugging entry point: runs one evaluation with hard-coded settings.
if __name__ == "__main__":
    logger.info(Path("models.py"))
    now = datetime.datetime.now()
    # logging applies %-formatting to its arguments, so a placeholder is needed;
    # without it the record fails to format and the message is lost.
    logger.info("starting time: %s", now)
    # insert API token here

    # main block for local debugging
    kwargs = {
        "context": "local",
        "models": Path("src/eval_framework/llm/models.py"),
        "judge_models": None,
        "judge_model_name": None,
        "judge_model_args": {},
        # ---
        "llm_name": "Llama31_8B_Instruct_API",
        "llm_args": {},
        "num_samples": 1,
        "max_tokens": None,
        "num_fewshot": 4,
        "task_name": "Math",  # complete task
        "task_subjects": None,
        "hf_revision": None,
        "output_dir": Path("outputs"),
        "hf_upload_dir": "",
        "description": "",
        "batch_size": 1,
        "perturbation_type": None,
        "perturbation_probability": None,
        "perturbation_seed": None,
        "save_logs": True,
        # Keys read by run_with_kwargs that were missing here (KeyError);
        # the values are the command-line defaults.
        "extra_task_modules": [],
        "hf_upload_repo": "Aleph-Alpha/evaluation-results",
        "wandb_project": None,
        "wandb_entity": None,
        "wandb_run_id": None,
    }
    run_with_kwargs(kwargs)

    logger.info("time since start: %s", datetime.datetime.now() - now)
|