eval-framework 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_framework/__init__.py +7 -0
- eval_framework/base_config.py +36 -0
- eval_framework/context/__init__.py +0 -0
- eval_framework/context/determined.py +170 -0
- eval_framework/context/eval.py +114 -0
- eval_framework/context/local.py +52 -0
- eval_framework/evaluation_generator.py +231 -0
- eval_framework/exceptions.py +2 -0
- eval_framework/external/ifeval_impl/README.md +5 -0
- eval_framework/external/ifeval_impl/instructions.py +1523 -0
- eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
- eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
- eval_framework/external/ifeval_impl/utils.py +135 -0
- eval_framework/llm/__init__.py +0 -0
- eval_framework/llm/aleph_alpha.py +323 -0
- eval_framework/llm/base.py +58 -0
- eval_framework/llm/huggingface.py +332 -0
- eval_framework/llm/mistral.py +73 -0
- eval_framework/llm/models.py +16 -0
- eval_framework/llm/openai.py +205 -0
- eval_framework/llm/vllm.py +438 -0
- eval_framework/logger.py +3 -0
- eval_framework/main.py +187 -0
- eval_framework/metrics/__init__.py +0 -0
- eval_framework/metrics/base.py +40 -0
- eval_framework/metrics/completion/__init__.py +1 -0
- eval_framework/metrics/completion/accuracy_completion.py +16 -0
- eval_framework/metrics/completion/bleu.py +76 -0
- eval_framework/metrics/completion/chrf.py +62 -0
- eval_framework/metrics/completion/code_assertion.py +44 -0
- eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
- eval_framework/metrics/completion/comet.py +56 -0
- eval_framework/metrics/completion/concordance_index.py +38 -0
- eval_framework/metrics/completion/csv_format.py +102 -0
- eval_framework/metrics/completion/cwe_accuracy.py +49 -0
- eval_framework/metrics/completion/exponential_similarity.py +65 -0
- eval_framework/metrics/completion/f1.py +42 -0
- eval_framework/metrics/completion/format_checker.py +56 -0
- eval_framework/metrics/completion/grid_difference.py +77 -0
- eval_framework/metrics/completion/ifeval.py +73 -0
- eval_framework/metrics/completion/json_format.py +171 -0
- eval_framework/metrics/completion/language_checker.py +74 -0
- eval_framework/metrics/completion/length_control.py +83 -0
- eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
- eval_framework/metrics/completion/niah_accuracy.py +163 -0
- eval_framework/metrics/completion/placeholder_checker.py +27 -0
- eval_framework/metrics/completion/repetition.py +88 -0
- eval_framework/metrics/completion/rouge_1.py +35 -0
- eval_framework/metrics/completion/rouge_2.py +45 -0
- eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
- eval_framework/metrics/completion/rouge_l.py +52 -0
- eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
- eval_framework/metrics/completion/ter.py +67 -0
- eval_framework/metrics/completion/text_counter.py +182 -0
- eval_framework/metrics/efficiency/__init__.py +0 -0
- eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
- eval_framework/metrics/llm/__init__.py +0 -0
- eval_framework/metrics/llm/base.py +8 -0
- eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
- eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
- eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
- eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
- eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
- eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
- eval_framework/metrics/llm/graders/language.py +56 -0
- eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
- eval_framework/metrics/llm/graders/models.py +74 -0
- eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
- eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
- eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
- eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
- eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
- eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
- eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
- eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
- eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
- eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
- eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
- eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
- eval_framework/metrics/llm/llm_judge_sql.py +394 -0
- eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
- eval_framework/metrics/loglikelihood/__init__.py +0 -0
- eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
- eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
- eval_framework/py.typed +0 -0
- eval_framework/response_generator.py +416 -0
- eval_framework/result_processors/__init__.py +0 -0
- eval_framework/result_processors/base.py +74 -0
- eval_framework/result_processors/hf_processor.py +87 -0
- eval_framework/result_processors/result_processor.py +129 -0
- eval_framework/run.py +314 -0
- eval_framework/run_direct.py +42 -0
- eval_framework/shared/types.py +227 -0
- eval_framework/tasks/__init__.py +6 -0
- eval_framework/tasks/base.py +314 -0
- eval_framework/tasks/benchmarks/__init__.py +0 -0
- eval_framework/tasks/benchmarks/arc.py +46 -0
- eval_framework/tasks/benchmarks/arc_de.py +46 -0
- eval_framework/tasks/benchmarks/arc_fi.py +46 -0
- eval_framework/tasks/benchmarks/belebele.py +60 -0
- eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
- eval_framework/tasks/benchmarks/casehold.py +47 -0
- eval_framework/tasks/benchmarks/chembench.py +85 -0
- eval_framework/tasks/benchmarks/copa.py +39 -0
- eval_framework/tasks/benchmarks/duc.py +91 -0
- eval_framework/tasks/benchmarks/flores200.py +62 -0
- eval_framework/tasks/benchmarks/flores_plus.py +84 -0
- eval_framework/tasks/benchmarks/gpqa.py +177 -0
- eval_framework/tasks/benchmarks/gsm8k.py +148 -0
- eval_framework/tasks/benchmarks/hellaswag.py +44 -0
- eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
- eval_framework/tasks/benchmarks/humaneval.py +97 -0
- eval_framework/tasks/benchmarks/ifeval.py +78 -0
- eval_framework/tasks/benchmarks/include.py +119 -0
- eval_framework/tasks/benchmarks/infinitebench.py +302 -0
- eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
- eval_framework/tasks/benchmarks/mbpp.py +192 -0
- eval_framework/tasks/benchmarks/mmlu.py +190 -0
- eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
- eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
- eval_framework/tasks/benchmarks/mmmlu.py +529 -0
- eval_framework/tasks/benchmarks/openbookqa.py +37 -0
- eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
- eval_framework/tasks/benchmarks/pawsx.py +65 -0
- eval_framework/tasks/benchmarks/piqa.py +39 -0
- eval_framework/tasks/benchmarks/quality.py +56 -0
- eval_framework/tasks/benchmarks/sciq.py +44 -0
- eval_framework/tasks/benchmarks/sphyr.py +75 -0
- eval_framework/tasks/benchmarks/squad.py +89 -0
- eval_framework/tasks/benchmarks/struct_eval.py +110 -0
- eval_framework/tasks/benchmarks/tablebench.py +117 -0
- eval_framework/tasks/benchmarks/triviaqa.py +42 -0
- eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
- eval_framework/tasks/benchmarks/winogender.py +39 -0
- eval_framework/tasks/benchmarks/winogrande.py +44 -0
- eval_framework/tasks/benchmarks/winox.py +57 -0
- eval_framework/tasks/benchmarks/wmt.py +160 -0
- eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
- eval_framework/tasks/eval_config.py +112 -0
- eval_framework/tasks/perturbation.py +83 -0
- eval_framework/tasks/registry.py +186 -0
- eval_framework/tasks/task_loader.py +80 -0
- eval_framework/tasks/task_names.py +138 -0
- eval_framework/tasks/utils.py +578 -0
- eval_framework/utils/constants.py +9 -0
- eval_framework/utils/generate_task_docs.py +229 -0
- eval_framework/utils/helpers.py +3 -0
- eval_framework/utils/logging.py +50 -0
- eval_framework/utils/packaging.py +52 -0
- eval_framework-0.2.0.dist-info/METADATA +514 -0
- eval_framework-0.2.0.dist-info/RECORD +161 -0
- eval_framework-0.2.0.dist-info/WHEEL +4 -0
- eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
- template_formatting/README.md +83 -0
- template_formatting/__init__.py +0 -0
- template_formatting/formatter.py +536 -0
- template_formatting/mistral_formatter.py +159 -0
- template_formatting/py.typed +0 -0
- template_formatting/tests/test_formatter_eval.py +408 -0
- template_formatting/tests/test_formatter_scaling.py +253 -0
- template_formatting/tests/test_mistral_formatter.py +136 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from collections.abc import Sequence
|
|
3
|
+
from typing import Literal, Protocol
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from template_formatting.formatter import ChatTemplate, Message, Role
|
|
8
|
+
|
|
9
|
+
package_exists_mc = pytest.importorskip("mistral_common")
|
|
10
|
+
package_exists_hf = pytest.importorskip("huggingface_hub")
|
|
11
|
+
|
|
12
|
+
if package_exists_mc and package_exists_hf:
|
|
13
|
+
from huggingface_hub.utils import RepositoryNotFoundError
|
|
14
|
+
|
|
15
|
+
from template_formatting.mistral_formatter import MagistralFormatter, MistralFormatter, MistralSerializer
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TypingStub(Protocol):
    """Structural type describing what the tests in this module use from a formatter.

    Only two members are declared: the ``template`` attribute and a ``format``
    method restricted to the ``"list"`` output mode.
    """

    template: ChatTemplate

    def format(  # type: ignore[override]
        self,
        messages: Sequence[Message],
        output_mode: Literal["list"] = "list",
    ) -> list[Message]:
        """Format ``messages`` and return the result as a list of ``Message`` objects."""
        ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TestHFAssetRetrieval:
    """Checks how ``MagistralFormatter`` construction reacts to its ``llm_target``.

    NOTE(review): the class name suggests the constructor fetches assets from
    the Hugging Face Hub, so these tests presumably need network access - confirm.
    """

    def test_existing_repo(self) -> None:
        """A known target yields a formatter whose template has a non-empty system prompt."""
        system_prompt = MagistralFormatter(llm_target="mistralai/Magistral-Small-2506").template.system_prompt
        assert len(system_prompt) > 0

    def test_non_existing_repo(self) -> None:
        """An unknown target makes the constructor raise ``RepositoryNotFoundError``."""
        unknown_target = "Qwen/phariachat"
        with pytest.raises(RepositoryNotFoundError):
            MagistralFormatter(llm_target=unknown_target)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TestMistralFormatter:
    """Tests for ``MistralFormatter`` in list output mode, plus a tokenizer sanity check."""

    @pytest.fixture
    def mformatter(self) -> TypingStub:
        """Return a ``MistralFormatter`` built for the Magistral-Small-2506 target."""
        return MistralFormatter(llm_target="mistralai/Magistral-Small-2506")

    @pytest.fixture
    def chat(self) -> list[Message]:
        """Return a three-message conversation: system, user, assistant."""
        return [
            Message(
                content="You are a helpful assistant that provides clear and concise"
                " answers to general knowledge questions.",
                role=Role.SYSTEM,
            ),
            Message(content="What is the capital of France?", role=Role.USER),
            Message(content="The capital of France is Paris.", role=Role.ASSISTANT),
        ]

    def test_encoding_challenge(self, chat: list[Message]) -> None:
        """
        The current template-formatting code inserts special tokens directly into
        the prompt text (e.g. <system_prompt>Be a kind agent</system_prompt>). The
        formatted prompt is later fed to the LLM for inference.

        The downside of this approach is that some tokenizers do not detect special
        tokens in plain text and do not map them to their dedicated indices; instead
        the markers are tokenized alongside the regular text. This is the case for
        some of Mistral's tokenizers.

        This test validates that behaviour and thereby documents why a customized
        MistralFormatter is needed: encoding the request object and re-encoding its
        rendered text must produce different token ids.
        """
        mistral_serializer = MistralSerializer(llm_target="mistralai/Magistral-Small-2506")
        instruct_tokenizer = mistral_serializer.tokenizer.instruct_tokenizer

        mistral_msg_lst = mistral_serializer.convert_from_aa(msg_lst=chat)
        mistral_request_object = mistral_serializer.build_mistral_request(mistral_msg_lst=mistral_msg_lst)
        mistral_tokenized_object = instruct_tokenizer.encode_instruct(mistral_request_object)

        expected_token_ids = mistral_tokenized_object.tokens
        formatted_prompt_txt = mistral_tokenized_object.text
        # NOTE(review): the two positional flags are presumably ``bos`` and ``eos`` - confirm
        # against the mistral_common tokenizer signature before switching to keywords.
        formatted_token_ids = instruct_tokenizer.tokenizer.encode(formatted_prompt_txt, False, False)
        assert expected_token_ids != formatted_token_ids

    @staticmethod
    def __validate_request(request_msgs: list[Message], msg_lst: list[Message]) -> bool:
        """Return True when both lists hold the same roles and contents, position by position.

        Args:
            request_msgs: Messages produced by the formatter.
            msg_lst: Messages the formatter is expected to produce.

        Returns:
            False when the lists differ in length or any pair differs in ``role``
            or ``content``; True otherwise.
        """
        # A length mismatch must fail explicitly: comparing only the overlapping prefix
        # would let an empty or truncated formatter output pass unnoticed.
        if len(request_msgs) != len(msg_lst):
            return False
        return all(
            actual.role == expected.role and actual.content == expected.content
            for actual, expected in zip(request_msgs, msg_lst)
        )

    def test_base_request(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """A system/user/assistant conversation is returned unchanged."""
        # The formatter receives a deep copy so the expectation cannot alias its input.
        output_openai_msgs: list[Message] = mformatter.format(messages=copy.deepcopy(chat))
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=chat)

    def test_multiple_user_request(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """Two consecutive user messages are merged into one, joined by a blank line."""
        extra_user_msg = Message(content="What is the most beautiful monument in Paris ?", role=Role.USER)

        test_case = copy.deepcopy(chat)
        test_case.insert(2, extra_user_msg)

        # Build the expectation from the untouched fixture, independent of the objects
        # handed to the formatter.
        expected = [
            chat[0],
            Message(content=f"{chat[1].content}\n\n{extra_user_msg.content}", role=Role.USER),
            chat[2],
        ]

        output_openai_msgs: list[Message] = mformatter.format(messages=test_case)
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=expected)

    def test_no_system_request(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """A conversation without a system message is returned unchanged."""
        expected = chat[1:]
        output_openai_msgs: list[Message] = mformatter.format(messages=copy.deepcopy(expected))
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=expected)

    def test_complete_prompt(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """A conversation without the trailing assistant message is returned unchanged."""
        expected = chat[:-1]
        output_openai_msgs: list[Message] = mformatter.format(messages=copy.deepcopy(expected))
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=expected)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class TestMagistralFormatter:
    """Tests for how ``MagistralFormatter`` handles the system prompt."""

    @pytest.fixture
    def magistral_formatter(self) -> TypingStub:
        """Return a ``MagistralFormatter`` built for the Magistral-Small-2506 target."""
        return MagistralFormatter(llm_target="mistralai/Magistral-Small-2506")

    def test_system_prompt_addition(self, magistral_formatter: TypingStub) -> None:
        """Without a system message, the template's system prompt is placed first."""
        chat = [Message(role=Role.USER, content="What is the capital of france ?")]
        message_lst = magistral_formatter.format(messages=chat)
        assert message_lst[0].role == Role.SYSTEM
        assert message_lst[0].content == magistral_formatter.template.system_prompt

    def test_abandon_default_system_prompt(self, magistral_formatter: TypingStub) -> None:
        """A caller-supplied system message is kept as the first message."""
        chat = [
            Message(
                role=Role.SYSTEM,
                content="This prompt is specific to math problems; complext prompts. You need to be smart"
                " to solve these problems.",
            ),
            Message(role=Role.USER, content="What is the gradient of a quadratic function"),
        ]
        message_lst = magistral_formatter.format(messages=chat)
        assert message_lst[0].role == Role.SYSTEM
        assert message_lst[0].content == chat[0].content
|