eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161) hide show
  1. eval_framework/__init__.py +7 -0
  2. eval_framework/base_config.py +36 -0
  3. eval_framework/context/__init__.py +0 -0
  4. eval_framework/context/determined.py +170 -0
  5. eval_framework/context/eval.py +114 -0
  6. eval_framework/context/local.py +52 -0
  7. eval_framework/evaluation_generator.py +231 -0
  8. eval_framework/exceptions.py +2 -0
  9. eval_framework/external/ifeval_impl/README.md +5 -0
  10. eval_framework/external/ifeval_impl/instructions.py +1523 -0
  11. eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
  12. eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
  13. eval_framework/external/ifeval_impl/utils.py +135 -0
  14. eval_framework/llm/__init__.py +0 -0
  15. eval_framework/llm/aleph_alpha.py +323 -0
  16. eval_framework/llm/base.py +58 -0
  17. eval_framework/llm/huggingface.py +332 -0
  18. eval_framework/llm/mistral.py +73 -0
  19. eval_framework/llm/models.py +16 -0
  20. eval_framework/llm/openai.py +205 -0
  21. eval_framework/llm/vllm.py +438 -0
  22. eval_framework/logger.py +3 -0
  23. eval_framework/main.py +187 -0
  24. eval_framework/metrics/__init__.py +0 -0
  25. eval_framework/metrics/base.py +40 -0
  26. eval_framework/metrics/completion/__init__.py +1 -0
  27. eval_framework/metrics/completion/accuracy_completion.py +16 -0
  28. eval_framework/metrics/completion/bleu.py +76 -0
  29. eval_framework/metrics/completion/chrf.py +62 -0
  30. eval_framework/metrics/completion/code_assertion.py +44 -0
  31. eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
  32. eval_framework/metrics/completion/comet.py +56 -0
  33. eval_framework/metrics/completion/concordance_index.py +38 -0
  34. eval_framework/metrics/completion/csv_format.py +102 -0
  35. eval_framework/metrics/completion/cwe_accuracy.py +49 -0
  36. eval_framework/metrics/completion/exponential_similarity.py +65 -0
  37. eval_framework/metrics/completion/f1.py +42 -0
  38. eval_framework/metrics/completion/format_checker.py +56 -0
  39. eval_framework/metrics/completion/grid_difference.py +77 -0
  40. eval_framework/metrics/completion/ifeval.py +73 -0
  41. eval_framework/metrics/completion/json_format.py +171 -0
  42. eval_framework/metrics/completion/language_checker.py +74 -0
  43. eval_framework/metrics/completion/length_control.py +83 -0
  44. eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
  45. eval_framework/metrics/completion/niah_accuracy.py +163 -0
  46. eval_framework/metrics/completion/placeholder_checker.py +27 -0
  47. eval_framework/metrics/completion/repetition.py +88 -0
  48. eval_framework/metrics/completion/rouge_1.py +35 -0
  49. eval_framework/metrics/completion/rouge_2.py +45 -0
  50. eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
  51. eval_framework/metrics/completion/rouge_l.py +52 -0
  52. eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
  53. eval_framework/metrics/completion/ter.py +67 -0
  54. eval_framework/metrics/completion/text_counter.py +182 -0
  55. eval_framework/metrics/efficiency/__init__.py +0 -0
  56. eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
  57. eval_framework/metrics/llm/__init__.py +0 -0
  58. eval_framework/metrics/llm/base.py +8 -0
  59. eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
  60. eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
  61. eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
  62. eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
  63. eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
  64. eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
  65. eval_framework/metrics/llm/graders/language.py +56 -0
  66. eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
  67. eval_framework/metrics/llm/graders/models.py +74 -0
  68. eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
  69. eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
  70. eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
  71. eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
  72. eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
  73. eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
  74. eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
  75. eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
  76. eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
  77. eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
  78. eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
  79. eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
  80. eval_framework/metrics/llm/llm_judge_sql.py +394 -0
  81. eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
  82. eval_framework/metrics/loglikelihood/__init__.py +0 -0
  83. eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
  84. eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
  85. eval_framework/py.typed +0 -0
  86. eval_framework/response_generator.py +416 -0
  87. eval_framework/result_processors/__init__.py +0 -0
  88. eval_framework/result_processors/base.py +74 -0
  89. eval_framework/result_processors/hf_processor.py +87 -0
  90. eval_framework/result_processors/result_processor.py +129 -0
  91. eval_framework/run.py +314 -0
  92. eval_framework/run_direct.py +42 -0
  93. eval_framework/shared/types.py +227 -0
  94. eval_framework/tasks/__init__.py +6 -0
  95. eval_framework/tasks/base.py +314 -0
  96. eval_framework/tasks/benchmarks/__init__.py +0 -0
  97. eval_framework/tasks/benchmarks/arc.py +46 -0
  98. eval_framework/tasks/benchmarks/arc_de.py +46 -0
  99. eval_framework/tasks/benchmarks/arc_fi.py +46 -0
  100. eval_framework/tasks/benchmarks/belebele.py +60 -0
  101. eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
  102. eval_framework/tasks/benchmarks/casehold.py +47 -0
  103. eval_framework/tasks/benchmarks/chembench.py +85 -0
  104. eval_framework/tasks/benchmarks/copa.py +39 -0
  105. eval_framework/tasks/benchmarks/duc.py +91 -0
  106. eval_framework/tasks/benchmarks/flores200.py +62 -0
  107. eval_framework/tasks/benchmarks/flores_plus.py +84 -0
  108. eval_framework/tasks/benchmarks/gpqa.py +177 -0
  109. eval_framework/tasks/benchmarks/gsm8k.py +148 -0
  110. eval_framework/tasks/benchmarks/hellaswag.py +44 -0
  111. eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
  112. eval_framework/tasks/benchmarks/humaneval.py +97 -0
  113. eval_framework/tasks/benchmarks/ifeval.py +78 -0
  114. eval_framework/tasks/benchmarks/include.py +119 -0
  115. eval_framework/tasks/benchmarks/infinitebench.py +302 -0
  116. eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
  117. eval_framework/tasks/benchmarks/mbpp.py +192 -0
  118. eval_framework/tasks/benchmarks/mmlu.py +190 -0
  119. eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
  120. eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
  121. eval_framework/tasks/benchmarks/mmmlu.py +529 -0
  122. eval_framework/tasks/benchmarks/openbookqa.py +37 -0
  123. eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
  124. eval_framework/tasks/benchmarks/pawsx.py +65 -0
  125. eval_framework/tasks/benchmarks/piqa.py +39 -0
  126. eval_framework/tasks/benchmarks/quality.py +56 -0
  127. eval_framework/tasks/benchmarks/sciq.py +44 -0
  128. eval_framework/tasks/benchmarks/sphyr.py +75 -0
  129. eval_framework/tasks/benchmarks/squad.py +89 -0
  130. eval_framework/tasks/benchmarks/struct_eval.py +110 -0
  131. eval_framework/tasks/benchmarks/tablebench.py +117 -0
  132. eval_framework/tasks/benchmarks/triviaqa.py +42 -0
  133. eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
  134. eval_framework/tasks/benchmarks/winogender.py +39 -0
  135. eval_framework/tasks/benchmarks/winogrande.py +44 -0
  136. eval_framework/tasks/benchmarks/winox.py +57 -0
  137. eval_framework/tasks/benchmarks/wmt.py +160 -0
  138. eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
  139. eval_framework/tasks/eval_config.py +112 -0
  140. eval_framework/tasks/perturbation.py +83 -0
  141. eval_framework/tasks/registry.py +186 -0
  142. eval_framework/tasks/task_loader.py +80 -0
  143. eval_framework/tasks/task_names.py +138 -0
  144. eval_framework/tasks/utils.py +578 -0
  145. eval_framework/utils/constants.py +9 -0
  146. eval_framework/utils/generate_task_docs.py +229 -0
  147. eval_framework/utils/helpers.py +3 -0
  148. eval_framework/utils/logging.py +50 -0
  149. eval_framework/utils/packaging.py +52 -0
  150. eval_framework-0.2.0.dist-info/METADATA +514 -0
  151. eval_framework-0.2.0.dist-info/RECORD +161 -0
  152. eval_framework-0.2.0.dist-info/WHEEL +4 -0
  153. eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
  154. template_formatting/README.md +83 -0
  155. template_formatting/__init__.py +0 -0
  156. template_formatting/formatter.py +536 -0
  157. template_formatting/mistral_formatter.py +159 -0
  158. template_formatting/py.typed +0 -0
  159. template_formatting/tests/test_formatter_eval.py +408 -0
  160. template_formatting/tests/test_formatter_scaling.py +253 -0
  161. template_formatting/tests/test_mistral_formatter.py +136 -0
@@ -0,0 +1,136 @@
1
+ import copy
2
+ from collections.abc import Sequence
3
+ from typing import Literal, Protocol
4
+
5
+ import pytest
6
+
7
+ from template_formatting.formatter import ChatTemplate, Message, Role
8
+
9
+ package_exists_mc = pytest.importorskip("mistral_common")
10
+ package_exists_hf = pytest.importorskip("huggingface_hub")
11
+
12
+ if package_exists_mc and package_exists_hf:
13
+ from huggingface_hub.utils import RepositoryNotFoundError
14
+
15
+ from template_formatting.mistral_formatter import MagistralFormatter, MistralFormatter, MistralSerializer
16
+
17
+
18
class TypingStub(Protocol):
    """Structural type used to annotate the formatter fixtures in this module.

    It describes only what the tests touch: a ``template`` attribute and a
    ``format`` call in ``"list"`` output mode that returns a list of messages.
    """

    # Chat template carried by the formatter; the tests read its ``system_prompt``.
    template: ChatTemplate

    def format(  # type: ignore[override]
        self,
        messages: Sequence[Message],
        output_mode: Literal["list"] = "list",
    ) -> list[Message]:
        """Return the formatted conversation as a list of ``Message`` objects."""
        ...
24
+
25
+
26
class TestHFAssetRetrieval:
    """Checks how ``MagistralFormatter`` behaves for valid and invalid ``llm_target`` repo ids."""

    def test_existing_repo(self) -> None:
        """A known repo id yields a formatter whose template has a non-empty system prompt."""
        system_prompt = MagistralFormatter(llm_target="mistralai/Magistral-Small-2506").template.system_prompt
        assert len(system_prompt) > 0

    def test_non_existing_repo(self) -> None:
        """An unknown repo id must raise ``RepositoryNotFoundError`` at construction time."""
        unknown_repo = "Qwen/phariachat"
        with pytest.raises(RepositoryNotFoundError):
            MagistralFormatter(llm_target=unknown_repo)
34
+
35
+
36
class TestMistralFormatter:
    """Tests for ``MistralFormatter`` in list output mode, plus a tokenizer sanity check."""

    @pytest.fixture
    def mformatter(self) -> TypingStub:
        """Formatter under test, built for the ``mistralai/Magistral-Small-2506`` target."""
        return MistralFormatter(llm_target="mistralai/Magistral-Small-2506")

    @pytest.fixture
    def chat(self) -> list[Message]:
        """A three-message conversation: system prompt, user question, assistant answer."""
        return [
            Message(
                content="You are a helpful assistant that provides clear and concise"
                " answers to general knowledge questions.",
                role=Role.SYSTEM,
            ),
            Message(content="What is the capital of France?", role=Role.USER),
            Message(content="The capital of France is Paris.", role=Role.ASSISTANT),
        ]

    def test_encoding_challenge(self, chat: list[Message]) -> None:
        """
        The current template-formatting repo introduces special tokens directly into
        the prompt (e.g. <system_prompt>Be a kind agent</system_prompt>). The formatted
        prompt is later fed to the LLM for inference.

        The downside of this approach is that some tokenizers don't detect special tokens
        and map them to unique indices. Instead, these are parsed alongside the text, as
        is the case with some of Mistral's tokenizers.

        This test serves as a validation check and further argumentation for the need of
        a customized MistralFormatter.
        """
        serializer = MistralSerializer(llm_target="mistralai/Magistral-Small-2506")

        mistral_messages = serializer.convert_from_aa(msg_lst=chat)
        request = serializer.build_mistral_request(mistral_msg_lst=mistral_messages)
        tokenized = serializer.tokenizer.instruct_tokenizer.encode_instruct(request)

        expected_token_ids = tokenized.tokens
        # Re-encode the rendered prompt text with the raw tokenizer.
        # NOTE(review): the two positional flags are presumably bos/eos - confirm against mistral_common.
        reencoded_token_ids = serializer.tokenizer.instruct_tokenizer.tokenizer.encode(tokenized.text, False, False)

        # The ids must differ: special tokens in plain text are not mapped to their dedicated ids.
        assert expected_token_ids != reencoded_token_ids

    @staticmethod
    def __validate_request(request_msgs: list[Message], msg_lst: list[Message]) -> bool:
        """Return True when both lists have the same length and match pairwise on role and content.

        Args:
            request_msgs: Messages produced by the formatter.
            msg_lst: Messages the formatter is expected to produce.

        Returns:
            ``True`` only if the lists are equally long and every pair agrees on
            ``role`` and ``content``; ``False`` otherwise.
        """
        # Without this guard an empty or truncated output would pass vacuously,
        # and a longer one would raise IndexError instead of failing the assertion.
        if len(request_msgs) != len(msg_lst):
            return False
        return all(
            actual.role == expected.role and actual.content == expected.content
            for actual, expected in zip(request_msgs, msg_lst)
        )

    def test_base_request(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """A system/user/assistant conversation is returned unchanged."""
        output_openai_msgs: list[Message] = mformatter.format(messages=chat)
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=chat)

    def test_multiple_user_request(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """Two consecutive user messages are merged into one, joined by a blank line."""
        system_msg, user_msg, assistant_msg = chat
        extra_user_msg = Message(content="What is the most beautiful monument in Paris ?", role=Role.USER)

        # Build the expectation from fresh objects *before* formatting, so that it
        # cannot be affected by anything the formatter does to its input.
        expected = [
            system_msg,
            Message(content=f"{user_msg.content}\n\n{extra_user_msg.content}", role=Role.USER),
            assistant_msg,
        ]

        # Deep copy: the formatter input shares no Message instance with ``expected``.
        test_case = copy.deepcopy(chat)
        test_case.insert(2, extra_user_msg)

        output_openai_msgs: list[Message] = mformatter.format(messages=test_case)
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=expected)

    def test_no_system_request(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """A conversation without a system message is returned unchanged."""
        test_case = chat[1:]
        output_openai_msgs: list[Message] = mformatter.format(messages=test_case)
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=test_case)

    def test_complete_prompt(self, mformatter: TypingStub, chat: list[Message]) -> None:
        """A conversation that ends on the user turn (no assistant reply) is returned unchanged."""
        test_case = chat[:-1]
        output_openai_msgs: list[Message] = mformatter.format(messages=test_case)
        assert self.__validate_request(request_msgs=output_openai_msgs, msg_lst=test_case)
111
+
112
+
113
class TestMagistralFormatter:
    """Tests for how ``MagistralFormatter`` handles the system prompt in list output mode."""

    @pytest.fixture
    def magistral_formatter(self) -> TypingStub:
        """Formatter under test, built for the ``mistralai/Magistral-Small-2506`` target."""
        return MagistralFormatter(llm_target="mistralai/Magistral-Small-2506")

    def test_system_prompt_addition(self, magistral_formatter: TypingStub) -> None:
        """Without a system message, the template's system prompt is placed first."""
        chat = [Message(role=Role.USER, content="What is the capital of france ?")]
        message_lst = magistral_formatter.format(messages=chat)
        assert message_lst[0].role == Role.SYSTEM
        assert message_lst[0].content == magistral_formatter.template.system_prompt

    def test_abandon_default_system_prompt(self, magistral_formatter: TypingStub) -> None:
        """A caller-supplied system message is kept instead of the template default."""
        # Hold the expected text separately so the check does not depend on the
        # input Message object still being untouched after formatting.
        custom_system_prompt = (
            "This prompt is specific to math problems; complext prompts. You need to be smart"
            " to solve these problems."
        )
        chat = [
            Message(role=Role.SYSTEM, content=custom_system_prompt),
            Message(role=Role.USER, content="What is the gradient of a quadratic function"),
        ]
        message_lst = magistral_formatter.format(messages=chat)
        assert message_lst[0].role == Role.SYSTEM
        assert message_lst[0].content == custom_system_prompt