evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,7 +1,15 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa: E501
  import re
-
- from evalscope.benchmarks import Benchmark, DataAdapter
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -10,28 +18,28 @@ logger = get_logger()
  # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


- @Benchmark.register(
- name='humaneval',
- pretty_name='HumanEval',
- tags=['Coding'],
- description=
- 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.', # noqa: E501
- dataset_id='modelscope/humaneval',
- subset_list=['openai_humaneval'],
- metric_list=['Pass@1'],
- few_shot_num=0,
- train_split=None,
- eval_split='test',
- prompt_template=
- 'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}', # noqa: E501
- extra_params={
- 'num_workers': 4,
- 'timeout': 4
- },
+ @register_benchmark(
+ BenchmarkMeta(
+ name='humaneval',
+ pretty_name='HumanEval',
+ tags=[Tags.CODING],
+ description=
+ 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+ dataset_id='opencompass/humaneval',
+ subset_list=['openai_humaneval'],
+ metric_list=['Pass@1'],
+ eval_split='test',
+ prompt_template=
+ 'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{question}',
+ extra_params={
+ 'num_workers': 4,
+ 'timeout': 4
+ },
+ )
  )
- class HumanevalAdapter(DataAdapter):
+ class HumanevalAdapter(DefaultDataAdapter):
  """
- A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+ HumanEval adapter using the new data processing framework.
  """

  def __init__(self, **kwargs):
@@ -39,9 +47,11 @@ class HumanevalAdapter(DataAdapter):
  from human_eval.data import stream_jsonl, write_jsonl
  from human_eval.evaluation import check_correctness
  except ImportError:
- raise ImportError('Please install human_eval:'
- 'https://github.com/openai/human-eval/tree/master#installation , '
- 'Note that you need to enable the execution code in the human_eval/execution.py first.')
+ raise ImportError(
+ 'Please install human_eval:'
+ 'https://github.com/openai/human-eval/tree/master#installation , '
+ 'Note that you need to enable the execution code in the human_eval/execution.py first.'
+ )
  super().__init__(**kwargs)

  extra_params = kwargs.get('extra_params', {})
@@ -53,41 +63,62 @@
  self.write_jsonl_func = write_jsonl
  self.eval_func = check_correctness

- def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
- data_dict = {}
- for subset_name in subset_list:
- data_dict[subset_name] = {}
- # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
- data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
-
- return data_dict
-
- def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
- """
- Generate prompt for the model.
-
- Args:
- input_d (dict): The raw input. A single data format of the Humaneval:
- {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
- """
- query = input_d['prompt']
- full_prompt = self.prompt_template.format(query=query)
-
- return self.gen_prompt_data(full_prompt)
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """Convert a data record to a Sample object."""
+ query = record['prompt']
+ full_prompt = self.prompt_template.format(question=query)
+
+ return Sample(
+ input=[ChatMessageUser(content=full_prompt)],
+ target=record['canonical_solution'],
+ metadata={
+ 'task_id': record['task_id'],
+ 'entry_point': record['entry_point'],
+ 'prompt': record['prompt'],
+ 'test': record['test'],
+ }
+ )
+
+ def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+ """Extract code from the prediction."""
+ return self._postprocess(prediction)

  @classmethod
  def _postprocess(cls, text: str) -> str:
+ """Extract code from markdown code blocks."""
  blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
  if len(blocks) >= 1:
  text = blocks[0]
  return text

- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
- return self._postprocess(result)
-
- def get_gold_answer(self, input_d: dict) -> str:
- return input_d
-
- def match(self, gold: str, pred: str) -> float:
- res = self.eval_func(gold, pred, self.timeout)
- return float(res['passed'])
+ def match_score(
+ self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+ ) -> Score:
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ # Execute the code and check correctness
+ res = self.eval_func(task_state.metadata, filtered_prediction, self.timeout)
+ passed = res['passed']
+
+ score.value = {'pass': passed}
+ score.explanation = res.get('result', 'Code execution completed')
+ score.metadata = {'task_id': task_state.metadata['task_id'], 'timeout': self.timeout, 'execution_result': res}
+ score.main_score_name = 'pass'
+
+ return score
+
+ def aggregate_scores(self, sample_scores):
+ from evalscope.metrics.metric import PassAtK
+
+ # caculate pass@k here
+ agg_list = []
+ for metric in self.metric_list:
+ if metric.lower().startswith('pass@'):
+ k = int(metric.split('@')[1])
+ # Get the scores for this metric
+ agg = PassAtK(k)
+ agg_list.extend(agg(sample_scores))
+ return agg_list
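
Both adapter rewrites in this release follow the same registration pattern introduced in 1.0: a BenchmarkMeta describing the benchmark is passed to register_benchmark, and the adapter subclasses DefaultDataAdapter and maps raw records to Sample objects. The following is a minimal sketch of that pattern, not part of the package diff; it reuses only the class and field names visible in the hunks above, while the benchmark name, dataset id, metric name, and the record fields 'question' and 'answer' are hypothetical placeholders.

from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.messages.chat_message import ChatMessageUser
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_toy_qa',  # hypothetical benchmark name
        pretty_name='MyToyQA',
        tags=[Tags.CODING],  # a Tags member shown in the diff; pick whichever fits the benchmark
        description='Toy benchmark illustrating the 1.0 adapter registration pattern.',
        dataset_id='my-org/my-toy-qa',  # hypothetical dataset id
        subset_list=['default'],
        metric_list=['acc'],  # assumed free-form metric name, as in the IFEval hunk below
        eval_split='test',
        prompt_template='{question}',
    )
)
class MyToyQAAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Map one raw dataset record to a Sample, mirroring HumanevalAdapter.record_to_sample above.
        prompt = self.prompt_template.format(question=record['question'])
        return Sample(
            input=[ChatMessageUser(content=prompt)],
            target=record['answer'],
            metadata=record,
        )

Scoring for such an adapter is handled by match_score; see the sketch after the IFEval adapter hunk below.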
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -1,54 +1,83 @@
- from collections import defaultdict
  from typing import Any, Dict, List

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import EvalType
- from evalscope.metrics import Metric, mean, metric_registry
-
-
- @Benchmark.register(
- name='ifeval',
- pretty_name='IFEval',
- tags=['Instruction-Following'],
- description=
- 'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
- dataset_id='opencompass/ifeval',
- subset_list=['default'],
- metric_list=[
- 'prompt_level_strict_acc',
- 'inst_level_strict_acc',
- 'prompt_level_loose_acc',
- 'inst_level_loose_acc',
- ],
- few_shot_num=0,
- train_split=None,
- eval_split='train',
- prompt_template='',
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='ifeval',
+ pretty_name='IFEval',
+ description=
+ 'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
+ tags=[Tags.INSTRUCTION_FOLLOWING],
+ dataset_id='opencompass/ifeval',
+ subset_list=['default'],
+ metric_list=[
+ 'prompt_level_strict',
+ 'inst_level_strict',
+ 'prompt_level_loose',
+ 'inst_level_loose',
+ ],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train',
+ prompt_template='',
+ )
  )
- class IFEvalAdapter(DataAdapter):
+ class IFEvalAdapter(DefaultDataAdapter):

  def __init__(self, **kwargs):
  super().__init__(**kwargs)

- # register metrics
- metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
- metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
- metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
- metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """
+ Convert a data record to a Sample object.

- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
- return self.gen_prompt_data(input_d['prompt'])
+ Args:
+ record (Dict[str, Any]): Input data record.

- def get_gold_answer(self, input_d: dict) -> str:
- return input_d
+ Returns:
+ Sample: Sample object with input, target, and metadata.
+ """
+ prompt = record.get('prompt', '')
+ message_list = [ChatMessageUser(content=prompt)]

- def match(self, gold: Any, pred: Any) -> Dict:
+ return Sample(input=message_list, target='', metadata=record)
+
+ def match_score(
+ self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+ ) -> Score:
+ """
+ Calculate evaluation scores by comparing prediction with reference.
+ """
  from evalscope.benchmarks.ifeval.utils import process_results

- return process_results(gold, [pred])
+ # Initialize the score object with prediction details
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+
+ doc = task_state.metadata
+ try:
+ # Process results using the existing ifeval utility
+ results = process_results(doc, [filtered_prediction])
+ score.value.update(results)
+
+ # Set main score name
+ score.main_score_name = 'prompt_level_strict'

- def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
- # aggregate review results
- res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+ except Exception as e:
+ logger.error(f'Error calculating ifeval metrics: {e}')
+ score.value = {}

- return super().compute_metric(res_dict, **kwargs)
+ return score
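
The scoring half of the new adapter contract is match_score: it receives the raw and post-processed predictions plus the TaskState, and returns a Score whose value dict holds one entry per metric in metric_list, with main_score_name naming the headline metric. Continuing the hypothetical MyToyQAAdapter from the sketch after the HumanEval hunks, here is a minimal exact-match version of that hook; it is an illustrative stand-in, not what IFEvalAdapter does, which delegates to process_results as shown above.

from evalscope.api.benchmark import DefaultDataAdapter
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score


class MyToyQAAdapter(DefaultDataAdapter):
    # record_to_sample(...) as in the earlier sketch.

    def match_score(
        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
    ) -> Score:
        # Keep both the raw and post-processed model output on the Score,
        # mirroring the Score(...) construction in the adapters above.
        score = Score(
            extracted_prediction=filtered_prediction,
            prediction=original_prediction,
        )
        # One value per metric declared in metric_list; 'acc' is the hypothetical metric from the earlier sketch.
        score.value = {'acc': float(filtered_prediction.strip() == str(reference).strip())}
        score.main_score_name = 'acc'
        return score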
evalscope/benchmarks/ifeval/instructions.py

@@ -21,7 +21,7 @@ import re
  import string
  from typing import Dict, Optional, Sequence, Union

- from evalscope.benchmarks.ifeval import instructions_util
+ from . import instructions_util

  _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]

@@ -140,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
  if self._language is None:
  self._language = random.choice(list(_LANGUAGES.keys()))
  # TODO(tianjianlu): opens the description generation to more choices.
- self._description_pattern = ('Your ENTIRE response should be in {language} language, no other '
- + 'language is allowed.')
+ self._description_pattern = (
+ 'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+ )
  return self._description_pattern.format(language=_LANGUAGES[self._language])

  def get_instruction_args(self):
@@ -197,8 +198,10 @@ class NumberOfSentences(Instruction):
  if relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
+ )
  else:
  self._comparison_relation = relation

@@ -255,8 +258,10 @@ class PlaceholderChecker(Instruction):
  self._num_placeholders = num_placeholders
  if self._num_placeholders is None or self._num_placeholders < 0:
  self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
- self._description_pattern = ('The response must contain at least {num_placeholders} placeholders '
- + 'represented by square brackets, such as [address].')
+ self._description_pattern = (
+ 'The response must contain at least {num_placeholders} placeholders '
+ + 'represented by square brackets, such as [address].'
+ )
  return self._description_pattern.format(num_placeholders=self._num_placeholders)

  def get_instruction_args(self):
@@ -298,9 +303,10 @@ class BulletListChecker(Instruction):
  self._num_bullets = num_bullets
  if self._num_bullets is None or self._num_bullets < 0:
  self._num_bullets = random.randint(1, _NUM_BULLETS)
- self._description_pattern = ('Your answer must contain exactly {num_bullets} bullet points. '
- + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n'
- + '* This is point 2')
+ self._description_pattern = (
+ 'Your answer must contain exactly {num_bullets} bullet points. '
+ + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+ )
  return self._description_pattern.format(num_bullets=self._num_bullets)

  def get_instruction_args(self):
@@ -379,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
  self._starter = starter.strip() if isinstance(starter, str) else starter
  if self._starter is None:
  self._starter = random.choice(_STARTER_OPTIONS)
- self._description_pattern = ('During the conversation, when it is your turn, '
- + 'please always start with {starter}')
+ self._description_pattern = (
+ 'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+ )
  return self._description_pattern.format(starter=self._starter)

  def get_instruction_args(self):
@@ -423,8 +430,10 @@ class HighlightSectionChecker(Instruction):
  if self._num_highlights is None or self._num_highlights < 0:
  self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)

- self._description_pattern = ('Highlight at least {num_highlights} sections in your answer with '
- + 'markdown, i.e. *highlighted section*.')
+ self._description_pattern = (
+ 'Highlight at least {num_highlights} sections in your answer with '
+ + 'markdown, i.e. *highlighted section*.'
+ )

  return self._description_pattern.format(num_highlights=self._num_highlights)

@@ -482,9 +491,11 @@ class SectionChecker(Instruction):
  if self._num_sections is None or self._num_sections < 0:
  self._num_sections = random.randint(1, _NUM_SECTIONS)

- self._description_pattern = ('Your response must have {num_sections} sections. Mark the beginning '
- + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
- + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]')
+ self._description_pattern = (
+ 'Your response must have {num_sections} sections. Mark the beginning '
+ + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+ + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+ )

  return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)

@@ -534,8 +545,9 @@ class ParagraphChecker(Instruction):
  if self._num_paragraphs is None or self._num_paragraphs < 0:
  self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)

- self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
- + 'Paragraphs are separated with the markdown divider: ***')
+ self._description_pattern = (
+ 'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+ )

  return self._description_pattern.format(num_paragraphs=self._num_paragraphs)

@@ -585,12 +597,14 @@ class PostscriptChecker(Instruction):
  A string representing the instruction description.
  """
  self._postscript_marker = (
- postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker)
+ postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+ )
  if self._postscript_marker is None:
  self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)

- self._description_pattern = ('At the end of your response, please explicitly add a postscript '
- + 'starting with {postscript}')
+ self._description_pattern = (
+ 'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+ )

  return self._description_pattern.format(postscript=self._postscript_marker)

@@ -644,8 +658,10 @@ class RephraseChecker(Instruction):
  'in the form of *change me*.')

  self._reference_without_change = original_message
- self._description = ('Rephrasing: Your rephrased response should only'
- + 'change the words/sentences in between two asterisks' + 'such as *change me*.')
+ self._description = (
+ 'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+ + 'such as *change me*.'
+ )
  return self._description

  def get_instruction_args(self):
@@ -757,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
  if relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
+ )
  else:
  self._comparison_relation = relation

- self._description_pattern = ('In your response, the word {keyword} should appear {relation} '
- + '{frequency} times.')
+ self._description_pattern = (
+ 'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+ )

  return self._description_pattern.format(
  keyword=self._keyword,
@@ -819,8 +838,10 @@ class NumberOfWords(Instruction):
  if relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {relation} is given.'
+ )
  else:
  self._comparison_relation = relation

@@ -850,8 +871,10 @@ class JsonFormat(Instruction):
  """Check the Json format."""

  def build_description(self):
- self._description_pattern = ('Entire output should be wrapped in JSON format. You can use markdown'
- ' ticks such as ```.')
+ self._description_pattern = (
+ 'Entire output should be wrapped in JSON format. You can use markdown'
+ ' ticks such as ```.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -864,8 +887,9 @@ class JsonFormat(Instruction):

  def check_following(self, value):
  value = (
- value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
- '```').removesuffix('```').strip())
+ value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+ removesuffix('```').strip()
+ )
  try:
  json.loads(value)
  except ValueError:
@@ -903,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
  self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
  self._first_word = self._first_word.lower()

- self._description_pattern = ('There should be {num_paragraphs} paragraphs. '
- + 'Paragraphs and only paragraphs are separated with each other by two '
- + "new lines as if it was '\\n\\n' in python. "
- + 'Paragraph {nth_paragraph} must start with word {first_word}.')
+ self._description_pattern = (
+ 'There should be {num_paragraphs} paragraphs. '
+ + 'Paragraphs and only paragraphs are separated with each other by two '
+ + "new lines as if it was '\\n\\n' in python. "
+ + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+ )

  return self._description_pattern.format(
  num_paragraphs=self._num_paragraphs,
@@ -1084,11 +1110,12 @@ class RephraseParagraph(Instruction):
  self._low = low
  self._high = high

- self._description = ('Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
- + 'between {low} and {high} of the same words. '
- + 'Words are the same if and only if all of the '
- + 'letters, ignoring cases, are the same. For '
- + "example, 'run' is the same as 'Run' but different " + "to 'ran'.")
+ self._description = (
+ 'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+ + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+ + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+ + "to 'ran'."
+ )

  return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)

@@ -1123,8 +1150,10 @@ class TwoResponsesChecker(Instruction):

  def build_description(self):
  """Build the instruction description."""
- self._description_pattern = ('Give two different responses. Responses and only responses should'
- ' be separated by 6 asterisk symbols: ******.')
+ self._description_pattern = (
+ 'Give two different responses. Responses and only responses should'
+ ' be separated by 6 asterisk symbols: ******.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1171,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
  raise ValueError('prompt_to_repeat must be set.')
  else:
  self._prompt_to_repeat = prompt_to_repeat
- self._description_pattern = ('First repeat the request word for word without change,'
- ' then give your answer (1. do not say any words or characters'
- ' before repeating the request; 2. the request you need to repeat'
- ' does not include this sentence)')
+ self._description_pattern = (
+ 'First repeat the request word for word without change,'
+ ' then give your answer (1. do not say any words or characters'
+ ' before repeating the request; 2. the request you need to repeat'
+ ' does not include this sentence)'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1205,8 +1236,10 @@ class EndChecker(Instruction):
  self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
  if self._end_phrase is None:
  self._end_phrase = random.choice(_ENDING_OPTIONS)
- self._description_pattern = ('Finish your response with this exact phrase {ender}. '
- 'No other words should follow this phrase.')
+ self._description_pattern = (
+ 'Finish your response with this exact phrase {ender}. '
+ 'No other words should follow this phrase.'
+ )
  return self._description_pattern.format(ender=self._end_phrase)

  def get_instruction_args(self):
@@ -1228,8 +1261,10 @@ class TitleChecker(Instruction):

  def build_description(self):
  """Build the instruction description."""
- self._description_pattern = ('Your answer must contain a title, wrapped in double angular brackets,'
- ' such as <<poem of joy>>.')
+ self._description_pattern = (
+ 'Your answer must contain a title, wrapped in double angular brackets,'
+ ' such as <<poem of joy>>.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1283,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
  if let_relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif let_relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {let_relation} is given.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+ )
  else:
  self._comparison_relation = let_relation

- self._description_pattern = ('In your response, the letter {letter} should appear {let_relation}'
- ' {let_frequency} times.')
+ self._description_pattern = (
+ 'In your response, the letter {letter} should appear {let_relation}'
+ ' {let_frequency} times.'
+ )

  return self._description_pattern.format(
  letter=self._letter,
@@ -1352,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):

  def build_description(self):
  """Build the instruction description."""
- self._description_pattern = ('Your entire response should be in English, and in all lowercase'
- ' letters. No capital letters are allowed.')
+ self._description_pattern = (
+ 'Your entire response should be in English, and in all lowercase'
+ ' letters. No capital letters are allowed.'
+ )
  return self._description_pattern

  def get_instruction_args(self):
@@ -1422,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
  if capital_relation is None:
  self._comparison_relation = random.choice(_COMPARISON_RELATION)
  elif capital_relation not in _COMPARISON_RELATION:
- raise ValueError('The supported relation for comparison must be in '
- f'{_COMPARISON_RELATION}, but {capital_relation} is given.')
-
- self._description_pattern = ('In your response, words with all capital letters should appear'
- ' {relation} {frequency} times.')
+ raise ValueError(
+ 'The supported relation for comparison must be in '
+ f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+ )
+
+ self._description_pattern = (
+ 'In your response, words with all capital letters should appear'
+ ' {relation} {frequency} times.'
+ )

  return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)