evalscope-0.13.2-py3-none-any.whl → evalscope-0.15.0-py3-none-any.whl

This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic; see the registry's advisory page for details.

Files changed (214):
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import OutputType
-from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -11,12 +11,12 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
-    dataset_id='TIGER-Lab/AIME25',
-    subset_list=['default'],
+    dataset_id='opencompass/AIME2025',
+    subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
-    eval_split='train', # Only train set is available
+    eval_split='test', # Only train set is available
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class AIME25Adapter(DataAdapter):
@@ -3,8 +3,7 @@ from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -18,7 +18,7 @@ logger = get_logger()
     name='arc',
     pretty_name='ARC',
     dataset_id='modelscope/ai2_arc',
-    model_adapter=OutputType.MULTIPLE_CHOICE,
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['ARC-Easy', 'ARC-Challenge'],
     metric_list=['AverageAccuracy'],
@@ -134,7 +134,7 @@ class ARCAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(text=result)
+            return ResponseParser.parse_first_option(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
@@ -3,9 +3,7 @@ from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics.metrics import exact_match
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -127,7 +127,7 @@ SUBJECT_MAPPING = {
     name='ceval',
     pretty_name='C-Eval',
     dataset_id='modelscope/ceval-exam',
-    model_adapter=OutputType.MULTIPLE_CHOICE,
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
@@ -1,10 +1,8 @@
 import re
-from collections import defaultdict
 from typing import Any, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -104,7 +104,7 @@ SUBJECT_MAPPING = {
     name='cmmlu',
     pretty_name='C-MMLU',
     dataset_id='modelscope/cmmlu',
-    model_adapter=OutputType.MULTIPLE_CHOICE,
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
@@ -6,8 +6,7 @@ import os
 from collections import defaultdict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import AnswerKeys
-from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -3,12 +3,11 @@ import os.path
 import random
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics.llm_judge import LLMJudge
-from evalscope.metrics.named_metrics import metric_registry
+from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
 
@@ -24,6 +23,7 @@ class DataAdapter(ABC):
                  subset_list: list,
                  metric_list: List[str],
                  llm_as_a_judge: bool = False,
+                 output_types: Optional[List[str]] = None,
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
@@ -63,6 +63,7 @@
         self.query_template = query_template
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
+        self.output_types = output_types or [model_adapter]
         self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
@@ -190,7 +191,7 @@
         if self.few_shot_num and self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
 
-        logger.info(f'Use default settings: '
+        logger.info(f'Use settings: '
                     f'> few_shot_num: {self.few_shot_num}, '
                     f'> few_shot_split: {self.train_split}, '
                     f'> target_eval_split: {self.eval_split}')
@@ -245,7 +246,8 @@
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list
 
-    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+                            **kwargs) -> Dict[str, List[float]]:
         """
         compute weighted mean of the bleu score of all samples
 
@@ -253,7 +255,7 @@
             review_res_list: [score1, score2, ...]
 
         Returns:
-            avg_res: List[dict]
+            avg_res: Dict[str, List[float]]
 
         """
         if isinstance(review_res_list[0], list):
@@ -314,11 +316,20 @@
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
-    def gen_prompt_data(self, prompt: str, system_prompt: Optional[str] = None, **kwargs) -> dict:
-        if not isinstance(prompt, list):
-            prompt = [prompt]
+    def gen_prompt_data(self,
+                        prompt: str,
+                        system_prompt: Optional[str] = None,
+                        choices: Optional[List[str]] = None,
+                        index: Optional[Union[int, str]] = None,
+                        id: Optional[Union[int, str]] = None,
+                        **kwargs) -> dict:
+        data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
-            data=prompt, multi_choices=self.choices, system_prompt=system_prompt or self.system_prompt)
+            data=data,
+            multi_choices=choices or self.choices,
+            system_prompt=system_prompt or self.system_prompt,
+            index=index or 0,
+            id=id)
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
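
For context on the gen_prompt_data hunk above: the widened signature lets an adapter pass per-sample choices and identifiers instead of relying only on the class-level defaults. A minimal sketch of calling the new signature; the input_d keys ('question', 'choices', 'index', 'id') are illustrative assumptions, not taken from this diff.

# Hypothetical helper: 'adapter' is an already-constructed DataAdapter subclass instance.
def build_prompt(adapter, input_d: dict) -> dict:
    query = input_d.get('question', '')
    return adapter.gen_prompt_data(
        prompt=query,
        choices=input_d.get('choices'),  # overrides adapter.choices when present
        index=input_d.get('index'),      # gen_prompt_data defaults this to 0
        id=input_d.get('id'),            # carried through to PromptData
    )
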
@@ -48,14 +48,16 @@ class DataCollectionAdapter(DataAdapter):
             if len(dataset) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-            from modelscope.msdatasets import MsDataset
+            from modelscope import dataset_snapshot_download
 
             # Load dataset from remote
             logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')
 
-            dataset = MsDataset.load(dataset_name=dataset_name_or_path, cache_dir=work_dir, hub=datasets_hub, **kwargs)
-
-            dataset = dataset[self.eval_split].to_list()
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
+            # find the jsonl file
+            dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
+            dataset = jsonl_to_list(dataset_files[0])
 
         return dataset
 
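
The DataCollectionAdapter hunk above swaps MsDataset.load for a plain snapshot download followed by a local JSONL scan. A self-contained sketch of that loading pattern, reusing the same dataset_snapshot_download call as the hunk; the function name and the inline JSONL parsing (standing in for evalscope's jsonl_to_list helper) are illustrative.

import json
import os

from modelscope import dataset_snapshot_download


def load_jsonl_collection(dataset_id: str, work_dir: str) -> list:
    # Download only the *.jsonl files of the dataset snapshot (same arguments as in the hunk).
    dataset_path = dataset_snapshot_download(dataset_id, cache_dir=work_dir, allow_file_pattern='*.jsonl')
    files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
    # Inline equivalent of jsonl_to_list: one JSON object per non-empty line.
    with open(files[0], encoding='utf-8') as fin:
        return [json.loads(line) for line in fin if line.strip()]
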
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics.metrics import exact_match
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -17,7 +17,7 @@ logger = get_logger()
     name='general_mcq',
     pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=OutputType.MULTIPLE_CHOICE,
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
             for subset_name in subset_list:
                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
         elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
             data_file_dict[cur_subset_name] = dataset_name_or_path
         else:
             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
@@ -2,7 +2,6 @@ import json
 import multiprocessing
 import numpy as np
 from collections import defaultdict
-from concurrent.futures import ProcessPoolExecutor, as_completed
 
 from evalscope.utils.logger import get_logger
 from .pass_k_utils import compute_metrics_from_results
@@ -31,7 +30,10 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         args=(sample, generation, debug, result, metadata_list, timeout),
     )
     p.start()
-    p.join(timeout=(timeout + 1) * len(json.loads(sample['input_output'])['inputs']) + 5)
+    global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
+    if debug:
+        logger.info(f'global timeout = {global_timeout}')
+    p.join(timeout=global_timeout)
     if p.is_alive():
         p.kill()
     if not result:
@@ -39,7 +41,7 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         # consider that all tests failed
         result = [[-1 for i in range(len(in_outs['inputs']))]]
     if debug:
-        logger.info('global timeout')
+        logger.info('global timeout occured: alarm went off')
     return result[0], metadata_list[0]
 
 
@@ -99,7 +101,7 @@ def evaluate_generations(
     samples_list: list,
     generations_list: list[list[str]],
     debug: bool = False,
-    num_process_evaluate: int = 16,
+    num_process_evaluate: int = 16, # This parameter will be unused
     timeout=6,
 ):
     """We take the list of code generations and try to compile them and the run
@@ -117,26 +119,19 @@
     [-2] = compile error, [-1] = runtime error [False] = failed test
     case [True] = passed test case
     """
+    results = {}
+    metadata = {}
 
-    # generations are code generations in the same order of the dataset
-
-    inputs = [[(generations_list[index], samples_list[index], debug, timeout), index]
-              for index in range(len(generations_list))]
-
-    with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
-        futures = {
-            executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
-            for (problem_generations, sample, debug, timeout), index in inputs
-        }
+    for index in range(len(generations_list)):
+        problem_generations = generations_list[index]
+        sample = samples_list[index]
 
-        results = {}
-        metadata = {}
-        for future in as_completed(futures):
-            index = futures[future]
-            results[index], metadata[index] = future.result()
+        result, meta = evaluate_generations_by_problem(problem_generations, sample, debug, timeout)
+        results[index] = result
+        metadata[index] = meta
 
-    assert len(results) == len(inputs), f'results = {len(results)} inputs = {len(inputs)} {results=}'
-    # results = {i: r for r, (_, i) in zip(results, inputs)}
+    assert len(results) == len(
+        generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'
 
     return results, metadata
 
@@ -18,8 +18,8 @@ logger = get_logger()
     extra_params={
         'start_date': None,
         'end_date': None,
-        'num_process_evaluate': 1,
-        'timeout': 6
+        'timeout': 6,
+        'debug': False
     },
     system_prompt=
     'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.', # noqa: E501
@@ -33,8 +33,8 @@ class LiveCodeBenchAdapter(DataAdapter):
 
         extra_params = kwargs.get('extra_params', {})
 
-        self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
         self.timeout = extra_params.get('timeout', 6)
+        self.debug = extra_params.get('debug', False)
         self.start_date = extra_params.get('start_date')
         self.end_date = extra_params.get('end_date')
 
@@ -84,7 +84,8 @@
             references,
             predictions,
             k_list=[1],
-            num_process_evaluate=self.num_process_evaluate,
+            num_process_evaluate=1,
             timeout=self.timeout,
+            debug=self.debug,
         )
         return metrics['pass@1'] / 100 # convert to point scale
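
The three LiveCodeBench hunks above remove the num_process_evaluate knob (evaluation now always runs single-process) and surface debug and timeout through extra_params. A hedged sketch of passing these options at run time, assuming evalscope's TaskConfig/run_task entry points and the dataset_args convention; the model identifier is a placeholder, and the layout should be checked against the documented interface.

# Sketch only: TaskConfig/run_task and the dataset_args/extra_params layout are assumed
# from evalscope's public API; adjust if the documented interface differs.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-model',  # placeholder model identifier
    datasets=['live_code_bench'],
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'start_date': None,
                'end_date': None,
                'timeout': 6,    # per-test timeout, as registered above
                'debug': False,  # new flag surfaced in this release
            },
        },
    },
)

run_task(task_cfg)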