evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (214)
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
tests/cli/test_all.py CHANGED
@@ -4,13 +4,12 @@ from dotenv import dotenv_values
 env = dotenv_values('.env')
 
 import os
-import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
 from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import is_module_installed, test_level_list
+from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
 os.environ['LOG_LEVEL'] = 'DEBUG'
@@ -46,6 +45,7 @@ datasets=[
     'chinese_simpleqa',
     'alpaca_eval',
     'arena_hard',
+    'maritime_bench',
 ]
 
 dataset_args={
@@ -134,8 +134,8 @@ class TestRun(unittest.TestCase):
             eval_type=EvalType.SERVICE,
             datasets=datasets,
             dataset_args=dataset_args,
-            eval_batch_size=2,
-            limit=2,
+            eval_batch_size=1,
+            limit=1,
             stream=True,
             generation_config={
                 'temperature': 0,
tests/cli/test_collection.py CHANGED
@@ -80,4 +80,5 @@ class TestCollection(unittest.TestCase):
                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
             }
         )
-        run_task(task_cfg=task_cfg)
+        res = run_task(task_cfg=task_cfg)
+        print(res)
tests/cli/test_run.py CHANGED
@@ -137,7 +137,7 @@ class TestRun(unittest.TestCase):
                 'subset_list': ['gsm8k'],
             },
             'musr': {
-                'subset_list': ['murder_mysteries']
+                'subset_list': ['murder_mysteries'],
             },
             'general_mcq': {
                 'local_path': 'custom_eval/text/mcq', # custom dataset path
@@ -209,17 +209,23 @@ class TestRun(unittest.TestCase):
         task_cfg = TaskConfig(
             model='Qwen/Qwen2.5-0.5B-Instruct',
             datasets=[
-                'iquiz',
+                # 'iquiz',
                 # 'math_500',
                 # 'aime24',
-                # 'competition_math'
+                # 'competition_math',
+                'mmlu',
             ],
             dataset_args={
                 'competition_math': {
                     'subset_list': ['Level 4', 'Level 5']
-                }
+                },
+                'mmlu': {
+                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+                    'few_shot_num': 0
+                },
             },
-            limit=5
+            limit=10,
+            eval_batch_size=10,
         )
 
         run_task(task_cfg=task_cfg)
@@ -263,7 +269,7 @@ class TestRun(unittest.TestCase):
            datasets=[
                # 'iquiz',
                # 'ifeval',
-               # 'mmlu',
+               'mmlu',
                # 'mmlu_pro',
                # 'musr',
                # 'process_bench',
@@ -281,9 +287,10 @@ class TestRun(unittest.TestCase):
                # 'ceval',
                # 'hellaswag',
                # 'general_mcq',
-               'general_qa'
+               # 'general_qa'
                # 'super_gpqa',
-               # 'mmlu_redux'
+               # 'mmlu_redux',
+               # 'maritime_bench'
            ],
            dataset_args={
                'mmlu': {
@@ -322,7 +329,8 @@ class TestRun(unittest.TestCase):
                    'subset_list': ['gsm8k'],
                },
                'musr': {
-                   'subset_list': ['murder_mysteries']
+                   'subset_list': ['murder_mysteries'],
+                   'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
                },
                'general_mcq': {
                    'local_path': 'custom_eval/text/mcq', # custom dataset path
@@ -353,10 +361,9 @@ class TestRun(unittest.TestCase):
            stream=False,
            generation_config={
                'temperature': 0,
-               'n': 2,
+               'n': 1,
                'max_tokens': 4096,
-           },
-           use_cache='outputs/20250326_202848',
+           }
        )
 
        run_task(task_cfg=task_cfg)
tests/perf/test_perf.py CHANGED
@@ -104,7 +104,7 @@ class TestPerf(unittest.TestCase):
         task_cfg = Arguments(
             parallel=20,
             model='Qwen2.5-0.5B-Instruct',
-            url='http://127.0.0.1:8801/v1/chat/completions',
+            url='http://127.0.0.1:8801/v1/completions',
             api='openai',
             dataset='random',
             min_tokens=1024,
@@ -112,10 +112,10 @@ class TestPerf(unittest.TestCase):
             prefix_length=0,
             min_prompt_length=1024,
             max_prompt_length=1024,
-            number=40,
+            number=20,
             tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
             seed=None,
-            debug= True,
+            extra_args={'ignore_eos': True}
         )
         run_perf_benchmark(task_cfg)
 
tests/rag/test_clip_benchmark.py CHANGED
@@ -45,7 +45,6 @@ class TestCLIPBenchmark(unittest.TestCase):
                 'num_workers': 1,
                 'verbose': True,
                 'skip_existing': False,
-                'output_dir': 'outputs',
                 'cache_dir': 'cache',
                 'limit': 1000,
             },
tests/rag/test_mteb.py CHANGED
@@ -1,8 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import subprocess
 import unittest
+from dotenv import dotenv_values
 
+env = dotenv_values('.env')
 from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
@@ -45,14 +46,13 @@ class TestMTEB(unittest.TestCase):
            ],
            'eval': {
                'tasks': [
-                   'TNews',
-                   'CLSClusteringS2S',
+                   # 'TNews',
+                   # 'CLSClusteringS2S',
                    'T2Reranking',
-                   'T2Retrieval',
-                   'ATEC',
+                   # 'T2Retrieval',
+                   # 'ATEC',
                ],
                'verbosity': 2,
-               'output_folder': 'outputs',
                'overwrite_results': True,
                'limits': 500,
            },
@@ -61,6 +61,37 @@ class TestMTEB(unittest.TestCase):
 
        run_task(task_cfg)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_one_stage_api(self):
+        from evalscope import TaskConfig
+        task_cfg = TaskConfig(
+            eval_backend='RAGEval',
+            eval_config={
+                'tool': 'MTEB',
+                'model': [
+                    {
+                        'model_name': 'text-embedding-v3',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
+                    }
+                ],
+                'eval': {
+                    'tasks': [
+                        'T2Retrieval',
+                    ],
+                    'verbosity': 2,
+                    'overwrite_results': True,
+                    'limits': 30,
+                },
+            },
+        )
+
+        run_task(task_cfg)
+
    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_two_stage_mteb(self):
        task_cfg = {
@@ -92,7 +123,6 @@ class TestMTEB(unittest.TestCase):
            'eval': {
                'tasks': ['MedicalRetrieval', 'T2Retrieval'],
                'verbosity': 2,
-               'output_folder': 'outputs',
                'overwrite_results': True,
                # 'limits': 10,
                'top_k': 10,
@@ -124,7 +154,6 @@ class TestMTEB(unittest.TestCase):
                'tasks': ['CustomRetrieval'],
                'dataset_path': 'custom_eval/text/retrieval',
                'verbosity': 2,
-               'output_folder': 'outputs',
                'overwrite_results': True,
                'limits': 500,
            },
tests/rag/test_ragas.py CHANGED
@@ -5,7 +5,7 @@ from dotenv import dotenv_values
 env = dotenv_values('.env')
 import unittest
 
-from evalscope.run import run_task
+from evalscope import TaskConfig, run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
 
@@ -37,13 +37,10 @@ class TestRAGAS(unittest.TestCase):
                'docs': ['README_zh.md'],
                'test_size': 5,
                'output_file': 'outputs/testset.json',
-               'distribution': {
-                   'simple': 0.5,
-                   'multi_context': 0.4,
-                   'reasoning': 0.1,
-               },
                'generator_llm': {
-                   'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+                   'model_name': 'qwen-plus', # custom chat model name
+                   'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1', # custom base URL
+                   'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'), # custom API key
                },
                'embeddings': {
                    'model_name_or_path': 'AI-ModelScope/m3e-base',
@@ -87,32 +84,38 @@ class TestRAGAS(unittest.TestCase):
 
    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
    def test_run_rag_eval_api(self):
-        task_cfg = {
-            'eval_backend': 'RAGEval',
-            'eval_config': {
-                'tool': 'RAGAS',
-                'eval': {
-                    'testset_file':
-                    'outputs/testset.json',
-                    'critic_llm': {
-                        'model_name': 'gpt-4o-mini', # custom chat model name
-                        'api_base': 'http://127.0.0.1:8088/v1', # custom base URL
-                        'api_key': 'xxxx', # your API key
+        from evalscope.backend.rag_eval.ragas.arguments import EvaluationArguments
+        task_cfg = TaskConfig(
+            eval_backend='RAGEval',
+            eval_config=dict(
+                tool='RAGAS',
+                eval=EvaluationArguments(
+                    testset_file='outputs/testset_chinese_with_answer_small.json',
+                    critic_llm={
+                        'model_name': 'qwen-plus', # custom chat model name
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1', # custom base URL
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'), # custom API key
                    },
-                    'embeddings': {
-                        'model_name_or_path': 'AI-ModelScope/m3e-base',
+                    embeddings={
+                        'model_name': 'text-embedding-v1',
+                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                        'api_key': env.get('DASHSCOPE_API_KEY', 'EMPTY'),
+                        'dimensions': 1024,
+                        'encode_kwargs': {
+                            'batch_size': 10,
+                        },
                    },
-                    'metrics': [
+                    metrics=[
                        'Faithfulness',
                        'AnswerRelevancy',
                        'ContextPrecision',
                        'AnswerCorrectness',
-                        'MultiModalFaithfulness',
-                        'MultiModalRelevance',
+                        # 'MultiModalFaithfulness',
+                        # 'MultiModalRelevance',
                    ],
-                },
-            },
-        }
+                ),
+            ),
+        )
 
        logger.info(f'>> Start to run task: {task_cfg}')
 
tests/vlm/test_vlmeval.py CHANGED
@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
 
-import subprocess
+env = dotenv_values('.env')
 import unittest
 
 from evalscope.run import run_task
@@ -56,5 +57,40 @@ class TestVLMEval(unittest.TestCase):
        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
 
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_vlm_api(self):
+        task_cfg = {
+            'eval_backend': 'VLMEvalKit',
+            'eval_config': {
+                'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
+                'limit': 5,
+                'mode': 'all',
+                'model': [
+                    {'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+                     'key': env.get('DASHSCOPE_API_KEY'),
+                     'name': 'CustomAPIModel',
+                     'temperature': 0.0,
+                     'type': 'qwen2.5-vl-7b-instruct',
+                     'img_size': -1,
+                     'video_llm': False,
+                     'max_tokens': 512,}
+                ],
+                'nproc': 5,
+                'reuse': False,
+            },
+            'work_dir': 'outputs',
+            # 'use_cache': 'outputs/20241216_142838'
+        }
+
+        logger.info(f'>> Start to run task: {task_cfg}')
+
+        run_task(task_cfg)
+
+        logger.info('>> Start to get the report with summarizer ...')
+        report_list = Summarizer.get_report_from_cfg(task_cfg)
+        logger.info(f'\n>>The report list: {report_list}')
+
+        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
+
 if __name__ == '__main__':
     unittest.main(buffer=False)
evalscope/backend/vlm_eval_kit/custom_dataset.py DELETED
@@ -1,46 +0,0 @@
-import numpy as np
-import os
-from vlmeval.dataset.image_base import ImageBaseDataset
-from vlmeval.dataset.image_vqa import CustomVQADataset
-from vlmeval.smp import d2df, dump, load
-
-
-class CustomDataset:
-
-    def load_data(self, dataset):
-        # customize the loading of the dataset
-        data_path = os.path.join(os.path.expanduser('~/LMUData'), f'{dataset}.tsv')
-        return load(data_path)
-
-    def build_prompt(self, line):
-        msgs = ImageBaseDataset.build_prompt(self, line)
-        # add a hint or custom instruction here
-        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
-        return msgs
-
-    def evaluate(self, eval_file, **judge_kwargs):
-        data = load(eval_file)
-        assert 'answer' in data and 'prediction' in data
-        data['prediction'] = [str(x) for x in data['prediction']]
-        data['answer'] = [str(x).lower() for x in data['answer']]
-
-        print(data)
-
-        # ========compute the evaluation metrics as you need =========
-        # exact match
-        result = np.mean(data['answer'] == data['prediction'])
-        ret = {'Overall': result}
-        ret = d2df(ret).round(2)
-
-        # save the result
-        suffix = eval_file.split('.')[-1]
-        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
-        dump(ret, result_file)
-        return ret
-        # ============================================================
-
-
-# override the default dataset class
-CustomVQADataset.load_data = CustomDataset.load_data
-CustomVQADataset.build_prompt = CustomDataset.build_prompt
-CustomVQADataset.evaluate = CustomDataset.evaluate
evalscope/benchmarks/live_code_bench/execute_utils.py DELETED
@@ -1,267 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the
-# current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This code is adapted from OpenAI's release
-# https://github.com/openai/human-eval/blob/master/human_eval/execution.py
-
-import contextlib
-import faulthandler
-import io
-import multiprocessing
-import os
-import platform
-import signal
-import tempfile
-
-BASE_IMPORTS = """from itertools import accumulate, chain, combinations, count, permutations, product, groupby, islice, repeat
-from copy import deepcopy
-from string import ascii_lowercase
-from math import floor, log2, log10, sqrt, comb, gcd, ceil, inf, isqrt
-from collections import defaultdict, deque, Counter
-from bisect import bisect, bisect_left, bisect_right, insort
-from heapq import heappush, heappop, heapify, merge
-from functools import reduce, cache, lru_cache
-from random import randrange, shuffle
-from operator import itemgetter, sub
-from re import search as re_search # Assuming 're' refers to a regex search
-from os.path import commonprefix
-from typing import List, Tuple, Dict, Set, Optional, Union, Any, Callable, Iterable, Iterator, Generator
-import copy
-import string
-import math
-import collections
-import bisect
-import heapq
-import functools
-import random
-import itertools
-import operator
-import re
-import numpy as np
-import pandas as pd
-from math import log, prod # 'log' and 'prod' are functions in the math module
-from collections import deque, defaultdict, Counter, OrderedDict
-from itertools import accumulate, permutations, combinations, product, groupby, islice, chain, repeat, zip_longest, cycle
-from functools import lru_cache, reduce, partial
-# from sortedcontainers import SortedList, SortedDict, SortedSet
-# import sortedcontainers
-from operator import iand
-import sys
-""" # noqa: E501
-
-
-def codeexecute_check_correctness(check_program, timeout=3):
-    """Evaluates the functional correctness of a completion by running the test
-    suite provided in the problem.
-
-    :param completion_id: an optional completion ID so we can match
-    the results later even if execution finishes asynchronously.
-    """
-    manager = multiprocessing.Manager()
-    result = manager.list()
-
-    p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
-    p.start()
-    p.join(timeout=timeout + 1)
-    if p.is_alive():
-        p.kill()
-
-    if not result:
-        result.append('timed out')
-
-    return result[0] == 'passed'
-
-
-def unsafe_execute(check_program, result, timeout):
-
-    with create_tempdir():
-
-        # These system calls are needed when cleaning up tempdir.
-        import os
-        import shutil
-
-        rmtree = shutil.rmtree
-        rmdir = os.rmdir
-        chdir = os.chdir
-
-        # Disable functionalities that can make destructive changes
-        # to the test.
-        reliability_guard()
-
-        # Run program.
-        try:
-            exec_globals = {}
-            with swallow_io():
-                with time_limit(timeout):
-                    exec(check_program, exec_globals)
-            result.append('passed')
-        except TimeoutException:
-            result.append('timed out')
-        except BaseException as e:
-            result.append(f'failed: {e}')
-
-        # Needed for cleaning up.
-        shutil.rmtree = rmtree
-        os.rmdir = rmdir
-        os.chdir = chdir
-
-
-@contextlib.contextmanager
-def time_limit(seconds):
-
-    def signal_handler(signum, frame):
-        raise TimeoutException('Timed out!')
-
-    signal.setitimer(signal.ITIMER_REAL, seconds)
-    signal.signal(signal.SIGALRM, signal_handler)
-    try:
-        yield
-    finally:
-        signal.setitimer(signal.ITIMER_REAL, 0)
-
-
-@contextlib.contextmanager
-def swallow_io():
-    stream = WriteOnlyStringIO()
-    with contextlib.redirect_stdout(stream):
-        with contextlib.redirect_stderr(stream):
-            with redirect_stdin(stream):
-                yield
-
-
-@contextlib.contextmanager
-def create_tempdir():
-    with tempfile.TemporaryDirectory() as dirname:
-        with chdir(dirname):
-            yield dirname
-
-
-class TimeoutException(Exception):
-    pass
-
-
-class WriteOnlyStringIO(io.StringIO):
-    """StringIO that throws an exception when it's read from."""
-
-    def read(self, *args, **kwargs):
-        raise OSError
-
-    def readline(self, *args, **kwargs):
-        raise OSError
-
-    def readlines(self, *args, **kwargs):
-        raise OSError
-
-    def readable(self, *args, **kwargs):
-        """Returns True if the IO object can be read."""
-        return False
-
-
-class redirect_stdin(contextlib._RedirectStream): # type: ignore
-    _stream = 'stdin'
-
-
-@contextlib.contextmanager
-def chdir(root):
-    if root == '.':
-        yield
-        return
-    cwd = os.getcwd()
-    os.chdir(root)
-    try:
-        yield
-    except BaseException as exc:
-        raise exc
-    finally:
-        os.chdir(cwd)
-
-
-def reliability_guard(maximum_memory_bytes=None):
-    """This disables various destructive functions and prevents the generated
-    code from interfering with the test (e.g. fork bomb, killing other
-    processes, removing filesystem files, etc.)
-
-    WARNING This function is NOT a security sandbox. Untrusted code, including,
-    model- generated code, should not be blindly executed outside of one. See
-    the Codex paper for more information about OpenAI's code sandbox, and
-    proceed with caution.
-    """
-
-    if maximum_memory_bytes is not None:
-        import resource
-
-        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
-        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
-        if not platform.uname().system == 'Darwin':
-            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
-
-    faulthandler.disable()
-
-    import builtins
-
-    builtins.exit = None
-    builtins.quit = None
-
-    import os
-
-    os.environ['OMP_NUM_THREADS'] = '1'
-
-    os.kill = None
-    os.system = None
-    os.putenv = None
-    os.remove = None
-    os.removedirs = None
-    os.rmdir = None
-    os.fchdir = None
-    os.setuid = None
-    os.fork = None
-    os.forkpty = None
-    os.killpg = None
-    os.rename = None
-    os.renames = None
-    os.truncate = None
-    os.replace = None
-    os.unlink = None
-    os.fchmod = None
-    os.fchown = None
-    os.chmod = None
-    os.chown = None
-    os.chroot = None
-    os.fchdir = None
-    os.lchflags = None
-    os.lchmod = None
-    os.lchown = None
-    os.getcwd = None
-    os.chdir = None
-
-    import shutil
-
-    shutil.rmtree = None
-    shutil.move = None
-    shutil.chown = None
-
-    import subprocess
-
-    subprocess.Popen = None # type: ignore
-
-    __builtins__['help'] = None
-
-    import sys
-
-    sys.modules['ipdb'] = None
-    sys.modules['joblib'] = None
-    sys.modules['resource'] = None
-    sys.modules['psutil'] = None
-    sys.modules['tkinter'] = None