evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
- revision: default
+ revision: master
  precision: torch.float16
  device_map: auto
  # model_name_or_path: qwen/qwen-7b-chat

@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
  eval_type: custom
  datasets:
  - ceval
- outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null

@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
- revision: default
+ revision: master
  precision: torch.float16
  device_map: auto
  # model_name_or_path: qwen/qwen-7b-chat

@@ -23,5 +23,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null

@@ -24,5 +24,5 @@ datasets:
  outputs: ./outputs/eval_qwen-7b-chat_v100 # Directory to save the outputs, structure: logs, predictions, reviews, reports
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
+ dataset_hub: modelscope # `Local` or `ModelScope`
  limit: 10

@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
- revision: default
+ revision: master
  precision: torch.float16
  device_map: auto
  # model_name_or_path: qwen/qwen-7b-chat

@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
- revision: default
+ revision: master
  precision: torch.float16
  device_map: auto
  # model_name_or_path: qwen/qwen-7b-chat

@@ -25,5 +25,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
+ dataset_hub: modelscope # `Local` or `ModelScope`
  limit: null

@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
- revision: default
+ revision: master
  precision: torch.float16
  device_map: auto
  # model_name_or_path: qwen/qwen-7b-chat

@@ -25,5 +25,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: true
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null

@@ -1,5 +1,5 @@
  model_args: # model args should be followed by benchmark requirements
- revision: default
+ revision: master
  precision: torch.float16
  device_map: auto
  # model_name_or_path: qwen/qwen-7b-chat

@@ -23,5 +23,5 @@ datasets:
  outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
  use_cache: false
  stage: all
- dataset_hub: ModelScope # `Local` or `ModelScope`
- limit: null
+ dataset_hub: modelscope # `Local` or `ModelScope`
+ limit: null
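
The recurring change across the registry task YAMLs above is a pair of default-value updates: `revision: default` becomes `revision: master`, and `dataset_hub: ModelScope` becomes the lower-case `modelscope`. A minimal sketch of driving one of these updated registry configs through the refactored entry point follows; the YAML path is illustrative, and the path-loading behaviour is assumed from the `parse_task_config()` call visible in the run.py diff below.

    # Sketch only: assumes run_task() still accepts a path to a task YAML,
    # as suggested by the parse_task_config() call in the 0.8.1 run.py below.
    from evalscope.run import run_task

    # evalscope/registry/tasks/arc.yaml is one of the files touched above;
    # in 0.8.1 it defaults to revision: master and dataset_hub: modelscope.
    results = run_task('evalscope/registry/tasks/arc.yaml')
    print(results)
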
evalscope/run.py CHANGED
@@ -1,408 +1,180 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- # flake8: noqa
- import copy
- import json
- import argparse
+ """
+ Run evaluation for LLMs.
+ """
+ import logging
  import os.path
- from typing import Union, List
- import torch # noqa
-
- from evalscope.config import TaskConfig
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ import torch
+ from argparse import Namespace
+ from datetime import datetime
+ from typing import List, Optional, Union
+
+ from evalscope.arguments import parse_args
+ from evalscope.config import TaskConfig, parse_task_config
+ from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
  from evalscope.evaluator import Evaluator
- from evalscope.evaluator.evaluator import HumanevalEvaluator
  from evalscope.models.custom import CustomModel
- from evalscope.utils import import_module_util, yaml_to_dict, make_outputs_dir, gen_hash, json_to_dict, EvalBackend
- from evalscope.utils.logger import get_logger
+ from evalscope.utils import import_module_util, seed_everything
+ from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+ from evalscope.utils.logger import configure_logging, get_logger

  logger = get_logger()

- """
- Run evaluation for LLMs.
- """
-
  BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
  MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']


- def parse_args():
- parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
-
- parser.add_argument('--model',
- help='The model id on modelscope, or local model dir.',
- type=str,
- # required=True,
- required=False,
- )
- parser.add_argument('--model-type',
- help='Deprecated. See `--template-type`',
- type=str,
- required=False,
- default=None)
- parser.add_argument('--template-type',
- type=str,
- help='The template type for generation, should be a string.'
- 'Refer to `https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md` for more details.',
- required=False,
- )
- parser.add_argument('--eval-type',
- type=str,
- help='The type for evaluating. '
- 'service - for APIs, TO-DO'
- 'checkpoint - for models on ModelScope or local model dir, '
- 'custom - for custom models.'
- ' Need to set `--model` to evalscope.models.custom.CustomModel format.'
- 'default to `checkpoint`.',
- required=False,
- default='checkpoint',
- )
- parser.add_argument('--model-args',
- type=str,
- help='The model args, should be a string.',
- required=False,
- default='revision=None,precision=torch.float16,device_map=auto'
- )
- parser.add_argument('--generation-config',
- type=str,
- help='The generation config, should be a string.',
- required=False,
- default='do_sample=False,repetition_penalty=1.0,max_new_tokens=512',
- )
- parser.add_argument('--datasets',
- help='Dataset id list, align to the module name in evalscope.benchmarks',
- type=str,
- nargs='+',
- required=False,
- )
- parser.add_argument('--dataset-args',
- type=json.loads,
- help='The dataset args, should be a json string. The key of dict should be aligned to datasets,'
- 'e.g. {"humaneval": {"local_path": "/to/your/path"}}',
- required=False,
- default='{}')
- parser.add_argument('--dataset-dir',
- help='The datasets dir. Use to specify the local datasets or datasets cache dir.'
- 'See --dataset-hub for more details.',
- required=False,
- default=DEFAULT_ROOT_CACHE_DIR)
- parser.add_argument('--dataset-hub',
- help='The datasets hub, can be `ModelScope` or `HuggingFace` or `Local`. '
- 'Default to `ModelScope`.'
- 'If `Local`, the --dataset-dir should be local input data dir.'
- 'Otherwise, the --dataset-dir should be the cache dir for datasets.',
- required=False,
- default='ModelScope')
- parser.add_argument('--outputs',
- help='Outputs dir. Default to `outputs`, which means dump to current path: ./outputs',
- required=False,
- default='outputs')
- parser.add_argument('--work-dir',
- help='The root cache dir.',
- required=False,
- default=DEFAULT_ROOT_CACHE_DIR)
- parser.add_argument('--limit',
- type=int,
- help='Max evaluation samples num for each subset. Default to None, which means no limit.',
- default=None)
- parser.add_argument('--debug',
- help='Debug mode, will print information for debugging.',
- action='store_true',
- default=False)
- parser.add_argument('--dry-run',
- help='Dry run in single processing mode.',
- action='store_true',
- default=False)
- parser.add_argument('--mem-cache',
- help='To use memory cache or not.',
- action='store_true',
- default=False)
- parser.add_argument('--use-cache',
- help='To reuse the cache or not. Default to `true`.',
- type=str,
- default='false')
- parser.add_argument('--stage',
- help='The stage of evaluation pipeline, '
- 'can be `all`, `infer`, `review`. Default to `all`.',
- type=str,
- default='all')
-
- parser.add_argument('--eval-backend',
- help='The evaluation backend to use. Default to None.'
- 'can be `Native`, `OpenCompass` and `ThirdParty`. '
- 'Default to `Native`.',
- type=str,
- default=EvalBackend.NATIVE.value,
- required=False)
-
- parser.add_argument('--eval-config',
- help='The eval task config file path for evaluation backend, should be a yaml or json file.',
- type=str,
- default=None,
- required=False)
-
- args = parser.parse_args()
-
- return args
-
-
- def parse_str_args(str_args: str) -> dict:
- assert isinstance(str_args, str), 'args should be a string.'
- arg_list: list = str_args.strip().split(',')
- arg_list = [arg.strip() for arg in arg_list]
- arg_dict: dict = dict([arg.split('=') for arg in arg_list])
-
- final_args = dict()
- for k, v in arg_dict.items():
- try:
- final_args[k] = eval(v)
- except:
- if v.lower() == 'true':
- v = True
- if v.lower() == 'false':
- v = False
- final_args[k] = v
-
- return final_args
-
-
- def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[dict, List[dict]]:
+ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
+ """Run evaluation task(s) based on the provided configuration."""
+ run_time = datetime.now().strftime('%Y%m%d_%H%M%S')

+ # If task_cfg is a list, run each task individually
  if isinstance(task_cfg, list):
- eval_results = []
- for one_task_cfg in task_cfg:
- eval_results.append(run_task(one_task_cfg))
- return eval_results
-
- if isinstance(task_cfg, TaskConfig):
- task_cfg = task_cfg.to_dict()
- elif isinstance(task_cfg, str):
- if task_cfg.endswith('.yaml'):
- task_cfg = yaml_to_dict(task_cfg)
- elif task_cfg.endswith('.json'):
- task_cfg = json_to_dict(task_cfg)
- else:
- raise ValueError(f'Unsupported file format: {task_cfg}, should be a yaml or json file.')
- elif isinstance(task_cfg, dict):
- logger.info('** Args: Task config is provided with dictionary type. **')
- else:
- raise ValueError('** Args: Please provide a valid task config. **')
-
- # Check and run evaluation backend
- if task_cfg.get('eval_backend') is None:
- task_cfg['eval_backend'] = EvalBackend.NATIVE.value
-
- eval_backend = task_cfg.get('eval_backend')
- eval_config: Union[str, dict] = task_cfg.get('eval_config')
-
- if eval_backend != EvalBackend.NATIVE.value:
-
- if eval_config is None:
- logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
-
- if eval_backend == EvalBackend.OPEN_COMPASS.value:
- from evalscope.backend.opencompass import OpenCompassBackendManager
- oc_backend_manager = OpenCompassBackendManager(config=eval_config)
- oc_backend_manager.run()
- elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
- from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
- vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
- vlm_eval_kit_backend_manager.run()
- elif eval_backend == EvalBackend.RAG_EVAL.value:
- from evalscope.backend.rag_eval import RAGEvalBackendManager
- rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
- rag_eval_backend_manager.run()
- # TODO: Add other evaluation backends
- elif eval_backend == EvalBackend.THIRD_PARTY.value:
- raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
-
- return dict()
-
- # Get the output task config
- output_task_cfg = copy.copy(task_cfg)
- logger.info(output_task_cfg)
-
- model_args: dict = task_cfg.get('model_args',
- {'revision': 'default', 'precision': torch.float16, 'device_map': 'auto'})
- # Get the GLOBAL default config (infer_cfg) for prediction
- generation_config: dict = task_cfg.get('generation_config',
- {'do_sample': False,
- 'repetition_penalty': 1.0,
- 'max_length': 2048,
- 'max_new_tokens': 512,
- 'temperature': 0.3,
- 'top_k': 50,
- 'top_p': 0.8, }
- )
- dataset_args: dict = task_cfg.get('dataset_args', {})
- dry_run: bool = task_cfg.get('dry_run', False)
- model: Union[str, CustomModel] = task_cfg.get('model', None)
- model_type: str = task_cfg.get('model_type', None)
- template_type: str = task_cfg.get('template_type', None)
- eval_type: str = task_cfg.get('eval_type', 'checkpoint')
- datasets: list = task_cfg.get('datasets', None)
- work_dir: str = task_cfg.get('work_dir', DEFAULT_ROOT_CACHE_DIR)
- outputs: str = task_cfg.get('outputs', 'outputs')
- mem_cache: bool = task_cfg.get('mem_cache', False)
- use_cache: bool = task_cfg.get('use_cache', True)
- dataset_hub: str = task_cfg.get('dataset_hub', 'ModelScope')
- dataset_dir: str = task_cfg.get('dataset_dir', DEFAULT_ROOT_CACHE_DIR)
- stage: str = task_cfg.get('stage', 'all')
- limit: int = task_cfg.get('limit', None)
- debug: str = task_cfg.get('debug', False)
-
- if model is None or datasets is None:
- if not task_cfg.get('eval_backend'):
- raise ValueError('** Args: Please provide model and datasets. **')
-
- if model_type:
- logger.warning('** DeprecatedWarning: `--model-type` is deprecated, please use `--template-type` instead.')
-
- model_precision = model_args.get('precision', torch.float16)
- if isinstance(model_precision, str):
- model_precision = eval(model_precision)
-
- if mem_cache:
- logger.warning('** DeprecatedWarning: `--mem-cache` is deprecated, please use `--use-cache` instead.')
-
- logger.info(f'** Set use_cache to {use_cache}.')
-
- # Get model args
- if dry_run:
- from evalscope.models.dummy_chat_model import DummyChatModel
- model_id: str = 'dummy'
- model_revision: str = 'v1.0.0'
- elif eval_type == 'custom':
- model_id: str = None
- model_revision: str = None
- else:
- model_id: str = model
- model_revision: str = model_args.get('revision', 'default')
+ return [run_single_task(cfg, run_time) for cfg in task_cfg]
+
+ task_cfg = parse_task_config(task_cfg)
+ return run_single_task(task_cfg, run_time)
+

- # Get outputs directory
- if isinstance(model_id, str) and os.path.isdir(os.path.expanduser(model_id)):
- # get the output_model_id when model_id is a local model dir
- output_model_id: str = gen_hash(model_id)
+ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
+ """Run a single evaluation task."""
+ seed_everything(task_cfg.seed)
+ outputs = setup_work_directory(task_cfg, run_time)
+ configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
+
+ task_cfg.dump_yaml(outputs.configs_dir)
+ logger.info(task_cfg)
+
+ if task_cfg.eval_backend != EvalBackend.NATIVE:
+ return run_non_native_backend(task_cfg)
  else:
- output_model_id: str = model_id
- if outputs == 'outputs':
- outputs = make_outputs_dir(root_dir=os.path.join(work_dir, 'outputs'),
- datasets=datasets,
- model_id=output_model_id,
- model_revision=model_revision,)
-
- eval_results = dict()
- for dataset_name in datasets:
- # Get imported_modules
- imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-
- if dataset_name == 'humaneval' and dataset_args.get('humaneval', {}).get('local_path') is None:
- raise ValueError('Please specify the local problem path of humaneval dataset in --dataset-args,'
- 'e.g. {"humaneval": {"local_path": "/to/your/path"}}, '
- 'And refer to https://github.com/openai/human-eval/tree/master#installation to install it,'
- 'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
- if dry_run:
- from evalscope.models.dummy_chat_model import DummyChatModel
- model_adapter = DummyChatModel(model_cfg=dict())
- elif eval_type == 'custom':
- if not isinstance(model, CustomModel):
- raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(model)}.')
- from evalscope.models.model_adapter import CustomModelAdapter
- model_adapter = CustomModelAdapter(custom_model=model)
- else:
- # Init model adapter
- device_map = model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
- model_adapter = imported_modules['ModelAdapterClass'](model_id=model_id,
- model_revision=model_revision,
- device_map=device_map,
- torch_dtype=model_precision,
- cache_dir=work_dir,
- template_type=template_type)
-
- if dataset_name == 'humaneval':
- problem_file: str = dataset_args.get('humaneval', {}).get('local_path')
-
- evaluator = HumanevalEvaluator(problem_file=problem_file,
- model_id=model_id,
- model_revision=model_revision,
- model_adapter=model_adapter,
- outputs_dir=outputs,
- is_custom_outputs_dir=False, )
- else:
- # TODO: CHECK dataset_args
- dataset_name_or_path: str = dataset_args.get(dataset_name, {}).get('local_path') or imported_modules[
- 'DATASET_ID']
-
- in_prompt_template: str = dataset_args.get(dataset_name, {}).get('prompt_template', '')
-
- # Init data adapter
- few_shot_num: int = dataset_args.get(dataset_name, {}).get('few_shot_num', None)
- few_shot_random: bool = dataset_args.get(dataset_name, {}).get('few_shot_random', True)
- data_adapter = imported_modules['DataAdapterClass'](few_shot_num=few_shot_num,
- few_shot_random=few_shot_random,
- prompt_template=in_prompt_template,)
-
- in_subset_list: list = dataset_args.get(dataset_name, {})\
- .get('subset_list', imported_modules['SUBSET_LIST'])
- logger.info(f'\n** Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
-
- evaluator = Evaluator(
- dataset_name_or_path=dataset_name_or_path,
- subset_list=in_subset_list,
- data_adapter=data_adapter,
- model_adapter=model_adapter,
- use_cache=use_cache,
- root_cache_dir=work_dir,
- outputs_dir=outputs,
- is_custom_outputs_dir=outputs != 'outputs',
- datasets_dir=dataset_dir,
- datasets_hub=dataset_hub,
- stage=stage,
- eval_type=eval_type,
- overall_task_cfg=output_task_cfg,
- )
-
- infer_cfg = generation_config or {}
- infer_cfg.update(dict(limit=limit))
- res_dict: dict = evaluator.eval(infer_cfg=infer_cfg, debug=debug)
+ return evaluate_model(task_cfg, outputs)
+
+
+ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
+ """Set the working directory for the task."""
+ if task_cfg.use_cache:
+ task_cfg.work_dir = task_cfg.use_cache
+ logger.info(f'Set resume from {task_cfg.work_dir}')
+ elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+ task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

+ outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
+
+ if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
+ task_cfg.eval_config['time_str'] = run_time
+ elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
+ task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+ return outputs
+
+
+ def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+ """Run evaluation using a non-native backend."""
+ eval_backend = task_cfg.eval_backend
+ eval_config = task_cfg.eval_config
+
+ if eval_config is None:
+ logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
+
+ backend_manager_class = get_backend_manager_class(eval_backend)
+ backend_manager = backend_manager_class(config=eval_config)
+ backend_manager.run()
+
+ return dict()
+
+
+ def get_backend_manager_class(eval_backend: EvalBackend):
+ """Get the backend manager class based on the evaluation backend."""
+ if eval_backend == EvalBackend.OPEN_COMPASS:
+ from evalscope.backend.opencompass import OpenCompassBackendManager
+ return OpenCompassBackendManager
+ elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+ from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
+ return VLMEvalKitBackendManager
+ elif eval_backend == EvalBackend.RAG_EVAL:
+ from evalscope.backend.rag_eval import RAGEvalBackendManager
+ return RAGEvalBackendManager
+ elif eval_backend == EvalBackend.THIRD_PARTY:
+ raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
+
+
+ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
+ """Evaluate the model based on the provided task configuration."""
+ # Initialize evaluator
+ eval_results = {}
+
+ for dataset_name in task_cfg.datasets:
+ evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+ res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
  eval_results[dataset_name] = res_dict

  return eval_results


+ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+ """Create an evaluator object for the specified dataset."""
+ imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
+ model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
+
+ dataset_config = task_cfg.dataset_args.get(dataset_name, {})
+ dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
+ in_prompt_template = dataset_config.get('prompt_template', '')
+ few_shot_num = dataset_config.get('few_shot_num', None)
+ few_shot_random = dataset_config.get('few_shot_random', True)
+
+ data_adapter = imported_modules['DataAdapterClass'](
+ few_shot_num=few_shot_num,
+ few_shot_random=few_shot_random,
+ prompt_template=in_prompt_template,
+ outputs=outputs,
+ )
+ in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
+
+ logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+
+ return Evaluator(
+ dataset_name_or_path=dataset_name_or_path,
+ subset_list=in_subset_list,
+ data_adapter=data_adapter,
+ model_adapter=model_adapter,
+ use_cache=task_cfg.use_cache,
+ outputs=outputs,
+ datasets_dir=task_cfg.dataset_dir,
+ datasets_hub=task_cfg.dataset_hub,
+ stage=task_cfg.stage,
+ eval_type=task_cfg.eval_type,
+ overall_task_cfg=task_cfg,
+ )
+
+
+ def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
+ """Initialize the model adapter based on the task configuration."""
+ if task_cfg.dry_run:
+ from evalscope.models.dummy_chat_model import DummyChatModel
+ return DummyChatModel(model_cfg=dict())
+ elif task_cfg.eval_type == EvalType.CUSTOM:
+ if not isinstance(task_cfg.model, CustomModel):
+ raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
+ from evalscope.models.model_adapter import CustomModelAdapter
+ return CustomModelAdapter(custom_model=task_cfg.model)
+ else:
+ device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
+ model_precision = task_cfg.model_args.get('precision', torch.float16)
+ if isinstance(model_precision, str) and model_precision != 'auto':
+ model_precision = eval(model_precision)
+ return imported_modules['ModelAdapterClass'](
+ model_id=task_cfg.model,
+ model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
+ device_map=device_map,
+ torch_dtype=model_precision,
+ generation_config=task_cfg.generation_config,
+ chat_template=task_cfg.chat_template)
+
+
  def main():
  args = parse_args()
-
- # Get task_cfg
- use_cache: bool = False if args.use_cache.lower() == 'false' else True
- task_cfg = {
- 'model_args': parse_str_args(args.model_args),
- 'generation_config': parse_str_args(args.generation_config),
- 'dataset_args': args.dataset_args,
- 'dry_run': args.dry_run,
- 'model': args.model,
- 'template_type': args.template_type,
- 'eval_type': args.eval_type,
- 'datasets': args.datasets,
- 'work_dir': args.work_dir,
- 'outputs': args.outputs,
- 'mem_cache': args.mem_cache,
- 'use_cache': use_cache,
- 'dataset_hub': args.dataset_hub,
- 'dataset_dir': args.dataset_dir,
- 'stage': args.stage,
- 'limit': args.limit,
- 'debug': args.debug,
-
- 'eval_backend': args.eval_backend,
- 'eval_config': args.eval_config,
- }
-
- run_task(task_cfg)
+ run_task(args)


  if __name__ == '__main__':
- # Usage: python3 evalscope/run.py --model ZhipuAI/chatglm2-6b --datasets mmlu hellaswag --limit 10
- # Usage: python3 evalscope/run.py --model qwen/Qwen-1_8B --generation-config do_sample=false,temperature=0.0 --datasets ceval --dataset-args '{"ceval": {"few_shot_num": 0}}' --limit 10
  main()
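
Taken together, the rewrite replaces the monolithic run_task() body with parse_task_config() plus the run_single_task / evaluate_model / create_evaluator helpers, and main() now forwards the parsed arguments straight to run_task(). A hedged sketch of programmatic use under the new flow follows; the dict keys are assumed to mirror the TaskConfig fields visible in this diff, and the model id and limit are illustrative values taken from the removed usage comments.

    # Sketch only: key names are assumed to match the TaskConfig fields seen in
    # this diff; the values are illustrative.
    from evalscope.run import run_task

    task_cfg = {
        'model': 'qwen/Qwen-1_8B',    # ModelScope id or local model dir
        'datasets': ['ceval'],        # module names under evalscope.benchmarks
        'limit': 10,                  # cap evaluated samples per subset
    }

    # run_task() stamps a run_time, builds an OutputsStructure under work_dir,
    # then dispatches to run_single_task() and evaluate_model() per dataset.
    results = run_task(task_cfg)
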