evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py CHANGED
@@ -1,3 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from .version import __release_datetime__, __version__
+ from .version import __release_datetime__, __version__
evalscope/arguments.py ADDED
@@ -0,0 +1,73 @@
+ import argparse
+ import json
+
+
+ class ParseStrArgsAction(argparse.Action):
+
+     def __call__(self, parser, namespace, values, option_string=None):
+         assert isinstance(values, str), 'args should be a string.'
+
+         arg_dict = {}
+         for arg in values.strip().split(','):
+             key, value = map(str.strip, arg.split('=', 1))  # Use maxsplit=1 to handle multiple '='
+             try:
+                 # Safely evaluate the value using eval
+                 arg_dict[key] = eval(value)
+             except Exception:
+                 # If eval fails, check if it's a boolean value
+                 value_lower = value.lower()
+                 if value_lower == 'true':
+                     arg_dict[key] = True
+                 elif value_lower == 'false':
+                     arg_dict[key] = False
+                 else:
+                     # If not a boolean, keep the original string
+                     arg_dict[key] = value
+
+         setattr(namespace, self.dest, arg_dict)
+
+
+ def add_argument(parser: argparse.ArgumentParser):
+     # yapf: disable
+     # Model-related arguments
+     parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+     parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+
+     # Template-related arguments
+     parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
+     parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.')  # noqa: E501
+
+     # Dataset-related arguments
+     parser.add_argument('--datasets', type=str, nargs='+', required=False, help='Dataset id list, align to the module name in evalscope.benchmarks')  # noqa: E501
+     parser.add_argument('--dataset-args', type=json.loads, default='{}', help='The dataset args, should be a json string.')  # noqa: E501
+     parser.add_argument('--dataset-dir', help='The datasets dir.')
+     parser.add_argument('--dataset-hub', help='The datasets hub.')
+
+     # Generation configuration arguments
+     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
+
+     # Evaluation-related arguments
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
+     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
+     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+
+     # Cache and working directory arguments
+     parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
+     parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+     parser.add_argument('--work-dir', type=str, help='The root cache dir.')
+
+     # Debug and runtime mode arguments
+     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
+     parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
+     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+     # yapf: enable
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
+     add_argument(parser)
+
+     args = parser.parse_args()
+     return args
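A quick illustration (not part of the diff) of how the new ParseStrArgsAction parses a comma-separated key=value string: values that eval() can resolve become Python objects, 'true'/'false' become booleans, and anything else stays a string. The option and key names below are examples only.

import argparse

from evalscope.arguments import ParseStrArgsAction

parser = argparse.ArgumentParser()
parser.add_argument('--model-args', type=str, action=ParseStrArgsAction)

args = parser.parse_args(['--model-args', 'revision=master,temperature=0.7,do_sample=true'])
print(args.model_args)  # {'revision': 'master', 'temperature': 0.7, 'do_sample': True}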
evalscope/backend/base.py CHANGED
@@ -1,11 +1,13 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  from typing import Union

+ from evalscope.config import TaskConfig
  from evalscope.utils import yaml_to_dict


  class BackendManager:
-     def __init__(self, config: Union[str, dict], **kwargs):
+
+     def __init__(self, config: Union[str, dict, TaskConfig], **kwargs):
          """
          BackendManager is the base class for the evaluation backend manager.
          It provides the basic configuration parsing, command generation, task submission, and result fetching.
@@ -15,6 +17,8 @@ class BackendManager:
          """
          if isinstance(config, str):
              self.config_d = yaml_to_dict(config)
+         elif isinstance(config, TaskConfig):
+             self.config_d = config.eval_config
          else:
              self.config_d = config
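For orientation (not part of the diff): after this change BackendManager accepts a YAML file path, a plain dict, or a TaskConfig whose eval_config field carries the backend dict. A minimal sketch, assuming TaskConfig can be built with eval_config as a keyword argument and using placeholder config values:

from evalscope.backend.base import BackendManager
from evalscope.config import TaskConfig

backend_cfg = {'datasets': ['mmlu'], 'limit': 5}  # placeholder keys, for illustration only

BackendManager(config=backend_cfg)                           # dict is used as-is
BackendManager(config='backend_config.yaml')                 # hypothetical path, parsed via yaml_to_dict
BackendManager(config=TaskConfig(eval_config=backend_cfg))   # assumed constructor; .eval_config is unwrapped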
 
evalscope/backend/opencompass/api_meta_template.py CHANGED
@@ -1,6 +1,6 @@
+ # isort: skip_file
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from typing import Dict, Any, List
-
+ from typing import Any, Dict, List
  """
  The API meta template for OpenCompass.

@@ -26,18 +26,16 @@ class MetaTemplateType:
  TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}


- def register_template(name: str,
-                       template: Dict[str, Any],
-                       exists_ok: bool = False):
+ def register_template(name: str, template: Dict[str, Any], exists_ok: bool = False):
      if not exists_ok and name in TEMPLATE_MAPPING:
-         raise ValueError(f"The `{name}` has already been registered in the TEMPLATE_MAPPING.")
+         raise ValueError(f'The `{name}` has already been registered in the TEMPLATE_MAPPING.')

      TEMPLATE_MAPPING[name] = template


  def get_template(name: str) -> Dict[str, Any]:
      if name not in TEMPLATE_MAPPING:
-         raise ValueError(f"The `{name}` has not been registered in the TEMPLATE_MAPPING.")
+         raise ValueError(f'The `{name}` has not been registered in the TEMPLATE_MAPPING.')

      return TEMPLATE_MAPPING[name]

@@ -46,16 +44,12 @@ def get_template(name: str) -> Dict[str, Any]:
  register_template(
      name=MetaTemplateType.default_api_meta_template_oc,
      template=dict(
-         round=[
-             dict(role='HUMAN', api_role='HUMAN'),
-             dict(role='BOT', api_role='BOT', generate=True)
-         ],
+         round=[dict(role='HUMAN', api_role='HUMAN'),
+                dict(role='BOT', api_role='BOT', generate=True)],
          reserved_roles=[
              dict(role='SYSTEM', api_role='SYSTEM'),
          ],
-     )
- )
-
+     ))

  if __name__ == '__main__':
      res = MetaTemplateType.get_template_name_list()
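A small usage sketch (not in the diff) of the meta-template registry above; the name 'my-api-meta-template' is arbitrary:

from evalscope.backend.opencompass.api_meta_template import get_template, register_template

register_template(
    name='my-api-meta-template',
    template=dict(round=[dict(role='HUMAN', api_role='HUMAN'),
                         dict(role='BOT', api_role='BOT', generate=True)]),
)
print(get_template('my-api-meta-template'))  # returns the registered dict; unknown names raise ValueError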
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -1,13 +1,13 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from enum import Enum
- from typing import Optional, Union
  import subprocess
- from dataclasses import asdict
  import tempfile
+ from dataclasses import asdict
+ from enum import Enum
+ from typing import Optional, Union

- from evalscope.utils import is_module_installed, get_module_path, get_valid_list
  from evalscope.backend.base import BackendManager
  from evalscope.backend.opencompass.api_meta_template import get_template
+ from evalscope.utils import get_module_path, get_valid_list, is_module_installed
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -107,8 +107,8 @@ class OpenCompassBackendManager(BackendManager):

      @staticmethod
      def list_datasets(return_details: bool = False):
-         from opencompass.utils.run import get_config_from_arg
          from dataclasses import dataclass
+         from opencompass.utils.run import get_config_from_arg

          @dataclass
          class TempArgs:
@@ -160,18 +160,18 @@ class OpenCompassBackendManager(BackendManager):
          None
          """
          if run_mode == RunMode.FUNCTION:
-             from opencompass.cli.main import run_task
              from opencompass.cli.arguments import ApiModelConfig
+             from opencompass.cli.main import run_task

              assert isinstance(self.args.models, list) and len(self.args.models) > 0, 'The models are required.'

              tmp_model_d: dict = self.args.models[0]
              assert 'path' in tmp_model_d and 'openai_api_base' in tmp_model_d, \
-                 f"Got invalid model config: {tmp_model_d}. \nTo get valid format: " \
+                 f'Got invalid model config: {tmp_model_d}. \nTo get valid format: ' \
                  "{'path': 'qwen-7b-chat', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}"

              # Get valid datasets
-             dataset_names = self.args.datasets # e.g. ['mmlu', 'ceval']
+             dataset_names = self.args.datasets  # e.g. ['mmlu', 'ceval']
              dataset_names_all, real_dataset_all = self.list_datasets(return_details=True)

              if not dataset_names:
@@ -185,7 +185,9 @@ class OpenCompassBackendManager(BackendManager):
              assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                  f'To get the valid datasets, please refer to {dataset_names_all}'

-             valid_datasets = [_dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names]
+             valid_datasets = [
+                 _dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names
+             ]
              for _dataset in valid_datasets:
                  _dataset.pop('dataset_name')
                  _dataset['reader_cfg']['test_range'] = self.args.limit
@@ -232,16 +234,23 @@ class OpenCompassBackendManager(BackendManager):
  if __name__ == '__main__':

      # OpenCompassBackendManager.list_datasets()
-     # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC', 'GaokaoBench', 'obqa', 'math', 'agieval', 'hellaswag', 'RTE', 'race', 'flores', 'ocnli', 'strategyqa', 'triviaqa', 'WiC', 'COPA', 'commonsenseqa', 'piqa', 'nq', 'mbpp', 'csl', 'Xsum', 'CB', 'tnews', 'ARC_c', 'afqmc', 'eprstmt', 'ReCoRD', 'bbh', 'TheoremQA', 'CMRC', 'AX_b', 'siqa', 'storycloze', 'humaneval', 'cluewsc', 'winogrande', 'lambada', 'ceval', 'bustm', 'C3', 'lcsts']
+     # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC',
+     # 'GaokaoBench', 'obqa', 'math', 'agieval', 'hellaswag', 'RTE', 'race', 'flores', 'ocnli', 'strategyqa',
+     # 'triviaqa', 'WiC', 'COPA', 'commonsenseqa', 'piqa', 'nq', 'mbpp', 'csl', 'Xsum', 'CB', 'tnews', 'ARC_c',
+     # 'afqmc', 'eprstmt', 'ReCoRD', 'bbh', 'TheoremQA', 'CMRC', 'AX_b', 'siqa', 'storycloze', 'humaneval',
+     # 'cluewsc', 'winogrande', 'lambada', 'ceval', 'bustm', 'C3', 'lcsts']

      # 'meta_template': 'default-api-meta-template-oc',
      # models: llama3-8b-instruct, qwen-7b-chat
      oc_backend_manager = OpenCompassBackendManager(
-         config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
-                 'models': [{'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}],
-                 'limit': 5
-                 }
-         )
+         config={
+             'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
+             'models': [{
+                 'path': 'llama3-8b-instruct',
+                 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
+             }],
+             'limit': 5
+         })
      all_datasets = OpenCompassBackendManager.list_datasets()
      print(f'all_datasets: {all_datasets}')
      oc_backend_manager.run()
evalscope/backend/opencompass/tasks/eval_api.py CHANGED
@@ -4,7 +4,6 @@ from opencompass.partitioners import NaivePartitioner
  from opencompass.runners import LocalRunner
  from opencompass.tasks import OpenICLInferTask

-
  with read_base():
      # from opencompass.configs.summarizers.medium import summarizer
      # from opencompass.configs.summarizers.PMMEval import summarizer
@@ -17,7 +16,6 @@ for _dataset in datasets:
      from opencompass.datasets.humaneval import humaneval_gpt_postprocess
      _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess

-
  # 2. Get models, only for placeholder, you should fill in the real model information from command line
  # See more templates in `opencompass.cli.arguments.ApiModelConfig`
  models = []
@@ -25,8 +23,5 @@ models = []
  # 3. Get infer config
  infer = dict(
      partitioner=dict(type=NaivePartitioner),
-     runner=dict(
-         type=LocalRunner,
-         max_num_workers=4,
-         task=dict(type=OpenICLInferTask)),
+     runner=dict(type=LocalRunner, max_num_workers=4, task=dict(type=OpenICLInferTask)),
  )
evalscope/backend/opencompass/tasks/eval_datasets.py CHANGED
@@ -2,18 +2,18 @@
  from mmengine.config import read_base

  with read_base():
-     from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
-     from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
      from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import agieval_datasets
-     from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
-     from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-     from opencompass.configs.datasets.mbpp.mbpp_gen_830460 import mbpp_datasets
+     from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+     from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+     from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+     from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
      from opencompass.configs.datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+     from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
      from opencompass.configs.datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
      from opencompass.configs.datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
-     from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
-     from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
      from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+     from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
      from opencompass.configs.datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
      from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
      from opencompass.configs.datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
@@ -21,37 +21,37 @@ with read_base():
      from opencompass.configs.datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
      from opencompass.configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
      from opencompass.configs.datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
-     from opencompass.configs.datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+     from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+     from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+     from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+     from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
      from opencompass.configs.datasets.lambada.lambada_gen_217e11 import lambada_datasets
+     from opencompass.configs.datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+     from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
+     from opencompass.configs.datasets.mbpp.mbpp_gen_830460 import mbpp_datasets
+     from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
+     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
+     from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
+     from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
+     from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
+     from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
+     from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
      from opencompass.configs.datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets
+     from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+     from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
      from opencompass.configs.datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
      from opencompass.configs.datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
      from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
      from opencompass.configs.datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
      from opencompass.configs.datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
      from opencompass.configs.datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
-     from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
      from opencompass.configs.datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+     from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
      from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
      from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
-     from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
-     from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets
-     from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
-     from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
-     from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
-     from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
-     from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
-     from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
-     from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
-     from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
-     from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
-     from opencompass.configs.datasets.winogrande.winogrande_gen_458220 import winogrande_datasets
-     from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
-     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
      from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
-     from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
-     from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
-     from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
+     from opencompass.configs.datasets.winogrande.winogrande_gen_458220 import winogrande_datasets
+     from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets

      # Note: to be supported
      # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
@@ -59,7 +59,6 @@ with read_base():
      # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
      # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

-
  datasets = []
  _locals = {k: v for k, v in locals().items() if k.endswith('_datasets')}

@@ -68,7 +67,6 @@ for k, v in _locals.items():
          _dataset['dataset_name'] = k.replace('_datasets', '')
          datasets.append(_dataset)

-
  if __name__ == '__main__':
      for _dataset in datasets:
          print(_dataset)
evalscope/backend/rag_eval/__init__.py CHANGED
@@ -1,4 +1,4 @@
- from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
- from evalscope.backend.rag_eval.utils.llm import LLM, LocalLLM, ChatOpenAI
- from evalscope.backend.rag_eval.utils.clip import VisionModel
  from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+ from evalscope.backend.rag_eval.utils.clip import VisionModel
+ from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
+ from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py CHANGED
@@ -1,14 +1,15 @@
  import os
  from typing import Optional, Union
- from evalscope.utils import is_module_installed, get_valid_list
+
  from evalscope.backend.base import BackendManager
+ from evalscope.utils import get_valid_list, is_module_installed
  from evalscope.utils.logger import get_logger

-
  logger = get_logger()


  class RAGEvalBackendManager(BackendManager):
+
      def __init__(self, config: Union[str, dict], **kwargs):
          """BackendManager for VLM Evaluation Kit

@@ -20,17 +21,16 @@ class RAGEvalBackendManager(BackendManager):
      @staticmethod
      def _check_env(module_name: str):
          if is_module_installed(module_name):
-             logger.info(f"Check `{module_name}` Installed")
+             logger.info(f'Check `{module_name}` Installed')
          else:
-             logger.error(f"Please install `{module_name}` first")
+             logger.error(f'Please install `{module_name}` first')

      @staticmethod
      def run_mteb(model_args, eval_args):
-         from evalscope.backend.rag_eval.cmteb import ModelArguments, EvalArguments
-         from evalscope.backend.rag_eval.cmteb import one_stage_eval, two_stage_eval
+         from evalscope.backend.rag_eval.cmteb import EvalArguments, ModelArguments, one_stage_eval, two_stage_eval

          if len(model_args) > 2:
-             raise ValueError("Not support multiple models yet")
+             raise ValueError('Not support multiple models yet')

          # Convert arguments to dictionary
          model_args_list = [ModelArguments(**args).to_dict() for args in model_args]
@@ -43,12 +43,8 @@ class RAGEvalBackendManager(BackendManager):

      @staticmethod
      def run_ragas(testset_args, eval_args):
-         from evalscope.backend.rag_eval.ragas import rag_eval
+         from evalscope.backend.rag_eval.ragas import EvaluationArguments, TestsetGenerationArguments, rag_eval
          from evalscope.backend.rag_eval.ragas.tasks import generate_testset
-         from evalscope.backend.rag_eval.ragas import (
-             TestsetGenerationArguments,
-             EvaluationArguments,
-         )

          if testset_args is not None:
              generate_testset(TestsetGenerationArguments(**testset_args))
@@ -62,19 +58,19 @@ class RAGEvalBackendManager(BackendManager):
          evaluate(Arguments(**args))

      def run(self, *args, **kwargs):
-         tool = self.config_d.pop("tool")
-         if tool.lower() == "mteb":
-             self._check_env("mteb")
-             model_args = self.config_d["model"]
-             eval_args = self.config_d["eval"]
+         tool = self.config_d.pop('tool')
+         if tool.lower() == 'mteb':
+             self._check_env('mteb')
+             model_args = self.config_d['model']
+             eval_args = self.config_d['eval']
              self.run_mteb(model_args, eval_args)
-         elif tool.lower() == "ragas":
-             self._check_env("ragas")
-             testset_args = self.config_d.get("testset_generation", None)
-             eval_args = self.config_d.get("eval", None)
+         elif tool.lower() == 'ragas':
+             self._check_env('ragas')
+             testset_args = self.config_d.get('testset_generation', None)
+             eval_args = self.config_d.get('eval', None)
              self.run_ragas(testset_args, eval_args)
-         elif tool.lower() == "clip_benchmark":
-             self._check_env("webdataset")
-             self.run_clip_benchmark(self.config_d["eval"])
+         elif tool.lower() == 'clip_benchmark':
+             self._check_env('webdataset')
+             self.run_clip_benchmark(self.config_d['eval'])
          else:
-             raise ValueError(f"Unknown tool: {tool}")
+             raise ValueError(f'Unknown tool: {tool}')
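Judging from the run() dispatch above, the config handed to RAGEvalBackendManager is a dict keyed by 'tool' plus tool-specific sections; the sketch below only shows that shape, with empty placeholders where real ModelArguments/EvalArguments fields belong:

from evalscope.backend.rag_eval import RAGEvalBackendManager

mteb_cfg = {
    'tool': 'MTEB',              # matched case-insensitively via tool.lower()
    'model': [dict()],           # one or two ModelArguments dicts (more than two is rejected)
    'eval': dict(),              # EvalArguments dict
}
ragas_cfg = {
    'tool': 'RAGAS',
    'testset_generation': None,  # optional; generation is skipped when absent or None
    'eval': dict(),              # EvaluationArguments dict
}

manager = RAGEvalBackendManager(config=mteb_cfg)
# manager.run() would dispatch to run_mteb(); real model/eval fields are needed before it succeeds.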
evalscope/backend/rag_eval/clip_benchmark/__init__.py CHANGED
@@ -1,2 +1,2 @@
+ from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
  from evalscope.backend.rag_eval.clip_benchmark.task_template import evaluate
- from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
evalscope/backend/rag_eval/clip_benchmark/arguments.py CHANGED
@@ -1,5 +1,5 @@
  from dataclasses import dataclass, field
- from typing import List, Dict
+ from typing import Dict, List


  @dataclass
@@ -13,7 +13,7 @@ class Arguments:
      model_name: str
      revision: str = "master"
      hub: str = "modelscope"
-
+
      For API VLM model support, you can use the following fields, (image caption only):
      model_name="gpt-4o-mini"
      api_base: str = "",
@@ -23,12 +23,12 @@ class Arguments:
      models: List[Dict] = field(default_factory=dict) # List of paths to the pre-trained models or model identifiers
      dataset_name: List[str] = field(default_factory=list) # List of dataset names to be used
      data_dir: str = None # Root directory where the datasets are stored
-     split: str = "test" # Split of the dataset to be used (e.g., 'train', 'validation', 'test')
+     split: str = 'test' # Split of the dataset to be used (e.g., 'train', 'validation', 'test')
      task: str = None
      batch_size: int = 128 # Batch size for data loading
      num_workers: int = 1 # Number of workers for data loading
      verbose: bool = True # Flag to enable verbose logging
-     output_dir: str = "outputs" # Directory where the outputs (e.g., predictions, logs) will be saved
-     cache_dir: str = "cache" # Directory where the dataset cache will be stored
+     output_dir: str = 'outputs' # Directory where the outputs (e.g., predictions, logs) will be saved
+     cache_dir: str = 'cache' # Directory where the dataset cache will be stored
      skip_existing: bool = False # Flag to skip processing if outputs already exist
-     limit: int = None # Limit the number of samples to be processed
+     limit: int = None  # Limit the number of samples to be processed