evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/run_ms.py DELETED
@@ -1,140 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
- # flake8: noqa
3
-
4
- import argparse
5
- import torch
6
-
7
- from evalscope.benchmarks.ceval import DATASET_ID as CEVAL_EXAM
8
- from evalscope.benchmarks.mmlu import DATASET_ID as MMLU
9
- from evalscope.benchmarks.hellaswag import DATASET_ID as HELLA_SWAG
10
- from evalscope.benchmarks.arc import DATASET_ID as ARC
11
- from evalscope.benchmarks.truthful_qa import DATASET_ID as TRUTHFUL_QA
12
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
13
- from evalscope.evaluator import Evaluator
14
- from evalscope.models.model_adapter import MultiChoiceModelAdapter, ContinuationLogitsModelAdapter
15
- from evalscope.utils.logger import get_logger
16
-
17
- logger = get_logger()
18
-
19
- # TODO: add more precision
20
- MODEL_PRECISION_MAP = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}
21
-
22
- """
23
- Run evaluation process for ModelScope Leaderboard.
24
- """
25
-
26
-
27
- def parse_args():
28
- parser = argparse.ArgumentParser(description='Run evaluation on a model')
29
-
30
- parser.add_argument('--model', help='Model id from modelscope or huggingface.', required=True)
31
- parser.add_argument('--revision', help='Model revision.', required=False, default=None)
32
- parser.add_argument('--precision', help='Model precision.', default='bf16')
33
- parser.add_argument('--work-dir', help='root work cache dir.', default=None)
34
- parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
35
- parser.add_argument('--datasets-dir', help='Datasets dir.', default=DEFAULT_ROOT_CACHE_DIR)
36
- parser.add_argument('--device-map', help='device map.', default='auto')
37
- parser.add_argument('--max-eval-size', type=int, help='Max evaluation samples num for each subset', default=None)
38
- parser.add_argument('--dataset-id', help='Dataset id on modelscope', required=False, default=None)
39
-
40
- parser.add_argument('--debug',
41
- help='Debug mode, will print information for debugging.',
42
- action='store_true',
43
- default=False)
44
- parser.add_argument('--dry-run',
45
- help='Dry run in single processing mode.',
46
- action='store_true',
47
- default=False)
48
- parser.add_argument('--mem-cache',
49
- help='To use memory cache or not.',
50
- action='store_true',
51
- default=False)
52
-
53
- args = parser.parse_args()
54
-
55
- return args
56
-
57
-
58
- def main():
59
- args = parse_args()
60
- logger.info(args)
61
-
62
- # Customize your target datasets here
63
- all_benchmarks = [CEVAL_EXAM, MMLU, ARC, HELLA_SWAG, TRUTHFUL_QA]
64
-
65
- dataset_id = args.dataset_id
66
- if dataset_id is None:
67
- datasets = all_benchmarks
68
- elif dataset_id in all_benchmarks:
69
- datasets = [dataset_id]
70
- else:
71
- raise ValueError(f'Unknown dataset: {dataset_id}, Supported datasets: {all_benchmarks}')
72
-
73
- # Get model instance
74
- if args.dry_run:
75
- from evalscope.models.dummy_chat_model import DummyChatModel
76
- model_adapter = DummyChatModel(model_cfg=dict()) # TODO
77
- model_id: str = 'dummy'
78
- model_revision: str = 'v1.0.0'
79
- model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
80
- else:
81
- model_id: str = args.model
82
- model_revision: str = args.revision
83
- model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
84
-
85
- model_adapter = MultiChoiceModelAdapter(model_id=model_id,
86
- device_map=args.device_map,
87
- torch_dtype=model_precision,
88
- model_revision=model_revision,)
89
-
90
- # Evaluate on each dataset
91
- for dataset_name in datasets:
92
- if dataset_name == CEVAL_EXAM:
93
- from evalscope.benchmarks.ceval import CEVALAdapter
94
- data_adapter = CEVALAdapter()
95
- elif dataset_name == MMLU:
96
- from evalscope.benchmarks.mmlu import MMLUAdapter
97
- data_adapter = MMLUAdapter()
98
- elif dataset_name == ARC:
99
- from evalscope.benchmarks.arc import ARCAdapter
100
- data_adapter = ARCAdapter()
101
- elif dataset_name == HELLA_SWAG:
102
- # Note: HellaSwag should run few-shot eval
103
- from evalscope.benchmarks.hellaswag import HellaSwagAdapter
104
- data_adapter = HellaSwagAdapter()
105
- elif dataset_name == TRUTHFUL_QA:
106
- from evalscope.benchmarks.truthful_qa import TruthfulQaAdapter
107
- data_adapter = TruthfulQaAdapter()
108
-
109
- # TODO: add more datasets here
110
- else:
111
- raise ValueError(f'Unknown dataset: {dataset_name}')
112
-
113
- # TODO: add mapping
114
- if dataset_name in {TRUTHFUL_QA, HELLA_SWAG} and not args.dry_run:
115
- model_adapter = ContinuationLogitsModelAdapter(model_id=model_id,
116
- device_map=args.device_map,
117
- torch_dtype=model_precision,
118
- model_revision=model_revision, )
119
-
120
- root_work_dir = args.work_dir if args.work_dir is not None else DEFAULT_ROOT_CACHE_DIR
121
- evaluator = Evaluator(dataset_name_or_path=dataset_name,
122
- subset_list=None,
123
- data_adapter=data_adapter,
124
- model_adapter=model_adapter,
125
- use_cache=args.mem_cache,
126
- root_cache_dir=root_work_dir,
127
- outputs_dir=args.outputs_dir,
128
- is_custom_outputs_dir=True,
129
- datasets_dir=args.datasets_dir, )
130
-
131
- infer_cfg = dict(max_length=2048, limit=args.max_eval_size)
132
- evaluator.eval(infer_cfg=infer_cfg, debug=args.debug)
133
-
134
-
135
- if __name__ == '__main__':
136
- main()
137
-
138
- # Usage:
139
- # python evalscope/run_ms.py --model ZhipuAI/chatglm2-6b --precision fp16 --dry-run --dataset-id modelscope/mmlu --limit 10
140
-
evalscope/utils/task_cfg_parser.py DELETED
@@ -1,10 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
-
3
- """
4
- This file is used to parse the eval-task config file.
5
- """
6
-
7
- import os
8
- import json
9
-
10
-
evalscope/utils/task_utils.py DELETED
@@ -1,22 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
- from enum import Enum
3
-
4
-
5
- class EvalBackend(Enum):
6
- # Use native evaluation pipeline of EvalScope
7
- NATIVE = 'Native'
8
-
9
- # Use OpenCompass framework as the evaluation backend
10
- OPEN_COMPASS = 'OpenCompass'
11
-
12
- # Use VLM Eval Kit as the multi-modal model evaluation backend
13
- VLM_EVAL_KIT = 'VLMEvalKit'
14
-
15
- # Use RAGEval as the RAG evaluation backend
16
- RAG_EVAL = 'RAGEval'
17
-
18
- # Use third-party evaluation backend/modules
19
- THIRD_PARTY = 'ThirdParty'
20
-
21
-
22
-
evalscope-0.7.1.dist-info/RECORD DELETED
@@ -1,286 +0,0 @@
1
- evalscope/__init__.py,sha256=3eLMMrjkAIAs3vGluXNZn5-xTSbO_vfba9yNPbkVtg8,105
2
- evalscope/cache.py,sha256=zpGjL9JMosqjk_dkODVwvIGiUC0WAMmMTHDNJOvBQU8,3288
3
- evalscope/config.py,sha256=G_rpSn5Kd1aPlFJO6asnZu5FUggZmwcYdAxxpuq0yDs,6972
4
- evalscope/constants.py,sha256=g8lGYlpA4Wk88HwtqId1-jJX_z8Lr2k02gWLsyofyj0,2670
5
- evalscope/run.py,sha256=uAXtaxIBcR94jyfHGFAecuzn0y71oLgu-d9VOohCJAw,18738
6
- evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
7
- evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
8
- evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
9
- evalscope/version.py,sha256=hA8tHZM5X2eUwWrp4-de43JlGb0XF7FqwyNdrNHAqQE,118
10
- evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
12
- evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
13
- evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
14
- evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
15
- evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
16
- evalscope/backend/opencompass/tasks/eval_api.py,sha256=NRIbDqhM_5JD0zBGinhptxrSmTjTelO_RaiaAht7ee0,1179
17
- evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=nWrPfItIYgPLJUXyu9vZmOmUUjku0BEFCV94Ss12pVU,5408
18
- evalscope/backend/rag_eval/__init__.py,sha256=8om6TVnTMmyTEQt1jBuUQA4UfIzyps-_-ih90H_Qjio,284
19
- evalscope/backend/rag_eval/backend_manager.py,sha256=jmO-UMu6_iOXMnl4--PrMWCsnIYEhsbiX017rtURqm0,2997
20
- evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=gDXCiRUTSeGQHxd5SjQsnphMqHJ2si2jywRiHvujEOg,150
21
- evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=VbB7JY4NunV83ewkZrUiM74jTzSETMPcOLlllRs7djA,1537
22
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=vaguNflVBC5-0lk1kaU7CLTbkJuBf0hHGIdmoq4Bn8s,8474
23
- evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=asEF_Nt2Xt3DtIS49J9nQKEjTdrcAkYhY4zumCDzSws,3990
24
- evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=Bj2ysvM0JT-6T40v0rffeZgJIRht5KVX0GzMOiUphf0,2578
26
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py,sha256=ZrUYDbQ75eo0vmIwXh5Bb9c4nyEwd4AO2oURaIqjIII,7502
27
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=Bcs64xece4BMNhxuaFimOwMJnlpjNxfGrdSCWOYItko,5977
28
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=3wW-AigMx5rygsI47rr8Kym_t0GWO4eio7zSAavSr6A,8765
29
- evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
30
- evalscope/backend/rag_eval/cmteb/__init__.py,sha256=ajVz6XP5hqPq-jm66hp2poA2qKj1V19ZGoqjrGUlO7U,279
31
- evalscope/backend/rag_eval/cmteb/arguments.py,sha256=wZvnVir2tSxYCV_DPR3TSDj4VxtUn3wLhBPqyMJYKno,2330
32
- evalscope/backend/rag_eval/cmteb/base.py,sha256=fYrIjKwOLwBAHb2rlNkEjYScjZ5Qpyv2LdMmWZYWREA,2830
33
- evalscope/backend/rag_eval/cmteb/task_template.py,sha256=Clyc8TZCtZrL6MjAw49rh55Xb3hf2y1C3SzLvZsorLE,2646
34
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=7adR40W6Uu58-QR9jCUP4k7TdAnG0oT225v4xHXah2g,10635
35
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=-oJ9rXy7pgOB7Gyf68TcSlmmAUoBx5hKofcKNuIsCd8,8977
36
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py,sha256=rF6dtrwOfvJoq2Y4myZg9_638M1g06qq0hWCmvxsIo0,2039
37
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=2WkaTE-jF8jqsu1UcNDqN8A4567UzW5boD_0B83j-9A,4008
38
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=C34nDuya8OT3aeMxYCYjUpUtWp7w00jSfIYQSInlNAg,5329
39
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=wUxiQH5aOmWNS4YswACyHqBn5xqP5eyvsq6U9WSp5R0,11457
40
- evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=6GMaoCANM-IKYLk4srHOYr_eurav3DGihHMQeJPXR6k,12054
41
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=eBHm_TWeh7WiwpdVBtUlegeXMAxJyVQdUHRhJERobIs,1506
42
- evalscope/backend/rag_eval/ragas/__init__.py,sha256=-VnStCVy7uHih2uipG_7AD4i2FQ5sVM7_NI-sEZBpRQ,170
43
- evalscope/backend/rag_eval/ragas/arguments.py,sha256=BriXjcXVk2FqjDNuFYpfBZsUVzrkrYH7egbO9x-jcZ4,1873
44
- evalscope/backend/rag_eval/ragas/task_template.py,sha256=nv2i9-NE2SXpLrVKo5zhadYYKbDFVXVVA4sfgb4ti4g,1693
45
- evalscope/backend/rag_eval/ragas/metrics/__init__.py,sha256=HgY5nrcNtWpQ7gBi5lCEJXJVINd_R57dsmI8ldS2rd0,160
46
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py,sha256=Uqz5qWZ76Gos95_QlhwncbATXyk0YX4wkI0LiAdPElU,3838
47
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py,sha256=CdLnWHq1eTna6j3F5-pncW5YusxD_v3ScjzeCsZ7mng,3967
48
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=1m8FBVga_uetCkahL_mwhGS8nAXG8V4jmnT4iP_6QYo,794
49
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=YaqCbIynnRtPQHng6AzlD4l7KA-TPAi4ayjnhZj6gw0,3940
50
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=-BjIwLy3QOiQbFGqjhYTNfhLTLeaBeOtpKBKfpjlf7E,1736
51
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=eyUasvFvtwXAcpeUaOOBVuvxhGl-u_dndV-qsjnqsF4,981
52
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=KXr3hmd49n1KsgYWrjTuYY9xBFIcTSksueVTUEwfEm0,3188
53
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=1A9KlwbQr8WqNxdLEa4nU1HlPzF-q2KflQ591pJA0To,2475
54
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=YQFk8o0esRyOF9m2aJBR_Nwn40D6LAr7YrfhQdHae_s,1739
55
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=xH4kduv1OUJIl_xcGGh-StK_zOlZa4G-pCrIt1M8Hbc,1025
56
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=Q4sf2Xud4NpVrbEIYZJEE_VVjMy-fgwX_AK0OnMQpDg,992
57
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=FGGqRlNgvEXnH-YcNPk5pzoRZXwtaS5cMtbIBQyEPyU,669
58
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=4JTUCczH-7UjH5nlz13w-srcTC3usqiXjJwLwxu-MIg,919
59
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=PJ2IHm3zXHe_XnT_DPxL5TNqJGJ-jjX2owVShw9V9kA,672
60
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=4JTUCczH-7UjH5nlz13w-srcTC3usqiXjJwLwxu-MIg,919
61
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=nZ7VIz6R1XyyKtP0Vq5jPFNfHaN6M1Z9rFPOCVRChBE,1374
62
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=5IKDA_hPmyuDXMhzK7aACrZGrYNT3wuqhzsHYC7Vkt4,1496
63
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=uY_4P9OloNHP2IdvIuoTFCuUEHMyEqx9TzCoC6tj8G8,774
64
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=-0BwNQgPeH3dIIHsgNSL9OCMsg03oqtWtqm6HJG6gOk,663
65
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=9oSmEYvqor920jXByeNynyOSXagAukFK_e4jnMuDZQU,916
66
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=ukF4AaOn8Su0uZ5E_uszzZFC1_MY2M9OymOSZ15w0BQ,688
67
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=dH-etTJrQ0gQIS97QCZ5IhQR223gLS0_QZjUEW91fOA,657
68
- evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=WO2xja0g0JSiYGdu2uAEDQgDceuFcgPWwPoqFnwDU0s,172
69
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=nX-dG0Fm1629pSASujuEmMODFZf1955WncNNykRrNtI,9305
70
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=bXOqik6qKWzbrEz21ykdkqeqqPrmoUIhTwW6eRQXy0M,2222
71
- evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
- evalscope/backend/rag_eval/utils/clip.py,sha256=frafvJ1soUtjFUmi-053_Fhg6ERRwyvczQBlLWAX9vE,5104
73
- evalscope/backend/rag_eval/utils/embedding.py,sha256=RZf0JlovZY_cCBsq8MMUqC_Sy78WtKLY_rBAlRA_udo,6239
74
- evalscope/backend/rag_eval/utils/llm.py,sha256=9tFwMNoTf3jNomgDu5qqVLO92HtEtelH3DXpny9_B2g,2552
75
- evalscope/backend/rag_eval/utils/tools.py,sha256=LpcYoeIBj1btzQ1_P84u1dYCdRWhMtiltxihmZCvWKk,1528
76
- evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
77
- evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
78
- evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=Yz2A5kB1E8DYBnjuVCA6TTPtLjhg8vYKeJTh6FU_Ecw,1645
79
- evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
80
- evalscope/benchmarks/benchmark.py,sha256=EmwYyFdrAHBGMkSbsMZQOR_62Q0CSKl8zeLlr7xvJdQ,2159
81
- evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
82
- evalscope/benchmarks/arc/__init__.py,sha256=7k2jFDUCHpEKDdQZ3Bmq59YmImFg9RyIfZQIsGebhE8,314
83
- evalscope/benchmarks/arc/ai2_arc.py,sha256=Wim8JsH094og7d0eLCEI0kUwDP_0x7AT117oTRPdiAI,5608
84
- evalscope/benchmarks/arc/arc_adapter.py,sha256=RpXgp69N-3UinKDAnMVxeuGrOBFX2HgXAwwjm_kH-vg,9214
85
- evalscope/benchmarks/bbh/__init__.py,sha256=x_FWzYE1gKf0mUswYXOKqKaAkmSm6IfzWvPnCtjbs8I,306
86
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pUUjwtxX_9_z0DUo_oCddc7ktA5enhN5EaNrSRWT4V4,10804
87
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=hNie8uvyVSF-W0sZW721vEhH7_9lypZ0qtDRVraBgxg,1780
88
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=t2ozSTodp4p2ZibgkhoAomhBFtf2keRioum9QF9E5Sk,3652
89
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=7GDstZmMXjlucd6RsN5WzQiLij_VASLHHEx3mMP4wJ8,1166
90
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt,sha256=qhCLqXjtlI325tGCBXgLnM8V_bUKpUW-Dohh7U-BPSY,3567
91
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt,sha256=Ut7JSNl4zQqeuDt1eq-Mrgdxf2kMar1i85DpVqEi5vU,2404
92
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt,sha256=WNruIcuKwCaNwHPj-xs6VK1RzyVq2JDED02MadpDPl4,4476
93
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt,sha256=4hn9zfKMo7HquDPsWuy5fEnkuJtsI9GrfabostMLQLo,4830
94
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt,sha256=Jmv9AxvfpgbLsi0Nc_3_xcSLuwpiT-Po4VgTukmA0w8,3113
95
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt,sha256=LIqVgRhbD1t5ohv5FGW-Ql98kst4mIl-IpX-IY5c6mg,2504
96
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt,sha256=LIqVgRhbD1t5ohv5FGW-Ql98kst4mIl-IpX-IY5c6mg,2504
97
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt,sha256=LIqVgRhbD1t5ohv5FGW-Ql98kst4mIl-IpX-IY5c6mg,2504
98
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt,sha256=gkMO9u025Uc4RClBeQtF11FDcNb9EsxUlrbwGDdllZc,2120
99
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt,sha256=g7Y4iyJ0fQGmWyLYWwzaqlpmaypQ3x2sK49AX_YL3NU,2385
100
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt,sha256=DATvl8pqWOblx260R3muCt1sYErASv0TAviag1UZrVE,2146
101
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt,sha256=pTYwcwnt-yypHJ9hRLyDVW0hMgBPgxUwX6f6TZnFriw,1417
102
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt,sha256=juEuZ97hBp8vgQl_mkKiAsMdbb5MqxatkMRKkKDvopc,2385
103
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt,sha256=5KJQPIDdCPkyRmk9riKDHlSFfTtlhyG8aIeTDl7h6JA,2294
104
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt,sha256=OqEgTV80zfH8Mu2_IZkpPMKUREqVWOFSJo6t7D2sUx4,3480
105
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt,sha256=nBP1tctGuL8pCBYvH3BNW3nQRrRzY7lFNd5bWG6Hs64,6140
106
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt,sha256=QSAKws7Of09NdrmFPjJJCVc0zvAIMak6xROhpdgxSt4,3113
107
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt,sha256=vgTwHu4mowIeCtaaD24fRmfsaU-V9lG1z4U6izcoFBg,820
108
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt,sha256=xZeBUTWvnAT3jL8SgQJqiC_a82FfYYcgEra6frIuvlA,3022
109
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt,sha256=RmuGDIzsjWNt1ZlkqmerLHiVAWPzZOTVENcgoiM7AZg,2603
110
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt,sha256=RmuGDIzsjWNt1ZlkqmerLHiVAWPzZOTVENcgoiM7AZg,2603
111
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=RmuGDIzsjWNt1ZlkqmerLHiVAWPzZOTVENcgoiM7AZg,2603
112
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=aPWMXg6mdgoqMao8Oc3jcjeOBh0RUPqN3aBvxaWv9pc,2944
113
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=uhRRz8y0hfHI96olJS9IU32XafGcdiqsPPCOexB1hL4,2163
114
- evalscope/benchmarks/ceval/__init__.py,sha256=SatTco8Ks6wD0jh9LUN5chf21VaJnwW1SG4cGG8OYAo,343
115
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=FBUTdmW4a5TY7atBjE_H1h_ST2_WoPWMMTvfHNvusNU,15852
116
- evalscope/benchmarks/ceval/ceval_exam.py,sha256=S32eMfGUBMrUDP39HzO6XfvSir0tthHCPItNtriE-hc,5063
117
- evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
118
- evalscope/benchmarks/cmmlu/__init__.py,sha256=mIMlXA_BHb_bF71Oi5XJwhV_sZKN2b_lBTOXhU5h6Bg,342
119
- evalscope/benchmarks/cmmlu/cmmlu.py,sha256=q_6ONrjdcHNqpXTUmSVbNOfl1yMd0zEQZWnh0PMQmYY,5153
120
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=jqVghYwex2Awx7THgka0wQ7dFY0EdzfnI7n0aMXGPro,15216
121
- evalscope/benchmarks/cmmlu/samples.jsonl,sha256=l842nKaAfeRE69jcX_E5N1gstWrHYpoNZjP-5D6Aq_k,1721
122
- evalscope/benchmarks/competition_math/__init__.py,sha256=hXO0DTtrA_0YDYUcyrL4XOyPGvPEa0sy2miHTF1Cxrg,393
123
- evalscope/benchmarks/competition_math/competition_math.py,sha256=0p5iKUfU6WpXgplb44YgVWZUYkeWLLmOdj66_dapdDc,2678
124
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=FijGL1FlEWJAy34tp3bIapiglT7KBJ8AvU8bjP4CGAw,19087
125
- evalscope/benchmarks/general_qa/__init__.py,sha256=lsGH8AlyH6MNCs7xZhWPKW8Ac3pwZg2hLibWMbyiKAc,346
126
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=N4Kopo7i3JfEG6Fann-kjPpYXcR2BHfYmtG8aZXfwR0,6097
127
- evalscope/benchmarks/gsm8k/__init__.py,sha256=4rdHRuEZdDO_WPY3RcLSZCAzgLV7UAOXgOa4cSUhmWs,315
128
- evalscope/benchmarks/gsm8k/gsm8k.py,sha256=WZ8k4EEMjNWWCxY-Dhs2BSR4EHHqYBViyl_UZIGNu7U,4282
129
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=qpRii4zyWK6nadEYxBPDTdOSwyyotWTf0JIxQSoEy2k,13976
130
- evalscope/benchmarks/hellaswag/__init__.py,sha256=nSLrBNF18Yqcp8B6IiBGsCYkDS3Xnm0nq_QDyXXSqM0,357
131
- evalscope/benchmarks/hellaswag/hellaswag.py,sha256=bNOUDpGHtAOAyWrQlMiqEGyp0ePTcpIeYbZH3XaFczI,4690
132
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=4Gf6zvnRAi4YljS7UB9PGxJeT0VSNuvhq8yhYiVmZ34,8557
133
- evalscope/benchmarks/humaneval/__init__.py,sha256=tBPFexx1c6U1nWMpglJqkQiY2GwKrmpSD_snv_NyRec,335
134
- evalscope/benchmarks/humaneval/humaneval.py,sha256=oNxRcVkYxlzS18N2JmwSaQb1aTZOVHlZMKwIETjfvNs,3482
135
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=1YiAvNbWRUcaTu9oGwpDMmvS4_zoHt_bNWmNpzJmo-g,1661
136
- evalscope/benchmarks/mmlu/__init__.py,sha256=fZicGcLq67XOc5cofGCi6WrV4FdubLupKb7nMdCUQSA,337
137
- evalscope/benchmarks/mmlu/mmlu.py,sha256=GhjZFOgX5qG041eVrSWggOcRcMyl0oAI_yGXmufwEzc,5256
138
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=9lg_3s3QjGKC794O-RogU9cdvcCP7_Vp4ve9U9dRhz8,16401
139
- evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
140
- evalscope/benchmarks/race/__init__.py,sha256=htMZhgk40CsvNF7HXaHeAejUnGbUtU6Nu2yATOiMfaU,337
141
- evalscope/benchmarks/race/race.py,sha256=giY44Vr6CePdVQxpi0x4CLsaknye47Gdlc_PVqN9VCA,3835
142
- evalscope/benchmarks/race/race_adapter.py,sha256=3zHfz3tFzCVKoYLtzpGek338ZnIGT7ejq_xSaMxiIjU,9900
143
- evalscope/benchmarks/race/samples.jsonl,sha256=GMwF5IPRWrsq6cfYNGS5yt_woXz687HObA0IkB6k3V4,1242
144
- evalscope/benchmarks/trivia_qa/__init__.py,sha256=oslov-n_oV3bhEhrPXLJoQwmHE8_vYR2JTerxoHq29A,351
145
- evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=9OsKC9uuBbw9MHghOOMMALeGkFOY_QTNWZYAr0ASPQ0,3444
146
- evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=xrebA71r_Ek9NvwkDfsmWTuRCsae2HZEGmTBtZMGwfM,3296
147
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=0g0xSWorXiHrZ3PKTqOO6g18kK2tUop1HWaAjmCKRwg,7659
148
- evalscope/benchmarks/truthful_qa/__init__.py,sha256=4bRdnHOceaEvn20jZj0yLCg5wpOHpzP3LRjkYm5u-Fs,367
149
- evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=eOcYBjR7XZ4VFSAY4r1_UUoKXakhR-yzd2I3FiOmlUw,7017
150
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=Cavimjnc6NPMC1TDOV4_uI37c3--sILz_VqGiJM_z50,14952
151
- evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
152
- evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
153
- evalscope/cli/cli.py,sha256=uZ-qC8WBsLd5-Hn94d43sSGg0UC_12RebSD4ToKjypg,844
154
- evalscope/cli/start_perf.py,sha256=yIE3sP13_yoTXQD3DBNzRVY6L_5p-Ix0J1VBvZFYdVU,914
155
- evalscope/cli/start_server.py,sha256=ATGLP2TE0aImJNicpehdzBuFlNb50F7KhyL4A_ZSoGU,3885
156
- evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
157
- evalscope/evaluator/evaluator.py,sha256=eSCgPPDGfIJfKu0cthhbDLFm1xMhj_869iT3ngcQkPc,30817
158
- evalscope/evaluator/rating_eval.py,sha256=cJbkyXIuwFUZoe7ZJZM6eUskNd9zlORgndckuon2OQ8,5768
159
- evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
160
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=JycPYti9h1j_8DRcu_rc5U0wkEASHYg-XBqrUUoiO-Q,17054
161
- evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
162
- evalscope/metrics/code_metric.py,sha256=zK1tpNDZbvmSHt3a_JJ5Y2Hdu2cqeFriy__wUOl2tSw,3462
163
- evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lXomdWw,1988
164
- evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
165
- evalscope/metrics/rouge_metric.py,sha256=VNdy86ZGZL6thVDFg0nKedp6dPApV7_yoIupMe0f6hk,4518
166
- evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
167
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=MXcHwmsXnh9mQZR1Bt5St6DNwXY-mfz4dNM8y6a23dc,12236
168
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48EsmFuY5_iVvY6xjc,524464
169
- evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
170
- evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
171
- evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
172
- evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
173
- evalscope/models/model_adapter.py,sha256=Cgs68ajRwTETEo1eU-OhFiFGuSx4eS1p7-JT3jOpcOk,22740
174
- evalscope/models/openai_model.py,sha256=PoQS1FIiWIxp1xBJPV7Bq81LFD9FIT3vAHUvNa22DCc,3452
175
- evalscope/models/template.py,sha256=Yk7-QnvjiLD0zchSZcaDSLmpW8onIeFpngSwtUOYVPk,56035
176
- evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
177
- evalscope/models/api/openai_api.py,sha256=uBicJPaFLOhIrB5PKI8FE-SItb7v-fuDwBgkgns3CY0,7883
178
- evalscope/models/custom/__init__.py,sha256=K4Ewo7Qrs73-jBuPq4ffxd8hMnttKhic-Zj0amH3wiU,103
179
- evalscope/models/custom/custom_model.py,sha256=2ivxfGQs5V5HDnQEhTBi5v8KNBxJDbzPVJdNOGo3iSg,1566
180
- evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
- evalscope/perf/arguments.py,sha256=ixiWx16qAL1gU7JTwoYOnvvc3IrwVWGz2uVno38gywA,8671
182
- evalscope/perf/benchmark.py,sha256=Yiqcg5N03KmBa-5aWYNyklbYJ9Hqiuu1oaD8kBkFPSQ,9659
183
- evalscope/perf/http_client.py,sha256=OpTgYl4obSpmyi5bOkTRSIQxp0aVdO08EcIVFAv-znU,7192
184
- evalscope/perf/main.py,sha256=ljJDJVsD9hGWgF5bJCW-mfUGohc4LofaxiyAUfMa2WQ,997
185
- evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
186
- evalscope/perf/plugin/registry.py,sha256=PyK3E1AqQFuU4Bs9COvFFCJOaCtmHbfeQOVGtjVYh-I,1304
187
- evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
188
- evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
189
- evalscope/perf/plugin/api/custom_api.py,sha256=NQ2LDKsFQfExVRx2prcmfORCBzxxibfhpVHhB-lxAO4,3776
190
- evalscope/perf/plugin/api/dashscope_api.py,sha256=0p9f6ujppS_H6w7wsIbRVNnCkHXtRemIai5Bhdogla4,3826
191
- evalscope/perf/plugin/api/openai_api.py,sha256=I9yM4ouY1-xlBz4bYQ_62FZHKX4F3YCsg5GCqLU_9xA,6938
192
- evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
193
- evalscope/perf/plugin/datasets/base.py,sha256=1U_efZuU2ZdWV9UVAqFu1fx9_0PST_sJnaSIqbNvTF4,1787
194
- evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
195
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=39jbcZde4cOY6PpJHeb20v5PIg58ezFMoXjYO7U6Z2A,1582
196
- evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
197
- evalscope/perf/plugin/datasets/longalpaca.py,sha256=ohmq3Mp0JKeG8h8ef9GYqN7pBLTHzpF8g9KrrriRbwM,1165
198
- evalscope/perf/plugin/datasets/openqa.py,sha256=l9vCnEKBYU1a8uo49kArwSXu-ZaOXDHa2Pl3gp4yXE4,1395
199
- evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
200
- evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
- evalscope/perf/utils/analysis_result.py,sha256=o0wMcr9U0Gwd5lh5tAFCFpp3FmfwsaMppyJOLI2_sJ8,1213
202
- evalscope/perf/utils/benchmark_util.py,sha256=-wZyZnWrXsQOzPrWdxQVbQUVUAljzsfWV4-2Hw_xzpQ,5565
203
- evalscope/perf/utils/chat_service.py,sha256=ncMmeUDpOo7Kjkhe_TPDZY8ffoHTCl-B5szHJ4gipEo,8642
204
- evalscope/perf/utils/db_util.py,sha256=TeZzcGoWDde81EjpDOyV6c2B1ZM7NzRv-0cEmeorGjE,7356
205
- evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
206
- evalscope/perf/utils/local_server.py,sha256=AezbEdPGuE1esCBxXtXJWjFYTZfFb6SYC6bAfcaX5Gk,4316
207
- evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
208
- evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
209
- evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=IQmfcwkzCCV-bMbIC9M2fd-X99bHJ_r_qfIJjClClx0,2760
210
- evalscope/registry/config/cfg_pairwise_baseline.yaml,sha256=d05pBiqOk1ejcdd9XE-opZ_ersyttAesF3Iwa2df8O8,3580
211
- evalscope/registry/config/cfg_single.yaml,sha256=zjsUC3zhU8z7JURaJiz7npkUbFpP82q1ycqUmObC-hc,3056
212
- evalscope/registry/data/question.jsonl,sha256=WQw5FXvFYerdfwPK1L4YwrWX-TApeAr2X4Zxjznq-oc,12885
213
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
214
- evalscope/registry/data/prompt_template/prompt_templates.jsonl,sha256=F3PcsoO_UOCztLNmGDYd90K4z4eVufBWz5prKrcqHG0,10554
215
- evalscope/registry/data/qa_browser/battle.jsonl,sha256=2MXcYoMItBmttQxSMh2Oa0x51xxqJaWEgSuERUx1O_0,1185590
216
- evalscope/registry/data/qa_browser/category_mapping.yaml,sha256=3r9nUIciW9205qbtOQF7aI_etM191cM3vlWU8ueG2Co,484
217
- evalscope/registry/tasks/arc.yaml,sha256=phXsBLsAgvHWmU31J89QMnJJnUioRphraQrF9SrJ53c,863
218
- evalscope/registry/tasks/bbh.yaml,sha256=Ircb_-_eVri2B1MHeSrFs9vIol7RY8ZaWwdz1j57NHA,701
219
- evalscope/registry/tasks/bbh_mini.yaml,sha256=eZYash__XJcfJau0VqujehuYE2WnFzrWr9s9jCkNT8Q,775
220
- evalscope/registry/tasks/ceval.yaml,sha256=OoSPrz6c3jPy_T7NH162N1lemwwU2OcnT1zo3S-nPRA,703
221
- evalscope/registry/tasks/ceval_mini.yaml,sha256=Aw9zzw_6STRVA21mVuAvmGiWCdXzL6ktmdFOCiQWRw4,769
222
- evalscope/registry/tasks/cmmlu.yaml,sha256=mkTqqXCdc8bqMcCDI_J3d375RaxX_8v4jw5fyAKAW0A,703
223
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml,sha256=IzPU-edTVDVAr_LGyGoYTlaFvhH1iFp4LpAWKPIy2Lg,737
224
- evalscope/registry/tasks/general_qa.yaml,sha256=7uiytV5kgs23eP5sBCpg5KXs6b9yFtPFWU1pnCCQIpg,703
225
- evalscope/registry/tasks/gsm8k.yaml,sha256=KYLK-xtv_3qtgCZiwwP4-rP_ftc_qUmtsl1Tf-jNlCg,730
226
- evalscope/registry/tasks/mmlu.yaml,sha256=504yhHVfi9pvUBk_SGPs-Yx7R2hx_2_-nAFiGIiFGx4,726
227
- evalscope/registry/tasks/mmlu_mini.yaml,sha256=wVbosZ5Tm9pwLG5nCphalezXilIjcq5j33nz3MR7_BE,778
228
- evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
229
- evalscope/third_party/longbench_write/README.md,sha256=p7C5StphFFzEeMA2lcfKyeBlJgJiIjTSXvzwhw9md2k,3248
230
- evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
231
- evalscope/third_party/longbench_write/default_task.json,sha256=HPSnI7Ar7cqe86wzQnH2XsDtqmAuCDLy3sZm3MeNyKc,711
232
- evalscope/third_party/longbench_write/default_task.yaml,sha256=aQB-Cn-gEkdoI_26yOaeJWGpoI3-FxHBclZGAmxeBcc,579
233
- evalscope/third_party/longbench_write/eval.py,sha256=_fwV3f-Yq0qrkuZ6LBXvBiXnM6lpz6sOqd7BfYxEU80,11163
234
- evalscope/third_party/longbench_write/infer.py,sha256=MB0MdSM1qDx15FyrPSU6BXPbSGnBjxuTWqrcHAgbj9o,8318
235
- evalscope/third_party/longbench_write/longbench_write.py,sha256=MQzlIzv3sGlNgxgX0FPHtDIuAmgwThfBkMeKNcsR3U8,3926
236
- evalscope/third_party/longbench_write/utils.py,sha256=l6q9cNZLFVRvG9qYbxFxobuQkcMyteU9Y6NxyMU4tmQ,816
237
- evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
238
- evalscope/third_party/longbench_write/resources/judge.txt,sha256=LEI86IoOtqYUgvQnmXo8A8S8Ef6GEQKJXcrEWSauHVc,1884
239
- evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26ZSXzCTWWJTWXgFAYvOYupRuvdJUt_izOeSNOrV3k,54155
240
- evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
241
- evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
242
- evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
243
- evalscope/third_party/longbench_write/tools/data_etl.py,sha256=fSc4iT7_bdTvW20TbjlWme-k1pLqj_e2wXV8z831_Yw,5963
244
- evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
245
- evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
246
- evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
247
- evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2eexlehSi9LI4F3EPk-3JacrAb6ZoyxI,451
248
- evalscope/third_party/toolbench_static/eval.py,sha256=TqjMuuYePnD3bGRhQe1_9bIOlAW41kiFSztaEuppRLM,8237
249
- evalscope/third_party/toolbench_static/infer.py,sha256=WogwVXqDabdcsJ4uftZxAwR2wncp6HYpkS-fACEvjT4,9331
250
- evalscope/third_party/toolbench_static/requirements.txt,sha256=JMIbWAfKRYcQh771IT-EjroMagXchYDSgfgY7gcqx08,21
251
- evalscope/third_party/toolbench_static/toolbench_static.py,sha256=uXvyeyNWTZHFVASnOeMf1sqHUjy9NQ3r8wbkhUQJL1g,1930
252
- evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
253
- evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=UywM8SU2ByFTzD4YkbB17SXJyxmzY1QDwARDuGzbCvs,1452
254
- evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
255
- evalscope/tools/combine_reports.py,sha256=AJYB7ZAHiBu64mcs81bf40ClxukpU2NIUV53UYPiqUs,5388
256
- evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
257
- evalscope/tools/rewrite_eval_results.py,sha256=ZVi2hVjiTOmR_O5IaLv6qnQNpMz6FnDb95c83Fi3h4I,3193
258
- evalscope/utils/__init__.py,sha256=6RjACRYUSpGj6fkZ7NzYpl0lFppQCp9KVn5ktZe626s,128
259
- evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZmg,7670
260
- evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
261
- evalscope/utils/logger.py,sha256=Nhm8u_Wpd5BlVPdv9IBW_M3XMEcp5UbkOf1oN2HvGG0,3060
262
- evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
263
- evalscope/utils/task_utils.py,sha256=IMtBSBUp3H95Ko0vn8Q55Wmz2SFZXSfjVy49tyomL_g,537
264
- evalscope/utils/utils.py,sha256=bv_5zDNNzsODSwXz6M7TFkdfVJT6rw_orn_BG-qkijM,20567
265
- tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
266
- tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
267
- tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
268
- tests/cli/test_run.py,sha256=9GTF21NaUgERcF1Rkm9almO5-5pxsDF86Nw8fs8X7Hg,2926
269
- tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
270
- tests/perf/test_perf.py,sha256=Mn3nw2UJoR4qDLZ3Jhna3m52gD4mouc63uY_DLyXkG0,2889
271
- tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
272
- tests/rag/test_clip_benchmark.py,sha256=7NsOzgrpU9ou22M7fXtSFEnYt0iy2Q-ShIDL26Kp2gw,2597
273
- tests/rag/test_mteb.py,sha256=MOksxYseIQ6SD_iFFxMC9BinvDtB0vlNSFEGJt0SGl8,4608
274
- tests/rag/test_ragas.py,sha256=g3rAHymUzTyM6usIce6kItwyh1IocummK0BBPZiJPmY,4024
275
- tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
276
- tests/swift/test_run_swift_eval.py,sha256=Qop40c8jsHUbDTJe-Y8b_Aa8qn4Xstmu-FNGG14Gqik,5749
277
- tests/swift/test_run_swift_vlm_eval.py,sha256=p2i2ZRj-vG1YsQGsemvQLHcyhjy1EmUChyAjFEmVbCE,4899
278
- tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=c31jwQle_97ru5Dep91qsAqYjR1HDm1O9YZihRr0u0s,6018
279
- tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
280
- tests/vlm/test_vlmeval.py,sha256=21xi0nu4ghDB6_X-Pol7pTfK7aYkAYOp82TQ-MSQv-I,1757
281
- evalscope-0.7.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
282
- evalscope-0.7.1.dist-info/METADATA,sha256=MjxJQlVBhVgVF7ovyRYL9m7S1g8Oz3SCssvB3vbdTf8,23796
283
- evalscope-0.7.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
284
- evalscope-0.7.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
285
- evalscope-0.7.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
286
- evalscope-0.7.1.dist-info/RECORD,,