evalscope 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  3. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  22. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  23. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  24. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  25. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  26. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  27. evalscope/benchmarks/race/samples.jsonl +5 -0
  28. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  29. evalscope/cli/start_perf.py +8 -11
  30. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  31. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  32. evalscope/metrics/rouge_metric.py +30 -15
  33. evalscope/perf/arguments.py +179 -0
  34. evalscope/perf/benchmark.py +245 -0
  35. evalscope/perf/http_client.py +127 -711
  36. evalscope/perf/main.py +35 -0
  37. evalscope/perf/plugin/__init__.py +2 -0
  38. evalscope/perf/plugin/api/__init__.py +3 -0
  39. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  40. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  41. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  42. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  43. evalscope/perf/plugin/datasets/__init__.py +6 -0
  44. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  45. evalscope/perf/plugin/datasets/custom.py +21 -0
  46. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  47. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  48. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  49. evalscope/perf/plugin/datasets/openqa.py +38 -0
  50. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  51. evalscope/perf/plugin/registry.py +54 -0
  52. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  53. evalscope/perf/utils/benchmark_util.py +135 -0
  54. evalscope/perf/utils/chat_service.py +252 -0
  55. evalscope/perf/utils/db_util.py +200 -0
  56. evalscope/perf/utils/handler.py +46 -0
  57. evalscope/perf/utils/local_server.py +139 -0
  58. evalscope/registry/config/cfg_arena.yaml +77 -0
  59. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  60. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  61. evalscope/registry/config/cfg_single.yaml +78 -0
  62. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  63. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  64. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  65. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  66. evalscope/registry/data/question.jsonl +80 -0
  67. evalscope/third_party/longbench_write/README.md +118 -0
  68. evalscope/third_party/longbench_write/default_task.json +27 -0
  69. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  70. evalscope/third_party/toolbench_static/README.md +118 -0
  71. evalscope/third_party/toolbench_static/config_default.json +15 -0
  72. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  73. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  74. evalscope/utils/logger.py +18 -20
  75. evalscope/utils/utils.py +41 -42
  76. evalscope/version.py +2 -2
  77. evalscope-0.7.0.dist-info/LICENSE +203 -0
  78. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/METADATA +91 -33
  79. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/RECORD +99 -29
  80. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  81. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  82. tests/cli/__init__.py +1 -0
  83. tests/cli/test_run.py +76 -0
  84. tests/perf/__init__.py +1 -0
  85. tests/perf/test_perf.py +96 -0
  86. tests/rag/test_clip_benchmark.py +85 -0
  87. tests/rag/test_mteb.py +136 -0
  88. tests/rag/test_ragas.py +120 -0
  89. tests/swift/__init__.py +1 -0
  90. tests/swift/test_run_swift_eval.py +146 -0
  91. tests/swift/test_run_swift_vlm_eval.py +128 -0
  92. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  93. tests/test_run_all.py +12 -0
  94. tests/vlm/__init__.py +1 -0
  95. tests/vlm/test_vlmeval.py +59 -0
  96. evalscope/perf/_logging.py +0 -32
  97. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  98. evalscope/perf/datasets/openqa.py +0 -22
  99. evalscope/perf/plugin_registry.py +0 -35
  100. evalscope/perf/query_parameters.py +0 -42
  101. evalscope/perf/server_sent_event.py +0 -43
  102. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  103. /evalscope/perf/{datasets → utils}/__init__.py +0 -0
  104. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope/preprocess → tests}/__init__.py +0 -0
  106. {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
@@ -0,0 +1,7 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -1698100170803872933,
4
+ "language": "chinese",
5
+ "instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成查询和答案。确保答案完全忠实于上下文,仅使用直接来自提供节点的信息。### 指令:\n1. **生成查询**:根据上下文、角色、主题、风格和长度,创建一个与角色视角一致并反映主题的问题。\n2. **生成答案**:仅使用提供的上下文内容,创建一个忠实且详细的答案。不要包含任何不在或无法从给定上下文中推断的信息。\n### 示例输出:\n\n",
6
+ "examples": []
7
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2334929353739018813,
4
+ "language": "chinese",
5
+ "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "themes": [
10
+ "同理心",
11
+ "包容性",
12
+ "远程工作"
13
+ ],
14
+ "personas": [
15
+ {
16
+ "name": "人力资源经理",
17
+ "role_description": "专注于包容性和员工支持。"
18
+ },
19
+ {
20
+ "name": "远程团队负责人",
21
+ "role_description": "管理远程团队沟通。"
22
+ }
23
+ ]
24
+ },
25
+ "output": {
26
+ "mapping": {
27
+ "HR Manager": [
28
+ "包容性",
29
+ "同理心"
30
+ ],
31
+ "Remote Team Lead": [
32
+ "远程工作",
33
+ "同理心"
34
+ ]
35
+ }
36
+ }
37
+ }
38
+ ]
39
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -2189588237940965149,
4
+ "language": "chinese",
5
+ "instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息,请回答“是”。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "response": "苹果派通常是双层皮的。",
10
+ "retrieved_contexts": [
11
+ "苹果派是一种水果派,其主要馅料成分是苹果。",
12
+ "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
13
+ "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
14
+ ]
15
+ },
16
+ "output": {
17
+ "faithful": true
18
+ }
19
+ },
20
+ {
21
+ "input": {
22
+ "response": "苹果派味道不好。",
23
+ "retrieved_contexts": [
24
+ "苹果派是一种水果派,其主要馅料成分是苹果。",
25
+ "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
26
+ "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
27
+ ]
28
+ },
29
+ "output": {
30
+ "faithful": false
31
+ }
32
+ }
33
+ ]
34
+ }
@@ -0,0 +1,36 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -7302860412443151372,
4
+ "language": "chinese",
5
+ "instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致,则回答 - True,否则为 False。\n",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "user_input": "传统玛格丽塔披萨的主要成分是什么?",
10
+ "response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
11
+ "retrieved_contexts": [
12
+ "传统的玛格丽塔披萨由薄薄的饼皮组成。",
13
+ "主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
14
+ "它是最简单和最经典的披萨类型之一。"
15
+ ]
16
+ },
17
+ "output": {
18
+ "relevance": true
19
+ }
20
+ },
21
+ {
22
+ "input": {
23
+ "user_input": "谁在2021年奥斯卡颁奖典礼上获得了最佳男演员奖?",
24
+ "response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
25
+ "retrieved_contexts": [
26
+ "第93届奥斯卡颁奖典礼于2021年举行。",
27
+ "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色赢得了最佳男演员奖。",
28
+ "由于COVID-19的限制,这次活动具有独特性。"
29
+ ]
30
+ },
31
+ "output": {
32
+ "relevance": false
33
+ }
34
+ }
35
+ ]
36
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -7036736759899743798,
4
+ "language": "chinese",
5
+ "instruction": "从给定文本中提取命名实体,限制输出为最重要的实体。确保实体数量不超过指定的最大值。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "text": "特斯拉和SpaceX的首席执行官埃隆·马斯克宣布计划将业务扩展到欧洲和亚洲的新地点。\n 此次扩展预计将创造数千个就业机会,特别是在柏林和上海等城市。",
10
+ "max_num": 10
11
+ },
12
+ "output": {
13
+ "entities": [
14
+ "埃隆·马斯克",
15
+ "特斯拉",
16
+ "SpaceX",
17
+ "欧洲",
18
+ "亚洲",
19
+ "柏林",
20
+ "上海"
21
+ ]
22
+ }
23
+ }
24
+ ]
25
+ }
@@ -0,0 +1,7 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -1422723613754983378,
4
+ "language": "chinese",
5
+ "instruction": "根据指定的条件(角色、术语、风格、长度)和提供的上下文生成查询和答案。确保答案完全忠实于上下文,仅使用直接来自提供上下文的信息。### 指令:\n1. **生成查询**:根据上下文、角色、术语、风格和长度,创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**:仅使用提供的上下文中的内容,构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n### 示例输出:\n\n",
6
+ "examples": []
7
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2334929353739018813,
4
+ "language": "chinese",
5
+ "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "themes": [
10
+ "同理心",
11
+ "包容性",
12
+ "远程工作"
13
+ ],
14
+ "personas": [
15
+ {
16
+ "name": "人力资源经理",
17
+ "role_description": "专注于包容性和员工支持。"
18
+ },
19
+ {
20
+ "name": "远程团队领导",
21
+ "role_description": "管理远程团队沟通。"
22
+ }
23
+ ]
24
+ },
25
+ "output": {
26
+ "mapping": {
27
+ "HR Manager": [
28
+ "包容性",
29
+ "同理心"
30
+ ],
31
+ "Remote Team Lead": [
32
+ "远程工作",
33
+ "同理心"
34
+ ]
35
+ }
36
+ }
37
+ }
38
+ ]
39
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": -5467318232123540806,
4
+ "language": "chinese",
5
+ "instruction": "将给定文本总结为少于10个句子。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
10
+ },
11
+ "output": {
12
+ "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
13
+ }
14
+ }
15
+ ]
16
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "ragas_version": "0.2.5",
3
+ "original_hash": 2452110859551524285,
4
+ "language": "chinese",
5
+ "instruction": "从给定的文本中提取主要主题和概念。",
6
+ "examples": [
7
+ {
8
+ "input": {
9
+ "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
10
+ "max_num": 10
11
+ },
12
+ "output": {
13
+ "output": [
14
+ "人工智能",
15
+ "自动化",
16
+ "数据分析",
17
+ "创新",
18
+ "自动驾驶汽车",
19
+ "个性化推荐"
20
+ ]
21
+ }
22
+ }
23
+ ]
24
+ }
@@ -0,0 +1,18 @@
1
+ import typing as t
2
+
3
+ from pydantic import BaseModel
4
+ from ragas.prompt import PydanticPrompt, StringIO
5
+ from ragas.testset.persona import Persona
6
+
7
+
8
+ class PersonaGenerationPromptZH(PydanticPrompt[StringIO, Persona]):
9
+ instruction: str = ('使用提供的摘要,生成一个可能会与内容互动或从中受益的角色。包括一个独特的名字和一个简洁的角色描述。')
10
+ input_model: t.Type[StringIO] = StringIO
11
+ output_model: t.Type[Persona] = Persona
12
+ examples: t.List[t.Tuple[StringIO, Persona]] = [(
13
+ StringIO(text='《数字营销指南》解释了在各种在线平台上吸引受众的策略。'),
14
+ Persona(
15
+ name='数字营销专家',
16
+ role_description='专注于吸引受众并在线上提升品牌。',
17
+ ),
18
+ )]
@@ -1,10 +1,11 @@
1
+ import copy
2
+ import subprocess
3
+ from functools import partial
1
4
  from typing import Optional, Union
2
- from evalscope.utils import is_module_installed, get_valid_list
5
+
3
6
  from evalscope.backend.base import BackendManager
7
+ from evalscope.utils import get_valid_list, is_module_installed
4
8
  from evalscope.utils.logger import get_logger
5
- from functools import partial
6
- import subprocess
7
- import copy
8
9
 
9
10
  logger = get_logger()
10
11
 
@@ -19,6 +20,7 @@ class ExecutionMode:
19
20
 
20
21
 
21
22
  class VLMEvalKitBackendManager(BackendManager):
23
+
22
24
  def __init__(self, config: Union[str, dict], **kwargs):
23
25
  """BackendManager for VLM Evaluation Kit
24
26
 
@@ -36,7 +38,6 @@ class VLMEvalKitBackendManager(BackendManager):
36
38
 
37
39
  self._check_valid()
38
40
 
39
-
40
41
  def _check_valid(self):
41
42
  # Ensure not both model and datasets are empty
42
43
  if not self.args.data or not self.args.model:
@@ -45,15 +46,15 @@ class VLMEvalKitBackendManager(BackendManager):
45
46
  # Check datasets
46
47
  valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
47
48
  if len(invalid_datasets) != 0:
48
- logger.warning(f"Using custom dataset: {invalid_datasets}, ")
49
-
49
+ logger.warning(f'Using custom dataset: {invalid_datasets}, ')
50
+
50
51
  # Check model
51
52
  if isinstance(self.args.model[0], dict):
52
53
  model_names = [model['name'] for model in self.args.model]
53
54
  valid_model_names, invalid_model_names = get_valid_list(model_names, self.valid_model_names)
54
55
  assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
55
56
  f'refer to the following list to get proper model name: {self.valid_model_names}'
56
-
57
+
57
58
  # set model_cfg
58
59
  new_model_names = []
59
60
  for model_cfg in self.args.model:
@@ -62,19 +63,15 @@ class VLMEvalKitBackendManager(BackendManager):
62
63
  if model_name == 'CustomAPIModel':
63
64
  model_type = model_cfg['type']
64
65
  remain_cfg = copy.deepcopy(model_cfg)
65
- del remain_cfg['name'] # remove not used args
66
- del remain_cfg['type'] # remove not used args
67
-
68
- self.valid_models.update({
69
- model_type: partial(model_class,
70
- model=model_type,
71
- **remain_cfg)
72
- })
66
+ del remain_cfg['name'] # remove not used args
67
+ del remain_cfg['type'] # remove not used args
68
+
69
+ self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
73
70
  new_model_names.append(model_type)
74
71
  else:
75
72
  remain_cfg = copy.deepcopy(model_cfg)
76
- del remain_cfg['name'] # remove not used args
77
-
73
+ del remain_cfg['name'] # remove not used args
74
+
78
75
  self.valid_models[model_name] = partial(model_class, **remain_cfg)
79
76
  new_model_names.append(model_name)
80
77
 
@@ -83,7 +80,7 @@ class VLMEvalKitBackendManager(BackendManager):
83
80
  elif isinstance(self.args.model[0], str):
84
81
  valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
85
82
  if len(invalid_datasets) != 0:
86
- logger.warning(f"Using custom dataset: {invalid_datasets}, ")
83
+ logger.warning(f'Using custom dataset: {invalid_datasets}, ')
87
84
 
88
85
  @property
89
86
  def cmd(self):
@@ -127,7 +124,7 @@ class VLMEvalKitBackendManager(BackendManager):
127
124
  f'--data {" ".join(self.args.data)} ' \
128
125
  f'{self.get_restore_arg("verbose", self.args.verbose)} ' \
129
126
  f'{self.get_restore_arg("ignore", self.args.ignore)} ' \
130
- f'{self.get_restore_arg("rerun", self.args.rerun)} ' \
127
+ f'{self.get_restore_arg("reuse", self.args.reuse)} ' \
131
128
  f'{self.get_arg_with_default("work-dir", self.args.work_dir)} ' \
132
129
  f'{self.get_arg_with_default("limit", self.args.limit)} ' \
133
130
  f'{self.get_arg_with_default("mode", self.args.mode)} ' \
@@ -141,7 +138,12 @@ class VLMEvalKitBackendManager(BackendManager):
141
138
  if run_mode == ExecutionMode.CMD:
142
139
  logger.info(f'** Run command: {self.cmd}')
143
140
  try:
144
- subprocess.run(self.cmd, check=True, ext=True, shell=True,)
141
+ subprocess.run(
142
+ self.cmd,
143
+ check=True,
144
+ ext=True,
145
+ shell=True,
146
+ )
145
147
  except subprocess.CalledProcessError as e:
146
148
  logger.error(f'** Run command failed: {e.stderr}')
147
149
  raise
@@ -0,0 +1 @@
1
+ {'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}
@@ -0,0 +1,5 @@
1
+ {'input': '毛毛骑在牛背上过河,他共有甲、乙、丙、丁4头牛,甲过河要20分钟,乙过河要30分钟,丙过河要40分钟,丁过河要50分钟。毛毛每次只能赶2头牛过河,要把4头牛都赶到对岸去,最少要多少分钟?', 'A': '190', 'B': '180', 'C': '170', 'D': '160', 'target': 'D'}
2
+ {'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
3
+ {'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
4
+ {'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
5
+ {'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
@@ -0,0 +1,5 @@
1
+ {'input': 'A "dished face" profile is often associated with', 'A': 'a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'B': 'a recessive maxilla due to failure of elongation of the cranial base.', 'C': 'an enlarged frontal bone due to hydrocephaly.', 'D': 'defective development of the maxillary air sinus.', 'target': 'B'}
2
+ {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.', 'A': 'Service quality.', 'B': 'Service action.', 'C': 'Service recovery.', 'D': 'Service satisfaction.', 'target': 'A'}
3
+ {'input': ' Information collected for the first time specifically for a marketing research study is called:', 'A': 'Secondary research.', 'B': 'Primary research.', 'C': 'Soft research.', 'D': 'Experimental research.', 'target': 'B'}
4
+ {'input': "This includes advertisements that contain 'call-to-response' mechanisms such as telephone numbers, website addresses, email and postal addresses:", 'A': 'Direct response advertising.', 'B': 'Sales promotions.', 'C': 'Mass media advertising.', 'D': 'Public relations.', 'target': 'A'}
5
+ {'input': 'Which of the following is not part of the external marketing environment?', 'A': 'Political.', 'B': 'Legal.', 'C': 'Product.', 'D': 'Socio-cultural.', 'target': 'C'}
@@ -0,0 +1,5 @@
1
+ {'example_id': 'middle4227.txt', 'article': 'There are many kinds...ealthy.\n,.', 'answer': 'D', 'question': 'We may read this pas... in _ .', 'options': ['a letter', 'a story', 'a newspaper', 'a health magazine']}
2
+ {'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
3
+ {'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
4
+ {'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
5
+ {'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}
@@ -0,0 +1,5 @@
1
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the man behind The Chipmunks?"}], "ideal": ["David Seville", "david seville"]}
2
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}], "ideal": ["Sunset Blvd", "West Sunset Boulevard", "Sunset Boulevard", "Sunset Bulevard", "Sunset Blvd.", "sunset boulevard", "sunset bulevard", "west sunset boulevard", "sunset blvd"]}
3
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who was the next British Prime Minister after Arthur Balfour?"}], "ideal": ["Sir Henry Campbell-Bannerman", "Campbell-Bannerman", "Campbell Bannerman", "Sir Henry Campbell Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman", "henry campbell bannerman", "sir henry campbell bannerman", "campbell bannerman"]}
4
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "Who had a 70s No 1 hit with Kiss You All Over?"}], "ideal": ["Internal exile", "Exiles", "Transported for life", "Exile (politics and government)", "Voluntary exile", "Sent into exile", "Exile and Banishment", "Self-exile", "Forced exile", "Exile", "Exile in Greek tragedy", "Banish", "Banishment", "exiles", "voluntary exile", "forced exile", "banish", "self exile", "exile politics and government", "exile in greek tragedy", "sent into exile", "banishment", "transported for life", "exile", "internal exile", "exile and banishment"]}
5
+ {"input": [{"role": "system", "content": "Follow the given examples and answer the question."}, {"role": "user", "content": "What claimed the life of singer Kathleen Ferrier?"}], "ideal": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase", "aids related cancer", "sporadic cancer", "cancer disease", "malignant tumors", "cancers", "carcinophobia", "cancer", "cancer diagnosis", "malignant neoplastic disease", "malignant neoplasm", "tumour virus", "cancer medicine", "deaths by cancer", "malignant tumour", "epithelial cancers", "solid cancer", "cancerous", "borderline cancer", "invasive cancer", "anti cancer", "cancer pathology", "cancer signs", "cancer aromatase", "cancer therapy", "financial toxicity", "cancerophobia", "cancer en cuirasse", "cancer patient", "cancerous tumor", "malignant cancer", "malignant neoplasms", "tumor medication", "signs of cancer", "malignacy", "malignant tumor", "cancer medication", "microtumor", "malignancies", "malignant lesion", "malignant growth"]}
@@ -1,20 +1,21 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
- from abc import abstractmethod
3
- import os, sys, time
2
+ import os
4
3
  from argparse import ArgumentParser
5
- import subprocess
6
-
7
4
 
8
5
  from evalscope.cli.base import CLICommand
9
- from evalscope.perf.http_client import add_argument, run_perf_benchmark
6
+ from evalscope.perf.arguments import add_argument
7
+ from evalscope.perf.main import run_perf_benchmark
10
8
 
11
9
  current_path = os.path.dirname(os.path.abspath(__file__))
12
10
  root_path = os.path.dirname(current_path)
11
+
12
+
13
13
  def subparser_func(args):
14
14
  """ Function which will be called for a specific sub parser.
15
15
  """
16
16
  return PerfBenchCMD(args)
17
-
17
+
18
+
18
19
  class PerfBenchCMD(CLICommand):
19
20
  name = 'perf'
20
21
 
@@ -28,10 +29,6 @@ class PerfBenchCMD(CLICommand):
28
29
  parser = parsers.add_parser(PerfBenchCMD.name)
29
30
  add_argument(parser)
30
31
  parser.set_defaults(func=subparser_func)
31
-
32
+
32
33
  def execute(self):
33
34
  run_perf_benchmark(self.args)
34
-
35
-
36
-
37
-