evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/perf/__init__.py
File without changes
evalscope/perf/_logging.py
@@ -0,0 +1,32 @@
+ import logging
+ import os
+
+
+ logger = logging.getLogger('perf')
+
+
+ def enable_logging():
+     level = os.environ.get('LOGGING_LEVEL', 'info')
+     if level is not None:  # set logging level.
+         if level not in ['info', 'debug']:
+             # logging level env is set but has an invalid value, use the default.
+             level = 'info'
+         if level == 'info':
+             logger.setLevel(logging.INFO)
+         else:
+             logger.setLevel(logging.DEBUG)
+     # set default logging handler
+     console_handler = logging.StreamHandler()
+     formatter = logging.Formatter(
+         '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s - %(message)s'  # noqa E501
+     )
+     # formatter = logging.Formatter(
+     #     '%(asctime)s - %(name)s - %(levelname)s - %(message)s'  # noqa E501
+     # )
+     console_handler.setFormatter(formatter)
+     logger.addHandler(console_handler)
+
+
+ # in release the dashscope log is disabled
+ # you can enable the dashscope log for debugging.
+ enable_logging()
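The perf logger above is configured at import time from the LOGGING_LEVEL environment variable. A minimal sketch of switching it to debug output before the perf modules are imported (the import path follows the file layout above and is an assumption):

import os

# any value other than 'debug' falls back to INFO, per enable_logging() above
os.environ['LOGGING_LEVEL'] = 'debug'

import logging
import evalscope.perf._logging  # noqa: F401  -- importing the module runs enable_logging()

logging.getLogger('perf').debug('perf debug logging is now visible')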
evalscope/perf/api_plugin_base.py
@@ -0,0 +1,60 @@
+ from abc import abstractmethod
+ from typing import Any, Dict, List, Tuple
+
+ from evalscope.perf.query_parameters import QueryParameters
+
+ class ApiPluginBase:
+     def __init__(self, model_path: str) -> None:
+         self.model_path = model_path
+
+     @abstractmethod
+     def build_request(self, messages: List[Dict], param: QueryParameters) -> Dict:
+         """Build an api request body.
+
+         Args:
+             messages (List[Dict]): The messages generated by the dataset.
+             param (QueryParameters): The query parameters.
+
+         Raises:
+             NotImplementedError: Not implemented.
+
+         Returns:
+             Dict: The api request body.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def parse_responses(self,
+                         responses: List,
+                         request: Any = None,
+                         **kwargs: Any) -> Tuple[int, int]:
+         """Parse responses and return the number of request and response tokens.
+
+         Args:
+             responses (List[bytes]): List of http response bodies; for stream output
+                 there are multiple responses, each bytes; in general only one.
+             request (Any): The request body.
+
+         Returns:
+             Tuple: (Number of prompt_tokens and number of completion_tokens).
+         """
+         raise NotImplementedError
+
+     @staticmethod
+     def replace_values(input_json: Any, model: str, prompt: str):
+         if isinstance(input_json, dict):
+             for key, value in input_json.items():
+                 if isinstance(value, str):
+                     input_json[key] = value.replace("%m", model).replace("%p", prompt)
+                 else:
+                     ApiPluginBase.replace_values(value, model, prompt)
+         elif isinstance(input_json, list):
+             for idx, item in enumerate(input_json):
+                 if isinstance(item, str):
+                     input_json[idx] = item.replace("%m", model).replace("%p", prompt)
+                 else:
+                     ApiPluginBase.replace_values(item, model, prompt)
+         elif isinstance(input_json, str):
+             input_json = input_json.replace("%m", model).replace("%p", prompt)
+         else:
+             pass
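ApiPluginBase.replace_values walks a nested dict/list structure in place and substitutes the %m and %p placeholders with the model name and the prompt (a bare string passed at the top level is not updated, since that branch only rebinds a local). A small sketch with a made-up template:

from evalscope.perf.api_plugin_base import ApiPluginBase

# hypothetical query template: '%m' marks the model name, '%p' the prompt
template = {'model': '%m', 'input': {'messages': [{'role': 'user', 'content': '%p'}]}}
ApiPluginBase.replace_values(template, 'llama3', 'Hello!')
print(template)
# {'model': 'llama3', 'input': {'messages': [{'role': 'user', 'content': 'Hello!'}]}}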
evalscope/perf/custom_api.py
@@ -0,0 +1,87 @@
+ from typing import Any, Dict, Iterator, List
+ import json
+ from evalscope.perf.api_plugin_base import ApiPluginBase
+ from transformers import AutoTokenizer
+ from evalscope.perf.plugin_registry import register_api
+ from evalscope.perf.query_parameters import QueryParameters
+
+ @register_api("custom")
+ class CustomPlugin(ApiPluginBase):
+     """Support tensorrt-llm triton server
+     """
+     def __init__(self, mode_path: str):
+         """Init the plugin
+
+         Args:
+             mode_path (str): The model path; the tokenizer weights in the
+                 model directory are used to count the input and output
+                 tokens.
+         """
+         super().__init__(model_path=mode_path)
+         if mode_path is not None:
+             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
+         else:
+             self.tokenizer = None
+
+     def build_request(self, messages: List[Dict], param: QueryParameters) -> Dict:
+         """Build the openai format request based on the prompt and dataset.
+
+         Args:
+             messages (List[Dict]): The basic messages to generate the query.
+             param (QueryParameters): The query parameters.
+
+         Raises:
+             Exception: NotImplemented
+
+         Returns:
+             Dict: The request body. None if the prompt format is invalid.
+         """
+         try:
+             query = json.loads(param.query_template)
+             ApiPluginBase.replace_values(query, param.model, messages[0]['content'])
+             return query
+         except Exception as e:
+             print(e)
+             print('Prompt: %s invalid!' % messages)
+             return None
+
+     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
+         """Parse responses and return the number of request and response tokens.
+         Sample of the output delta:
+         {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
+
+
+         Args:
+             responses (List[bytes]): List of http response bodies; for stream output
+                 there are multiple responses, in general only one.
+             kwargs (Any): The command line --parameter content.
+         Returns:
+             Tuple: Number of prompt tokens and number of completion tokens.
+         """
+         full_response_content = ''
+         delta_contents = {}
+         input_tokens = None
+         output_tokens = None
+         for response in responses:
+             js = json.loads(response)
+             # {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble",
+             # "model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"性"}
+             if 'text_output' in js:
+                 if 0 in delta_contents:
+                     delta_contents[0].append(js['text_output'])
+                 else:
+                     delta_contents[0] = [js['text_output']]
+         if input_tokens is None and output_tokens is None and self.tokenizer is not None:
+             input_tokens = 0
+             output_tokens = 0
+             for _, choice_contents in delta_contents.items():
+                 full_response_content = ''.join([m for m in choice_contents])
+                 input_tokens += len(self.tokenizer.encode(request['text_input']))
+                 output_tokens += len(self.tokenizer.encode(full_response_content))
+         elif input_tokens is None and output_tokens is None:  # no usage info returned.
+             input_tokens = 0
+             output_tokens = 0
+
+         return input_tokens, output_tokens
+
+
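The "custom" plugin simply JSON-decodes param.query_template and substitutes the placeholders, so the template carries the full request shape expected by the server; parse_responses then concatenates the streamed text_output chunks and counts tokens with the local tokenizer. A hedged example of such a template (the text_input field matches the triton-style responses shown in the comments above; the other fields are assumptions about your server):

# sketch: a query template for the 'custom' plugin; '%m' and '%p' are replaced
# by ApiPluginBase.replace_values with the model name and the dataset prompt.
query_template = '{"model": "%m", "text_input": "%p", "max_tokens": 256, "stream": true}'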
evalscope/perf/dashscope_api.py
@@ -0,0 +1,84 @@
+
+ from sys import maxsize
+ import sys
+ from typing import Any, Dict, Iterator, List
+ import json
+ from evalscope.perf.api_plugin_base import ApiPluginBase
+
+ from evalscope.perf.plugin_registry import register_api
+ from evalscope.perf.query_parameters import QueryParameters
+
+ @register_api("dashscope")
+ class DashScopeApiPlugin(ApiPluginBase):
+     def __init__(self, mode_path: str):
+         """Init the plugin
+
+         Args:
+             mode_path (str): The model path; the tokenizer weights in the
+                 model directory are used to count the input and output
+                 tokens.
+         """
+         super().__init__(model_path=mode_path)
+
+     def build_request(self, messages: List[Dict], param: QueryParameters) -> Dict:
+         """Build the openai format request based on the prompt and dataset.
+
+         Args:
+             messages (List[Dict]): The basic messages to generate the query.
+             param (QueryParameters): The query parameters.
+
+         Raises:
+             Exception: NotImplemented
+
+         Returns:
+             Dict: The request body. None if the prompt format is invalid.
+         """
+         try:
+             if param.query_template is not None:
+                 query = json.loads(param.query_template)
+                 query['input']['messages'] = messages  # replace template content with the messages.
+                 return self.__compose_query_from_parameter(query, param)
+             else:
+                 query = {'messages': messages}
+                 return self.__compose_query_from_parameter(query, param)
+         except Exception as e:
+             print(e)
+             return None
+     def __compose_query_from_parameter(self, payload: Dict, param: QueryParameters):
+         payload['model'] = param.model
+         if 'parameters' not in payload:
+             payload['parameters'] = {}
+         if param.max_tokens is not None:
+             payload['parameters']['max_tokens'] = param.max_tokens
+         if param.frequency_penalty is not None:
+             payload['parameters']['frequency_penalty'] = param.frequency_penalty
+         if param.logprobs is not None:
+             payload['parameters']['logprobs'] = param.logprobs
+         if param.n_choices is not None:
+             payload['parameters']['n'] = param.n_choices
+         if param.seed is not None:
+             payload['parameters']['seed'] = param.seed
+         if param.stop is not None:
+             payload['parameters']['stop'] = param.stop
+         if param.stream is not None and not param.stream:
+             payload['parameters']['stream'] = param.stream
+         if param.temperature is not None:
+             payload['parameters']['temperature'] = param.temperature
+         if param.top_p is not None:
+             payload['parameters']['top_p'] = param.top_p
+         return payload
+
+     def parse_responses(self, responses, **kwargs) -> Dict:
+         """Parse responses and return the number of request and response tokens.
+
+         Args:
+             responses (List[bytes]): List of http response bodies; for stream output
+                 there are multiple responses, in general only one.
+             kwargs (Any): The command line --parameter content.
+
+         Returns:
+             Tuple: Number of prompt tokens and number of completion tokens.
+         """
+         last_response = responses[-1]
+         js = json.loads(last_response)
+         return js['usage']['input_tokens'], js['usage']['output_tokens']
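When no query template is given, build_request wraps the dataset messages in a bare payload and __compose_query_from_parameter copies the sampling settings into payload['parameters']. A sketch of the resulting DashScope-style body (the values are illustrative):

# illustrative request body as composed above; only parameters that were set are copied in
body = {
    'messages': [{'role': 'user', 'content': 'Hello'}],
    'model': 'qwen-turbo',    # from param.model
    'parameters': {
        'max_tokens': 128,    # from param.max_tokens
        'temperature': 0.7,   # from param.temperature
        'top_p': 0.8,         # from param.top_p
    },
}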
evalscope/perf/dataset_plugin_base.py
@@ -0,0 +1,64 @@
+ from abc import abstractmethod
+ import sys
+ from typing import Any, Dict, Iterator, List, Tuple
+ import json
+
+ from evalscope.perf.query_parameters import QueryParameters
+
+ class DatasetPluginBase:
+     def __init__(self, query_parameters: QueryParameters):
+         """Build the dataset plugin.
+
+         Args:
+             query_parameters (QueryParameters): The query parameters, including the dataset path.
+         """
+         self.query_parameters = query_parameters
+
+     def __next__(self):
+         for item in self.build_messages():
+             yield item
+         raise StopIteration
+
+     def __iter__(self):
+         return self.build_messages()
+
+     @abstractmethod
+     def build_messages(self) -> Iterator[List[Dict]]:
+         """Build the request messages.
+
+         Raises:
+             NotImplementedError: Not implemented.
+
+         Yields:
+             Iterator[List[Dict]]: Yield request messages.
+         """
+         raise NotImplementedError
+
+     def dataset_line_by_line(self, dataset: str) -> Iterator[str]:
+         """Get the content of the dataset line by line.
+
+         Args:
+             dataset (str): The dataset path.
+
+         Yields:
+             Iterator[str]: Each line of the file.
+         """
+         with open(dataset, 'r', encoding='utf-8') as f:
+             for line in f:
+                 yield line
+
+     def dataset_json_list(self, dataset: str) -> Iterator[Dict]:
+         """Read data from a file which contains a list of requests.
+         Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
+
+         Args:
+             dataset (str): The dataset path.
+
+         Yields:
+             Iterator[Dict]: Each request object.
+         """
+         with open(dataset, 'r', encoding='utf-8') as f:
+             content = f.read()
+             data = json.loads(content)
+             for item in data:
+                 yield item
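DatasetPluginBase leaves only build_messages to implement; the registered plugins that follow (line_by_line, longalpaca, openqa) all follow this pattern. A hedged sketch of a custom plugin and how an instance is consumed (the 'demo' plugin name and the in-memory prompts are illustrative, not part of the package):

from typing import Dict, Iterator, List

from evalscope.perf.dataset_plugin_base import DatasetPluginBase
from evalscope.perf.plugin_registry import register_dataset
from evalscope.perf.query_parameters import QueryParameters


@register_dataset('demo')  # hypothetical plugin name
class DemoDatasetPlugin(DatasetPluginBase):
    """Yield a few fixed prompts instead of reading a file (illustration only)."""

    def __init__(self, query_parameters: QueryParameters):
        super().__init__(query_parameters)

    def build_messages(self) -> Iterator[List[Dict]]:
        for prompt in ['Hello', 'Summarize this sentence.']:
            yield [{'role': 'user', 'content': prompt}]

# __iter__ delegates to build_messages(), so an instance can be looped over directly:
#     for messages in DemoDatasetPlugin(query_parameters): ...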
evalscope/perf/datasets/__init__.py
File without changes
evalscope/perf/datasets/line_by_line.py
@@ -0,0 +1,18 @@
+ import sys
+ from typing import Dict, Iterator, List
+ from evalscope.perf.dataset_plugin_base import DatasetPluginBase
+ from evalscope.perf.plugin_registry import register_dataset
+ from evalscope.perf.query_parameters import QueryParameters
+
+ @register_dataset('line_by_line')
+ class LineByLineDatasetPlugin(DatasetPluginBase):
+     """Read dataset and return prompt.
+     """
+     def __init__(self, query_parameters: QueryParameters):
+         super().__init__(query_parameters)
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
+             prompt = item.strip()
+             if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
+                 yield [{'role': 'user', 'content': prompt}]
evalscope/perf/datasets/longalpaca_12k.py
@@ -0,0 +1,20 @@
+ import sys
+ from typing import Any, Dict, Iterator, List
+ from evalscope.perf.dataset_plugin_base import DatasetPluginBase
+
+ from evalscope.perf.plugin_registry import register_dataset
+ from evalscope.perf.query_parameters import QueryParameters
+
+ @register_dataset('longalpaca')
+ class LongAlpacaDatasetPlugin(DatasetPluginBase):
+     """Read data from file which is list of requests.
+     Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
+     """
+     def __init__(self, query_parameters: QueryParameters):
+         super().__init__(query_parameters)
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         for item in self.dataset_json_list(self.query_parameters.dataset_path):
+             prompt = item['instruction'].strip()
+             if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
+                 yield [{'role': 'user', 'content': prompt}]
+ yield [{'role': 'user', 'content': prompt}]
@@ -0,0 +1,22 @@
1
+ from sys import maxsize
2
+ import sys
3
+ from typing import Any, Dict, Iterator, List
4
+ import json
5
+ from evalscope.perf.dataset_plugin_base import DatasetPluginBase
6
+ from evalscope.perf.plugin_registry import register_dataset
7
+ from evalscope.perf.query_parameters import QueryParameters
8
+
9
+ @register_dataset('openqa')
10
+ class OpenqaDatasetPlugin(DatasetPluginBase):
11
+ """Read dataset and return prompt.
12
+ Datasets: https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese/blob/main/open_qa.jsonl
13
+ """
14
+ def __init__(self, query_parameters: QueryParameters):
15
+ super().__init__(query_parameters)
16
+
17
+ def build_messages(self) -> Iterator[List[Dict]]:
18
+ for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
19
+ item = json.loads(item)
20
+ prompt = item['question'].strip()
21
+ if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
22
+ yield [{'role': 'user', 'content': prompt}]
@@ -0,0 +1,24 @@
1
+ import sqlite3
2
+ import base64
3
+ import pickle
4
+ import json
5
+ result_db_path = 'db_name.db'
6
+ con = sqlite3.connect(result_db_path)
7
+ query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
8
+ FROM result WHERE success='True'"
9
+ # how to save base64.b64encode(pickle.dumps(benchmark_data["request"])).decode("ascii"),
10
+ with con:
11
+ rows = con.execute(query_sql).fetchall()
12
+ if len(rows) > 0:
13
+ for row in rows:
14
+ request = row[0]
15
+ responses = row[1]
16
+ request = base64.b64decode(request)
17
+ request = pickle.loads(request)
18
+ responses = base64.b64decode(responses)
19
+ responses = pickle.loads(responses)
20
+ response_content = ''
21
+ for response in responses:
22
+ response = json.loads(response)
23
+ response_content += response['choices'][0]['delta']['content']
24
+ print('prompt: %s, tokens: %s, completion: %s, tokens: %s' % (request['messages'][0]['content'], row[2], response_content, row[3]))
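The same result table can also be aggregated without unpickling the payloads; a small hedged sketch that reuses the db path and columns from the script above:

import sqlite3

con = sqlite3.connect('db_name.db')  # same result db as above
with con:
    row = con.execute(
        "SELECT COUNT(*), SUM(prompt_tokens), SUM(completion_tokens) "
        "FROM result WHERE success='True'").fetchone()
print('requests: %s, prompt tokens: %s, completion tokens: %s' % row)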