evalscope 0.8.1__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (36) hide show
  1. evalscope/config.py +3 -1
  2. evalscope/evaluator/evaluator.py +1 -0
  3. evalscope/models/model_adapter.py +1 -1
  4. evalscope/perf/arguments.py +1 -0
  5. evalscope/perf/benchmark.py +1 -1
  6. evalscope/perf/main.py +3 -1
  7. evalscope/perf/plugin/api/openai_api.py +51 -47
  8. evalscope/perf/utils/local_server.py +1 -0
  9. evalscope/version.py +2 -2
  10. {evalscope-0.8.1.dist-info → evalscope-0.8.2.dist-info}/METADATA +15 -3
  11. {evalscope-0.8.1.dist-info → evalscope-0.8.2.dist-info}/RECORD +16 -36
  12. tests/perf/test_perf.py +3 -3
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  23. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  24. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  25. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  26. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  27. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  28. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  29. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  30. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  31. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  33. {evalscope-0.8.1.dist-info → evalscope-0.8.2.dist-info}/LICENSE +0 -0
  34. {evalscope-0.8.1.dist-info → evalscope-0.8.2.dist-info}/WHEEL +0 -0
  35. {evalscope-0.8.1.dist-info → evalscope-0.8.2.dist-info}/entry_points.txt +0 -0
  36. {evalscope-0.8.1.dist-info → evalscope-0.8.2.dist-info}/top_level.txt +0 -0
evalscope/config.py CHANGED
@@ -114,7 +114,9 @@ class TaskConfig:
114
114
  def from_args(args: Namespace):
115
115
  # Convert Namespace to a dictionary and filter out None values
116
116
  args_dict = {k: v for k, v in vars(args).items() if v is not None}
117
- del args_dict['func'] # Note: compat CLI arguments
117
+
118
+ if 'func' in args_dict:
119
+ del args_dict['func'] # Note: compat CLI arguments
118
120
 
119
121
  return TaskConfig.from_dict(args_dict)
120
122
 
@@ -86,6 +86,7 @@ class Evaluator(object):
86
86
  **kwargs)
87
87
 
88
88
  # Get prompts from dataset
89
+ # TODO: support sampler
89
90
  self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
90
91
  del self.dataset
91
92
 
@@ -429,7 +429,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
429
429
  fix_do_sample_warning(self.generation_config)
430
430
 
431
431
  # Run inference
432
- output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
432
+ output_ids = self.model.generate(input_ids, generation_config=self.generation_config)
433
433
 
434
434
  response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
435
435
  return response
@@ -68,6 +68,7 @@ class Arguments:
68
68
  model=args.model,
69
69
  attn_implementation=args.attn_implementation,
70
70
  url=args.url,
71
+ port=args.port,
71
72
  api_key=args.api_key,
72
73
  connect_timeout=args.connect_timeout,
73
74
  read_timeout=args.read_timeout,
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
157
157
  while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
158
158
  try:
159
159
  # Attempt to get benchmark data from the queue with a timeout
160
- benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=1)
160
+ benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
161
161
  benchmark_data_queue.task_done()
162
162
  except asyncio.TimeoutError:
163
163
  # If timeout, continue to the next iteration
evalscope/perf/main.py CHANGED
@@ -19,7 +19,9 @@ def run_perf_benchmark(args):
19
19
  args = Arguments(**args)
20
20
  elif isinstance(args, Namespace):
21
21
  args = Arguments.from_args(args)
22
- seed_everything(args.seed)
22
+
23
+ if args.seed is not None:
24
+ seed_everything(args.seed)
23
25
 
24
26
  # Setup logger and output
25
27
  args.outputs_dir = get_output_path(args)
@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):
96
96
 
97
97
  def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
98
98
  """Parser responses and return number of request and response tokens.
99
- sample of the output delta:
100
- {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
99
+ Only one response for non-stream, multiple responses for stream.
100
+ """
101
101
 
102
+ # when stream, the last response is the full usage
103
+ # when non-stream, the last response is the first response
104
+ last_response_js = json.loads(responses[-1])
105
+ if 'usage' in last_response_js and last_response_js['usage']:
106
+ input_tokens = last_response_js['usage']['prompt_tokens']
107
+ output_tokens = last_response_js['usage']['completion_tokens']
108
+ return input_tokens, output_tokens
102
109
 
103
- Args:
104
- responses (List[bytes]): List of http response body, for stream output,
105
- there are multiple responses, for general only one.
106
- kwargs: (Any): The command line --parameter content.
107
- Returns:
108
- Tuple: Return number of prompt token and number of completion tokens.
109
- """
110
- full_response_content = ''
110
+ # no usage information in the response, parse the response to get the tokens
111
111
  delta_contents = {}
112
- input_tokens = None
113
- output_tokens = None
114
112
  for response in responses:
115
113
  js = json.loads(response)
116
- if js['object'] == 'chat.completion':
117
- for choice in js['choices']:
118
- delta_contents[choice['index']] = [choice['message']['content']]
119
- input_tokens = js['usage']['prompt_tokens']
120
- output_tokens = js['usage']['completion_tokens']
121
- elif js['object'] == 'text_completion':
122
- for choice in js['choices']:
123
- delta_contents[choice['index']] = [choice['text']]
124
- input_tokens = js['usage']['prompt_tokens']
125
- output_tokens = js['usage']['completion_tokens']
126
- elif js['object'] == 'chat.completion.chunk':
127
- if 'choices' in js:
128
- for choice in js['choices']:
129
- if 'delta' in choice and 'index' in choice:
130
- delta = choice['delta']
131
- idx = choice['index']
132
- if 'content' in delta:
133
- delta_content = delta['content']
134
- if idx in delta_contents:
135
- delta_contents[idx].append(delta_content)
136
- else:
137
- delta_contents[idx] = [delta_content]
138
- # usage in chunk: {"id":"","object":"chat.completion.chunk","created":1718269986,"model":"llama3",
139
- # "choices":[],"usage":{"prompt_tokens":32,"total_tokens":384,"completion_tokens":352}}
140
- if 'usage' in js and js['usage']:
141
- input_tokens = js['usage']['prompt_tokens']
142
- output_tokens = js['usage']['completion_tokens']
143
- if (input_tokens is None and output_tokens is None and self.tokenizer is not None):
144
- input_tokens = 0
145
- output_tokens = 0
114
+ if 'object' in js:
115
+ self.__process_response_object(js, delta_contents)
116
+ else:
117
+ self.__process_no_object(js, delta_contents)
118
+
119
+ input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
120
+ return input_tokens, output_tokens
121
+
122
+ def __process_response_object(self, js, delta_contents):
123
+ if js['object'] == 'chat.completion':
124
+ for choice in js['choices']:
125
+ delta_contents[choice['index']] = [choice['message']['content']]
126
+ elif js['object'] == 'text_completion':
127
+ for choice in js['choices']:
128
+ delta_contents[choice['index']] = [choice['text']]
129
+ elif js['object'] == 'chat.completion.chunk':
130
+ for choice in js.get('choices', []):
131
+ if 'delta' in choice and 'index' in choice:
132
+ delta = choice['delta']
133
+ idx = choice['index']
134
+ if 'content' in delta:
135
+ delta_content = delta['content']
136
+ delta_contents.setdefault(idx, []).append(delta_content)
137
+
138
+ def __process_no_object(self, js, delta_contents):
139
+ # assume the response is a single choice
140
+ for choice in js['choices']:
141
+ if 'delta' in choice:
142
+ delta = choice['delta']
143
+ idx = choice['index']
144
+ if 'content' in delta:
145
+ delta_content = delta['content']
146
+ delta_contents.setdefault(idx, []).append(delta_content)
147
+ else:
148
+ delta_contents[choice['index']] = [choice['message']['content']]
149
+
150
+ def __calculate_tokens_from_content(self, request, delta_contents):
151
+ input_tokens = output_tokens = 0
152
+ if self.tokenizer is not None:
146
153
  for idx, choice_contents in delta_contents.items():
147
- full_response_content = ''.join([m for m in choice_contents])
154
+ full_response_content = ''.join(choice_contents)
148
155
  input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
149
156
  output_tokens += len(self.tokenizer.encode(full_response_content))
150
- elif input_tokens is None and output_tokens is None: # no usage info get.
151
- input_tokens = 0
152
- output_tokens = 0
157
+ else:
153
158
  logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
154
-
155
159
  return input_tokens, output_tokens
@@ -103,6 +103,7 @@ def start_app(args: Arguments):
103
103
  elif args.api == 'local_vllm':
104
104
  os.environ['VLLM_USE_MODELSCOPE'] = 'True'
105
105
  os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
106
+ os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
106
107
  # yapf: disable
107
108
  proc = subprocess.Popen([
108
109
  'python', '-m', 'vllm.entrypoints.openai.api_server',
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '0.8.1'
4
- __release_datetime__ = '2024-12-17 20:00:00'
3
+ __version__ = '0.8.2'
4
+ __release_datetime__ = '2024-12-26 20:00:00'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.8.1
3
+ Version: 0.8.2
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -84,7 +84,7 @@ Requires-Dist: transformers-stream-generator; extra == "all"
84
84
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
85
85
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
86
86
  Requires-Dist: mteb==1.19.4; extra == "all"
87
- Requires-Dist: ragas==0.2.7; extra == "all"
87
+ Requires-Dist: ragas==0.2.9; extra == "all"
88
88
  Requires-Dist: webdataset>0.2.0; extra == "all"
89
89
  Requires-Dist: aiohttp; extra == "all"
90
90
  Requires-Dist: fastapi; extra == "all"
@@ -129,7 +129,7 @@ Requires-Dist: transformers; extra == "perf"
129
129
  Requires-Dist: unicorn; extra == "perf"
130
130
  Provides-Extra: rag
131
131
  Requires-Dist: mteb==1.19.4; extra == "rag"
132
- Requires-Dist: ragas==0.2.7; extra == "rag"
132
+ Requires-Dist: ragas==0.2.9; extra == "rag"
133
133
  Requires-Dist: webdataset>0.2.0; extra == "rag"
134
134
  Provides-Extra: vlmeval
135
135
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -181,6 +181,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
181
181
  <br>EvalScope Framework.
182
182
  </p>
183
183
 
184
+ <details><summary>Framework Description</summary>
185
+
184
186
  The architecture includes the following modules:
185
187
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
186
188
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
@@ -194,6 +196,16 @@ The architecture includes the following modules:
194
196
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
195
197
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
196
198
 
199
+ </details>
200
+
201
+ ## ☎ User Groups
202
+
203
+ Please scan the QR code below to join our community groups:
204
+
205
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
206
+ :-------------------------:|:-------------------------:|:-------------------------:
207
+ <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
208
+
197
209
 
198
210
  ## 🎉 News
199
211
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
2
2
  evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
3
- evalscope/config.py,sha256=ZDN0XVCCXMSSD675Smzm57fNDOx-cZTsNvPboMtYVow,8407
3
+ evalscope/config.py,sha256=_4IRpoAssdHEg75UKPKVw6FVaCu2NaP2aOMA5DRsuGU,8444
4
4
  evalscope/constants.py,sha256=M5qJ8b7kp-RF52IwBjx5EMjeuiH1e1jdollCsbIT-c4,3753
5
5
  evalscope/run.py,sha256=s_qE1ukrt4HBfRVAPJjC1XiqD9k7rSH7lX8yysyf5do,7279
6
6
  evalscope/run_arena.py,sha256=6nc_S8KL7B3V4SsnpIexfvczHN9kQwHR9R1GXb2sqgI,8586
7
7
  evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
8
- evalscope/version.py,sha256=OXwZDg6ML1mbsIw-CBhWRf4zVz2ArW2PFzzLK9FVAZk,118
8
+ evalscope/version.py,sha256=uvEbCM3fC0oZ2Rt82Q0oErXsM-iYBNxJtPPLXPwscAU,118
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -42,26 +42,6 @@ evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3H
42
42
  evalscope/backend/rag_eval/ragas/arguments.py,sha256=8SYCV15d25ocdDHRqmGMQzd9zR6gwfOrVSFBe4T-KCo,1806
43
43
  evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
44
44
  evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
45
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json,sha256=4wPfjNh-OVFQdvho3CAJ66_B2TZuRZVm6-xUIXokKcY,3935
46
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json,sha256=wWidnp8726hf6-fY31ZoqCt9zhZgVM260o8MwdBI0d8,1737
47
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json,sha256=o5RXPz-O1JM8gFRCLCY2iobh0uLc4mznT_zLCpWaPFE,968
48
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json,sha256=eEs6gdAKuYfDohCz9EzM1o0ykIEUbvwoRu1Pd2dL92E,3168
49
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json,sha256=qZhHR9Ki374Ykb6V8dmptE1whXmPKRvAJ0Gl2akoaX0,216
50
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json,sha256=k5LjoxcIDM9Yvj0h5bje6ANXEOgFbioRs1i23259Md8,2486
51
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json,sha256=Pn1rGIjfyIeY6BZQEOeR4v-QC5xcmTN6aIh0G2E2Xuo,1740
52
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json,sha256=p7RrFdNWY1Wo5s03SvtXQSZ-CEn96NkPZ3EHsJ3UIFE,1137
53
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json,sha256=s2mlf9BTWnmnCZ9H3yLZgPvPUPWnPgIIDtRtH0qStMM,991
54
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=s_6K6surhTGpr5efryHjW-PFDKlYJTTpgXDlC_TbzVw,1943
55
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
56
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=p-vCyibNNezGcuID2kGvBDZJGdPXm3NvTTVvH6ij7N4,1973
57
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
58
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json,sha256=yayuzrNO2EO9eIqSv5mthNTVXnw_7D_HOJZ_tse-qw0,1374
59
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json,sha256=-rOBZuhZGbVrlti3PycavxAoInEry3dMYt9VN3Qvo-E,1475
60
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json,sha256=svZ_xzfQp3KMzdVJoqTVPGnwgls2JjXXplTcUj1jVFo,767
61
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json,sha256=VRO9Hy-e5Dba1AkLqxj2R-Ezwoby3BvipM9zNlZJ4GY,1328
62
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json,sha256=XmHkc-bj7PFdLxGKoM3UDeOv2FO0X2Pc9Wpd6JOkdns,919
63
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json,sha256=1YVcklCc4otS0mkO0aiNNFx7Zecc1L3wB6ol3NPxTt0,697
64
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json,sha256=c70_FGepQm3_dZngdjNudX_iCmu39tvZncyBqNxMrfg,658
65
45
  evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
66
46
  evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
67
47
  evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
@@ -154,7 +134,7 @@ evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,7
154
134
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
155
135
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
156
136
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
157
- evalscope/evaluator/evaluator.py,sha256=nRR6aaa9J8nRfB8QPZwexSrfKDvPkPSGQpFVpbWLeW0,18380
137
+ evalscope/evaluator/evaluator.py,sha256=wrTWyvyD1eqSvsZRwDRV1UVBxXv7y-2A29UCD9F-5qI,18412
158
138
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
159
139
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
160
140
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=nL8k-i92L1iMwjPOnNxzQyZICfukZKJul4ZBvOWkHGw,16414
@@ -170,24 +150,24 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk
170
150
  evalscope/models/__init__.py,sha256=b-jXJ2Cj6dH8notAU7lvCVKbGrcEaf8Gfr5w79qNHAk,111
171
151
  evalscope/models/dummy_chat_model.py,sha256=aG3yolnnIN_-gsfF9FsyjyGMewQteEnUfOxTGScROSE,1272
172
152
  evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
173
- evalscope/models/model_adapter.py,sha256=XBeSFTR9pXmnhFWRRddcobnITC5T4JKooeFUeWEtUVI,19006
153
+ evalscope/models/model_adapter.py,sha256=5jzDXpFp24ZZ25tjpIMJeDTz-lDSD_EHp040gJOZACc,19007
174
154
  evalscope/models/openai_model.py,sha256=-tPBu6v0Ogf_flmG88tFuu66QNKrOyxv3AjYwVtuR44,3313
175
155
  evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
176
156
  evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfqNSeuQ,8168
177
157
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
178
158
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
179
159
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
180
- evalscope/perf/arguments.py,sha256=J067vNJF-RObJNZ0oE2RBIBNjliCYcflWtt6aGAt40g,9205
181
- evalscope/perf/benchmark.py,sha256=h151QXsVbg7lMe09aH_mxUdPRALIl1A35I9VO2zryEo,9615
160
+ evalscope/perf/arguments.py,sha256=8KiD4u51B_twEaIiI0_kw4Jknk3YG4S6XN-vgvutChA,9233
161
+ evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
182
162
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
183
- evalscope/perf/main.py,sha256=2GrE9wHibprzaw4gmcovdc5ods_EHwoSwwmkFDLTUjQ,1257
163
+ evalscope/perf/main.py,sha256=Qg99KhGUjnVAMkNofbDsvMGFxijewH8ri3QoW1y1U7U,1292
184
164
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
185
165
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
186
166
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
187
167
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
188
168
  evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
189
169
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
190
- evalscope/perf/plugin/api/openai_api.py,sha256=raa4SaatEphNfWuK6_3ecfe49Vg4yftD6C-enhufJuE,7020
170
+ evalscope/perf/plugin/api/openai_api.py,sha256=WV2EUIl1PTg-Dj7HMSxJrAE7OUxJZqQmZLJZLHffcJo,6805
191
171
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
192
172
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
193
173
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -201,7 +181,7 @@ evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0
201
181
  evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
202
182
  evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
203
183
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
204
- evalscope/perf/utils/local_server.py,sha256=A26gqBbxsnZA8CqQospyO50x3prVnD9XiT2l--ERxK0,4566
184
+ evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
205
185
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
206
186
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
207
187
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -266,7 +246,7 @@ tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
266
246
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
267
247
  tests/cli/test_run.py,sha256=pMZvI3b0Vs-UFfciDoPwCYFAaYJzocQjxEaMLFTxYSo,4289
268
248
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
269
- tests/perf/test_perf.py,sha256=AQB2QuMwJ1TnenHFPBF4YAtifbR0D0pSobP6xmDysqw,3023
249
+ tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
270
250
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
271
251
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
272
252
  tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
@@ -277,9 +257,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
277
257
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
278
258
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
279
259
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
280
- evalscope-0.8.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
281
- evalscope-0.8.1.dist-info/METADATA,sha256=HydrEYb1OxbvVUMl11oLekV2sjvlgQQvtEpkcNAiW5A,23190
282
- evalscope-0.8.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
283
- evalscope-0.8.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
284
- evalscope-0.8.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
285
- evalscope-0.8.1.dist-info/RECORD,,
260
+ evalscope-0.8.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
261
+ evalscope-0.8.2.dist-info/METADATA,sha256=Fk1p0gh2RycQ7yOBj7fMYym7G-SYj8sL-IZX8cgGxVQ,23709
262
+ evalscope-0.8.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
263
+ evalscope-0.8.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
264
+ evalscope-0.8.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
265
+ evalscope-0.8.2.dist-info/RECORD,,
tests/perf/test_perf.py CHANGED
@@ -19,13 +19,13 @@ class TestPerf(unittest.TestCase):
19
19
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
20
20
  def test_run_perf(self):
21
21
  task_cfg = {
22
- 'url': 'http://127.0.0.1:8000/v1/chat/completions',
22
+ 'url': 'http://127.0.0.1:8001/v1/chat/completions',
23
23
  'parallel': 1,
24
24
  'model': 'qwen2.5',
25
25
  'number': 15,
26
26
  'api': 'openai',
27
27
  'dataset': 'openqa',
28
- 'stream': True,
28
+ # 'stream': True,
29
29
  'debug': True,
30
30
  }
31
31
  run_perf_benchmark(task_cfg)
@@ -47,7 +47,7 @@ class TestPerf(unittest.TestCase):
47
47
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
48
48
  def test_run_perf_speed_benchmark(self):
49
49
  task_cfg = {
50
- 'url': 'http://127.0.0.1:8801/v1/completions',
50
+ 'url': 'http://127.0.0.1:8001/v1/completions',
51
51
  'parallel': 1,
52
52
  'model': 'qwen2.5',
53
53
  'api': 'openai',
@@ -1,87 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -492257975294377194,
4
- "language": "chinese",
5
- "instruction": "给定一个真实情况和一个答案陈述,分析每个陈述并将其分类为以下类别之一:TP(真正):答案中存在的陈述也直接由一个或多个真实情况中的陈述支持,FP(假正):答案中存在的陈述但没有被任何真实情况中的陈述直接支持,FN(假负):在真实情况中发现但在答案中不存在的陈述。每个陈述只能属于其中一个类别。为每个分类提供理由。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "是什么为太阳提供能量,它的主要功能是什么?",
10
- "answer": [
11
- "太阳的能量来自核裂变,类似于地球上的核反应堆。",
12
- "太阳的主要功能是为太阳系提供光。"
13
- ],
14
- "ground_truth": [
15
- "太阳的能量来自核聚变,其中氢原子融合形成氦。",
16
- "太阳核心的这种聚变过程释放出巨大的能量。",
17
- "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
18
- "太阳的光在地球的气候系统中起着关键作用。",
19
- "阳光有助于驱动天气和海洋洋流。"
20
- ]
21
- },
22
- "output": {
23
- "TP": [
24
- {
25
- "statement": "太阳的主要功能是为太阳系提供光。",
26
- "reason": "这一说法在某种程度上得到了地面事实的支持,提到太阳提供光和它的作用,尽管它更广泛地关注太阳的能量。"
27
- }
28
- ],
29
- "FP": [
30
- {
31
- "statement": "太阳的能量来自核裂变,类似于地球上的核反应堆。",
32
- "reason": "这一说法是不正确的,与地面事实相矛盾,地面事实指出太阳的能量来自核聚变。"
33
- }
34
- ],
35
- "FN": [
36
- {
37
- "statement": "太阳的能量来自核聚变,其中氢原子融合形成氦。",
38
- "reason": "这种对太阳能量来源的准确描述没有包含在答案中。"
39
- },
40
- {
41
- "statement": "太阳核心的这种聚变过程释放出巨大的能量。",
42
- "reason": "这个过程及其重要性没有在答案中提到。"
43
- },
44
- {
45
- "statement": "来自太阳的能量提供热量和光,这对地球上的生命至关重要。",
46
- "reason": "答案中只提到了光,忽略了热量及其对生命的必要性,这些在地面事实中都有涵盖。"
47
- },
48
- {
49
- "statement": "太阳的光在地球的气候系统中起着关键作用。",
50
- "reason": "太阳光对地球气候系统的这种更广泛的影响没有在答案中提到。"
51
- },
52
- {
53
- "statement": "阳光有助于驱动天气和海洋洋流。",
54
- "reason": "答案中省略了阳光对天气模式和海洋洋流的影响。"
55
- }
56
- ]
57
- }
58
- },
59
- {
60
- "input": {
61
- "question": "水的沸点是多少?",
62
- "answer": [
63
- "水的沸点在海平面上是100摄氏度。"
64
- ],
65
- "ground_truth": [
66
- "水的沸点在海平面上是100摄氏度(212华氏度)。",
67
- "水的沸点会随着海拔的变化而变化。"
68
- ]
69
- },
70
- "output": {
71
- "TP": [
72
- {
73
- "statement": "水的沸点在海平面上是100摄氏度。",
74
- "reason": "这一说法直接得到了地面事实的支持,地面事实具体说明了水的沸点在海平面上是100摄氏度。"
75
- }
76
- ],
77
- "FP": [],
78
- "FN": [
79
- {
80
- "statement": "水的沸点会随着海拔的变化而变化。",
81
- "reason": "关于水的沸点如何随海拔变化的额外信息没有在答案中提到。"
82
- }
83
- ]
84
- }
85
- }
86
- ]
87
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -8546983388246528139,
4
- "language": "chinese",
5
- "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "阿尔伯特·爱因斯坦是谁,他以什么而闻名?",
10
- "answer": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。",
11
- "sentences": {
12
- "0": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
13
- "1": "他最著名的是发展了相对论,他还对量子力学理论的发展做出了重要贡献。"
14
- }
15
- },
16
- "output": {
17
- "sentences": [
18
- {
19
- "sentence_index": 0,
20
- "simpler_statements": [
21
- "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
22
- "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
23
- ]
24
- },
25
- {
26
- "sentence_index": 1,
27
- "simpler_statements": [
28
- "阿尔伯特·爱因斯坦最著名的是发展了相对论。",
29
- "阿尔伯特·爱因斯坦还对量子力学理论的发展做出了重要贡献。"
30
- ]
31
- }
32
- ]
33
- }
34
- }
35
- ]
36
- }
@@ -1,26 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 7951911230338252816,
4
- "language": "chinese",
5
- "instruction": "为给定的答案生成一个问题,并识别答案是否含糊不清。如果答案含糊不清,则给出1;如果答案明确,则给出0。含糊不清的答案是指那些回避的、模糊的或不明确的答案。例如,“我不知道”或“我不确定”是含糊不清的答案。",
6
- "examples": [
7
- {
8
- "input": {
9
- "response": "阿尔伯特·爱因斯坦出生在德国。"
10
- },
11
- "output": {
12
- "question": "阿尔伯特·爱因斯坦出生在哪里?",
13
- "noncommittal": 0
14
- }
15
- },
16
- {
17
- "input": {
18
- "response": "我不知道2023年发明的智能手机的突破性功能,因为我对2022年以后的信息不了解。"
19
- },
20
- "output": {
21
- "question": "2023年发明的智能手机的突破性功能是什么?",
22
- "noncommittal": 1
23
- }
24
- }
25
- ]
26
- }
@@ -1,41 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -5318808809674890018,
4
- "language": "chinese",
5
- "instruction": "给定问题、答案和背景,验证背景在得出给定答案时是否有用。如果有用,判定为“1”,如果没有用,判定为“0”,并以json格式输出。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "你能告诉我关于阿尔伯特·爱因斯坦的什么?",
10
- "context": "阿尔伯特·爱因斯坦(1879年3月14日-1955年4月18日)是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因发展相对论而闻名,同时也对量子力学做出了重要贡献,因此在20世纪前几十年现代物理学对自然科学理解的革命性重塑中起到了核心作用。他的质能等价公式E=mc²,源于相对论,被称为“世界上最著名的方程”。他因“对理论物理学的贡献,特别是发现光电效应定律”而获得1921年诺贝尔物理学奖,这是量子理论发展的关键一步。他的工作也因其对科学哲学的影响而闻名。在1999年由英国《物理世界》杂志对全球130位顶尖物理学家的调查中,爱因斯坦被评为有史以来最伟大的物理学家。他的智力成就和原创性使爱因斯坦成为天才的代名词。",
11
- "answer": "阿尔伯特·爱因斯坦,生于1879年3月14日,是一位德国出生的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的科学家之一。他因对理论物理学的贡献而获得1921年诺贝尔物理学奖。"
12
- },
13
- "output": {
14
- "reason": "提供的背景确实有助于得出给定的答案。背景包括关于阿尔伯特·爱因斯坦的生活和贡献的关键信息,这些信息在答案中得到了反映。",
15
- "verdict": 1
16
- }
17
- },
18
- {
19
- "input": {
20
- "question": "谁赢得了2020年ICC世界杯?",
21
- "context": "2022年ICC男子T20世界杯于2022年10月16日至11月13日在澳大利亚举行,是该赛事的第八届。原定于2020年举行,但因COVID-19大流行而推迟。英格兰在决赛中以五个小门击败巴基斯坦,赢得了他们的第二个ICC男子T20世界杯冠军。",
22
- "answer": "英格兰"
23
- },
24
- "output": {
25
- "reason": "背景有助于澄清关于2020年ICC世界杯的情况,并指出英格兰是原定于2020年举行但实际上在2022年举行的比赛的获胜者。",
26
- "verdict": 1
27
- }
28
- },
29
- {
30
- "input": {
31
- "question": "世界上最高的山是什么?",
32
- "context": "安第斯山脉是世界上最长的大陆山脉,位于南美洲。它横跨七个国家,拥有西半球许多最高的山峰。该山脉以其多样的生态系统而闻名,包括高海拔的安第斯高原和亚马逊雨林。",
33
- "answer": "珠穆朗玛峰。"
34
- },
35
- "output": {
36
- "reason": "提供的背景讨论了安第斯山脉,虽然令人印象深刻,但不包括珠穆朗玛峰,也与关于世界最高山的问题没有直接关系。",
37
- "verdict": 0
38
- }
39
- }
40
- ]
41
- }
@@ -1,7 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -1333942410710431097,
4
- "language": "chinese",
5
- "instruction": "给定文档摘要和节点内容,将节点内容评分在1到5的范围内。",
6
- "examples": []
7
- }
@@ -1,60 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5296785184599215999,
4
- "language": "chinese",
5
- "instruction": "您的任务是根据给定的上下文判断一系列陈述的真实性。对于每个陈述,如果可以根据上下文直接推断出该陈述,则必须返回判决为1;如果不能根据上下文直接推断出该陈述,则返回判决为0。",
6
- "examples": [
7
- {
8
- "input": {
9
- "context": "约翰是XYZ大学的学生。他正在攻读计算机科学学位。本学期他注册了几门课程,包括数据结构、算法和数据库管理。约翰是一个勤奋的学生,花费大量时间学习和完成作业。他经常在图书馆待到很晚以完成他的项目。",
10
- "statements": [
11
- "约翰主修生物学。",
12
- "约翰正在学习一门人工智能课程。",
13
- "约翰是一个勤奋的学生。",
14
- "约翰有一份兼职工作。"
15
- ]
16
- },
17
- "output": {
18
- "statements": [
19
- {
20
- "statement": "约翰主修生物学。",
21
- "reason": "约翰的专业明确提到是计算机科学。没有信息表明他主修生物学。",
22
- "verdict": 0
23
- },
24
- {
25
- "statement": "约翰正在学习一门人工智能课程。",
26
- "reason": "上下文中提到约翰目前注册的课程,并未提到人工智能。因此,不能推断出约翰正在学习人工智能课程。",
27
- "verdict": 0
28
- },
29
- {
30
- "statement": "约翰是一个勤奋的学生。",
31
- "reason": "上下文中提到他花费大量时间学习和完成作业。此外,还提到他经常在图书馆待到很晚以完成他的项目,这意味着他很勤奋。",
32
- "verdict": 1
33
- },
34
- {
35
- "statement": "约翰有一份兼职工作。",
36
- "reason": "上下文中没有给出约翰有兼职工作的信息。",
37
- "verdict": 0
38
- }
39
- ]
40
- }
41
- },
42
- {
43
- "input": {
44
- "context": "光合作用是植物、藻类和某些细菌用来将光能转化为化学能的过程。",
45
- "statements": [
46
- "阿尔伯特·爱因斯坦是个天才。"
47
- ]
48
- },
49
- "output": {
50
- "statements": [
51
- {
52
- "statement": "阿尔伯特·爱因斯坦是个天才。",
53
- "reason": "上下文和陈述无关",
54
- "verdict": 0
55
- }
56
- ]
57
- }
58
- }
59
- ]
60
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -8546983388246528139,
4
- "language": "chinese",
5
- "instruction": "给定一个问题、一个答案和答案中的句子,分析在“句子”下给出的每个句子的复杂性,并将每个句子分解为一个或多个完全可理解的陈述,同时确保每个陈述中不使用代词。将输出格式化为JSON。",
6
- "examples": [
7
- {
8
- "input": {
9
- "question": "阿尔伯特·爱因斯坦是谁,他最出名的是什么?",
10
- "answer": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。他最出名的是发展了相对论,他还为量子力学理论的发展做出了重要贡献。",
11
- "sentences": {
12
- "0": "他是一位出生于德国的理论物理学家,被广泛认为是有史以来最伟大和最有影响力的物理学家之一。",
13
- "1": "他最出名的是发展了相对论,他还为量子力学理论的发展做出了重要贡献。"
14
- }
15
- },
16
- "output": {
17
- "sentences": [
18
- {
19
- "sentence_index": 0,
20
- "simpler_statements": [
21
- "阿尔伯特·爱因斯坦是一位出生于德国的理论物理学家。",
22
- "阿尔伯特·爱因斯坦被认为是有史以来最伟大和最有影响力的物理学家之一。"
23
- ]
24
- },
25
- {
26
- "sentence_index": 1,
27
- "simpler_statements": [
28
- "阿尔伯特·爱因斯坦最出名的是发展了相对论。",
29
- "阿尔伯特·爱因斯坦还为量子力学理论的发展做出了重要贡献。"
30
- ]
31
- }
32
- ]
33
- }
34
- }
35
- ]
36
- }
@@ -1,24 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 7972318980248949928,
4
- "language": "chinese",
5
- "instruction": "从给定文本中提取最重要的max_num标题,这些标题可用于将文本分成独立的部分。重点关注第2级和第3级标题。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": " 介绍\n 主题概述...\n\n 主要概念\n 核心思想的解释...\n\n 详细分析\n 分析的技术和方法...\n\n 小节:专业技术\n 专业技术的进一步细节...\n\n 未来方向\n 对即将到来的趋势的见解...\n\n 小节:研究的下一步\n 新研究领域的讨论...\n\n 结论\n 最后的评论和总结。\n ",
10
- "max_num": 6
11
- },
12
- "output": {
13
- "headlines": [
14
- "介绍",
15
- "主要概念",
16
- "详细分析",
17
- "小节:专业技术",
18
- "未来方向",
19
- "结论"
20
- ]
21
- }
22
- }
23
- ]
24
- }
@@ -1,35 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5035835898922847346,
4
- "language": "chinese",
5
- "instruction": "通过将来自至少两个不同列表的概念配对来形成组合。\n**说明:**\n- 查看每个节点的概念。\n- 确定可以逻辑连接或对比的概念。\n- 形成涉及来自不同节点的概念的组合。\n- 每个组合应至少包括来自两个或多个节点的一个概念。\n- 清晰简洁地列出组合。\n- 不要重复相同的组合。",
6
- "examples": [
7
- {
8
- "input": {
9
- "lists_of_concepts": [
10
- [
11
- "人工智能",
12
- "自动化"
13
- ],
14
- [
15
- "医疗保健",
16
- "数据隐私"
17
- ]
18
- ],
19
- "max_combinations": 2
20
- },
21
- "output": {
22
- "combinations": [
23
- [
24
- "人工智能",
25
- "医疗保健"
26
- ],
27
- [
28
- "自动化",
29
- "数据隐私"
30
- ]
31
- ]
32
- }
33
- }
34
- ]
35
- }
@@ -1,30 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5691378570114822729,
4
- "language": "chinese",
5
- "instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成一个多跳查询和答案。主题代表从上下文中提取或生成的一组短语,这些短语突出显示了所选上下文在创建多跳查询时的适用性。确保查询明确包含这些主题。### 指导:\n1. **生成多跳查询**:使用提供的上下文片段和主题形成一个需要结合多个片段信息的查询(例如,`<1-hop>` 和 `<2-hop>`)。确保查询明确包含一个或多个主题,并反映其与上下文的相关性。\n2. **生成答案**:仅使用提供的上下文中的内容来创建查询的详细和忠实的答案。避免添加不直接存在或无法从给定上下文推断的信息。\n3. **多跳上下文标签**:\n - 每个上下文片段标记为 `<1-hop>`、`<2-hop>` 等。\n - 确保查询使用至少两个片段的信息并有意义地连接它们。",
6
- "examples": [
7
- {
8
- "input": {
9
- "persona": {
10
- "name": "历史学家",
11
- "role_description": "专注于主要科学里程碑及其全球影响。"
12
- },
13
- "themes": [
14
- "相对论",
15
- "实验验证"
16
- ],
17
- "query_style": "正式",
18
- "query_length": "中等",
19
- "context": [
20
- "<1-hop> 阿尔伯特·爱因斯坦发展了相对论,引入了时空的概念。",
21
- "<2-hop> 在1919年的日食期间,光线被重力弯曲的现象得到了证实,支持了爱因斯坦的理论。"
22
- ]
23
- },
24
- "output": {
25
- "query": "在1919年日食期间,相对论的实验验证是如何实现的?",
26
- "answer": "在1919年日食期间,通过确认光线被重力弯曲,实现了相对论的实验验证,这支持了爱因斯坦在理论中提出的时空概念。"
27
- }
28
- }
29
- ]
30
- }
@@ -1,39 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 4608101540215877909,
4
- "language": "chinese",
5
- "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
- "examples": [
7
- {
8
- "input": {
9
- "themes": [
10
- "同理心",
11
- "包容性",
12
- "远程工作"
13
- ],
14
- "personas": [
15
- {
16
- "name": "人力资源经理",
17
- "role_description": "专注于包容性和员工支持。"
18
- },
19
- {
20
- "name": "远程团队负责人",
21
- "role_description": "管理远程团队沟通。"
22
- }
23
- ]
24
- },
25
- "output": {
26
- "mapping": {
27
- "HR Manager": [
28
- "包容性",
29
- "同理心"
30
- ],
31
- "Remote Team Lead": [
32
- "远程工作",
33
- "同理心"
34
- ]
35
- }
36
- }
37
- }
38
- ]
39
- }
@@ -1,30 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 5691378570114822729,
4
- "language": "chinese",
5
- "instruction": "根据指定的条件(角色、主题、风格、长度)和提供的上下文生成一个多跳查询和答案。主题代表从上下文中提取或生成的一组短语,这些短语突出显示了所选上下文适合多跳查询创建的适用性。确保查询明确包含这些主题。### 指导:\n1. **生成多跳查询**:使用提供的上下文片段和主题形成一个需要结合多个片段信息的查询(例如,`<1-hop>` 和 `<2-hop>`)。确保查询明确包含一个或多个主题,并反映其与上下文的相关性。\n2. **生成答案**:仅使用提供的上下文中的内容来创建对查询的详细和忠实的答案。避免添加不直接存在或无法从给定上下文推断的信息。\n3. **多跳上下文标签**:\n - 每个上下文片段标记为 `<1-hop>`、`<2-hop>` 等。\n - 确保查询使用至少两个片段的信息并有意义地连接它们。",
6
- "examples": [
7
- {
8
- "input": {
9
- "persona": {
10
- "name": "历史学家",
11
- "role_description": "专注于重大的科学里程碑及其全球影响。"
12
- },
13
- "themes": [
14
- "相对论",
15
- "实验验证"
16
- ],
17
- "query_style": "正式",
18
- "query_length": "中等",
19
- "context": [
20
- "<1-hop> 阿尔伯特·爱因斯坦发展了相对论,引入了时空的概念。",
21
- "<2-hop> 在1919年的日全食期间,光线被重力弯曲的现象得到了证实,支持了爱因斯坦的理论。"
22
- ]
23
- },
24
- "output": {
25
- "query": "在1919年的日全食期间,相对论的实验验证是如何实现的?",
26
- "answer": "在1919年的日全食期间,通过确认光线被重力弯曲的现象,实现了相对论的实验验证,这支持了爱因斯坦在理论中提出的时空概念。"
27
- }
28
- }
29
- ]
30
- }
@@ -1,39 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 4608101540215877909,
4
- "language": "chinese",
5
- "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
- "examples": [
7
- {
8
- "input": {
9
- "themes": [
10
- "同理心",
11
- "包容性",
12
- "远程工作"
13
- ],
14
- "personas": [
15
- {
16
- "name": "人力资源经理",
17
- "role_description": "专注于包容性和员工支持。"
18
- },
19
- {
20
- "name": "远程团队负责人",
21
- "role_description": "管理远程团队沟通。"
22
- }
23
- ]
24
- },
25
- "output": {
26
- "mapping": {
27
- "HR Manager": [
28
- "包容性",
29
- "同理心"
30
- ],
31
- "Remote Team Lead": [
32
- "远程工作",
33
- "同理心"
34
- ]
35
- }
36
- }
37
- }
38
- ]
39
- }
@@ -1,34 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -1903496084584659501,
4
- "language": "chinese",
5
- "instruction": "请说明给定的信息是否得到视觉和文本上下文信息的支持。您需要回答“是”或“否”。如果任何图像和文本上下文支持该信息,请回答“是”。",
6
- "examples": [
7
- {
8
- "input": {
9
- "response": "苹果派通常是双层皮的。",
10
- "retrieved_contexts": [
11
- "苹果派是一种水果派,其主要馅料成分是苹果。",
12
- "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
13
- "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
14
- ]
15
- },
16
- "output": {
17
- "faithful": true
18
- }
19
- },
20
- {
21
- "input": {
22
- "response": "苹果派味道不好。",
23
- "retrieved_contexts": [
24
- "苹果派是一种水果派,其主要馅料成分是苹果。",
25
- "苹果派通常与奶油、冰淇淋(“苹果派 à la mode”)、蛋奶沙司或切达干酪一起食用。",
26
- "它通常是双层皮的,上下都有糕点;上层皮可以是实心的或格状的(交叉条纹编织而成)。"
27
- ]
28
- },
29
- "output": {
30
- "faithful": false
31
- }
32
- }
33
- ]
34
- }
@@ -1,36 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -2067734205359291291,
4
- "language": "chinese",
5
- "instruction": "\n您的任务是评估查询的响应是否与提供的图像和文本上下文信息一致。\n您有两个选项可以回答。要么是 True / False。\n如果查询的响应与上下文信息一致,则回答 - True,否则为 False。\n",
6
- "examples": [
7
- {
8
- "input": {
9
- "user_input": "传统玛格丽塔披萨的主要成分是什么?",
10
- "response": "玛格丽塔披萨的主要成分是番茄、马苏里拉奶酪和新鲜罗勒。",
11
- "retrieved_contexts": [
12
- "传统的玛格丽塔披萨由薄饼皮组成。",
13
- "主要的配料包括番茄、马苏里拉奶酪、新鲜罗勒、盐和橄榄油。",
14
- "它是最简单和最经典的披萨类型之一。"
15
- ]
16
- },
17
- "output": {
18
- "relevance": true
19
- }
20
- },
21
- {
22
- "input": {
23
- "user_input": "谁在2021年获得了奥斯卡最佳男演员奖?",
24
- "response": "2021年的最佳男演员奖由莱昂纳多·迪卡普里奥获得。",
25
- "retrieved_contexts": [
26
- "第93届奥斯卡颁奖典礼于2021年举行。",
27
- "安东尼·霍普金斯凭借在《困在时间里的父亲》中的角色获得了最佳男演员奖。",
28
- "由于COVID-19的限制,这次活动具有独特性。"
29
- ]
30
- },
31
- "output": {
32
- "relevance": false
33
- }
34
- }
35
- ]
36
- }
@@ -1,25 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -677862064343016555,
4
- "language": "chinese",
5
- "instruction": "从给定文本中提取命名实体,限制输出为顶级实体。确保实体数量不超过指定的最大值。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": "特斯拉和SpaceX的首席执行官埃隆·马斯克宣布计划将业务扩展到欧洲和亚洲的新地点。\n 此次扩展预计将创造数千个就业机会,特别是在柏林和上海等城市。",
10
- "max_num": 10
11
- },
12
- "output": {
13
- "entities": [
14
- "埃隆·马斯克",
15
- "特斯拉",
16
- "SpaceX",
17
- "欧洲",
18
- "亚洲",
19
- "柏林",
20
- "上海"
21
- ]
22
- }
23
- }
24
- ]
25
- }
@@ -1,24 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 3079700511467088808,
4
- "language": "chinese",
5
- "instruction": "根据指定的条件(角色、术语、风格、长度)和提供的上下文生成一个单跳查询和答案。确保答案完全忠实于上下文,仅使用提供的上下文中的信息。### 指导:\n1. **生成查询**:根据上下文、角色、术语、风格和长度,创建一个与角色视角一致并包含术语的问题。\n2. **生成答案**:仅使用提供的上下文中的内容,构建对查询的详细答案。不要添加上下文中未包含或无法推断的信息。\n",
6
- "examples": [
7
- {
8
- "input": {
9
- "persona": {
10
- "name": "软件工程师",
11
- "role_description": "专注于编码最佳实践和系统设计。"
12
- },
13
- "term": "微服务",
14
- "query_style": "正式",
15
- "query_length": "中等",
16
- "context": "微服务是一种架构风格,其中应用程序被构建为一组松散耦合的服务。每个服务都是细粒度的,并专注于单一功能。"
17
- },
18
- "output": {
19
- "query": "微服务在软件架构中的目的是什么?",
20
- "answer": "微服务旨在将应用程序结构化为一组松散耦合的服务,每个服务专注于单一功能。"
21
- }
22
- }
23
- ]
24
- }
@@ -1,39 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": 4608101540215877909,
4
- "language": "chinese",
5
- "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
6
- "examples": [
7
- {
8
- "input": {
9
- "themes": [
10
- "同理心",
11
- "包容性",
12
- "远程工作"
13
- ],
14
- "personas": [
15
- {
16
- "name": "人力资源经理",
17
- "role_description": "专注于包容性和员工支持。"
18
- },
19
- {
20
- "name": "远程团队负责人",
21
- "role_description": "管理远程团队沟通。"
22
- }
23
- ]
24
- },
25
- "output": {
26
- "mapping": {
27
- "HR Manager": [
28
- "包容性",
29
- "同理心"
30
- ],
31
- "Remote Team Lead": [
32
- "远程工作",
33
- "同理心"
34
- ]
35
- }
36
- }
37
- }
38
- ]
39
- }
@@ -1,16 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -2203889341293275650,
4
- "language": "chinese",
5
- "instruction": "将给定文本总结为不超过10个句子。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗保健到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
10
- },
11
- "output": {
12
- "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
13
- }
14
- }
15
- ]
16
- }
@@ -1,24 +0,0 @@
1
- {
2
- "ragas_version": "0.2.7",
3
- "original_hash": -7344189172470926110,
4
- "language": "chinese",
5
- "instruction": "从给定的文本中提取主要主题和概念。",
6
- "examples": [
7
- {
8
- "input": {
9
- "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
10
- "max_num": 10
11
- },
12
- "output": {
13
- "output": [
14
- "人工智能",
15
- "自动化",
16
- "数据分析",
17
- "创新",
18
- "自动驾驶汽车",
19
- "个性化推荐"
20
- ]
21
- }
22
- }
23
- ]
24
- }