evalscope 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. See the package's registry page for more details.

Files changed (68)
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +38 -2
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +6 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/models/model_adapter.py +1 -1
  16. evalscope/perf/arguments.py +3 -1
  17. evalscope/perf/benchmark.py +3 -3
  18. evalscope/perf/main.py +5 -6
  19. evalscope/perf/plugin/api/openai_api.py +53 -49
  20. evalscope/perf/plugin/registry.py +3 -3
  21. evalscope/perf/utils/benchmark_util.py +4 -4
  22. evalscope/perf/utils/db_util.py +66 -22
  23. evalscope/perf/utils/local_server.py +4 -1
  24. evalscope/run.py +45 -82
  25. evalscope/run_arena.py +2 -1
  26. evalscope/summarizer.py +14 -26
  27. evalscope/third_party/longbench_write/eval.py +2 -1
  28. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  29. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  30. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  31. evalscope/tools/combine_reports.py +2 -4
  32. evalscope/tools/rewrite_eval_results.py +1 -1
  33. evalscope/utils/__init__.py +1 -0
  34. evalscope/utils/chat_service.py +1 -1
  35. evalscope/utils/io_utils.py +162 -0
  36. evalscope/utils/logger.py +8 -0
  37. evalscope/utils/utils.py +0 -175
  38. evalscope/version.py +2 -2
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/METADATA +15 -3
  40. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/RECORD +47 -67
  41. tests/cli/test_run.py +11 -12
  42. tests/perf/test_perf.py +3 -2
  43. tests/vlm/test_vlmeval.py +3 -2
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  52. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  53. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  54. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  55. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  56. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  57. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  58. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  59. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  60. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  61. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  62. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  63. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  64. evalscope/evaluator/humaneval_evaluator.py +0 -158
  65. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/LICENSE +0 -0
  66. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/WHEEL +0 -0
  67. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/entry_points.txt +0 -0
  68. {evalscope-0.8.0.dist-info → evalscope-0.8.2.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@ import time
4
4
 
5
5
  from evalscope.models.custom import CustomModel
6
6
  from evalscope.run import run_task
7
- from evalscope.utils import yaml_to_dict
7
+ from evalscope.utils.io_utils import yaml_to_dict
8
8
  from evalscope.utils.logger import get_logger
9
9
 
10
10
  logger = get_logger()
@@ -1,3 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
+ from evalscope.constants import *
3
4
  from evalscope.utils.utils import *
@@ -43,7 +43,7 @@ class DeltaMessage(BaseModel):
43
43
 
44
44
  class ChatCompletionRequest(BaseModel):
45
45
  model: str
46
- messages: List[ChatMessage] | str
46
+ messages: Union[List[ChatMessage], str]
47
47
  temperature: Optional[float] = None
48
48
  top_p: Optional[float] = None
49
49
  max_tokens: Optional[int] = 2048
@@ -0,0 +1,162 @@
1
+ import json
2
+ import jsonlines as jsonl
3
+ import os
4
+ import yaml
5
+
6
+ from evalscope.constants import DumpMode
7
+ from evalscope.utils.logger import get_logger
8
+
9
+ logger = get_logger()
10
+
11
+
12
+ class OutputsStructure:
13
+ LOGS_DIR = 'logs'
14
+ PREDICTIONS_DIR = 'predictions'
15
+ REVIEWS_DIR = 'reviews'
16
+ REPORTS_DIR = 'reports'
17
+ CONFIGS_DIR = 'configs'
18
+
19
+ def __init__(self, outputs_dir: str, is_make=True):
20
+ self.outputs_dir = outputs_dir
21
+ self.is_make = is_make
22
+ self._dirs = {
23
+ 'logs_dir': None,
24
+ 'predictions_dir': None,
25
+ 'reviews_dir': None,
26
+ 'reports_dir': None,
27
+ 'configs_dir': None
28
+ }
29
+
30
+ def _get_dir(self, attr_name, dir_name):
31
+ if self._dirs[attr_name] is None:
32
+ dir_path = os.path.join(self.outputs_dir, dir_name)
33
+ if self.is_make:
34
+ os.makedirs(dir_path, exist_ok=True)
35
+ self._dirs[attr_name] = dir_path
36
+ return self._dirs[attr_name]
37
+
38
+ @property
39
+ def logs_dir(self):
40
+ return self._get_dir('logs_dir', OutputsStructure.LOGS_DIR)
41
+
42
+ @property
43
+ def predictions_dir(self):
44
+ return self._get_dir('predictions_dir', OutputsStructure.PREDICTIONS_DIR)
45
+
46
+ @property
47
+ def reviews_dir(self):
48
+ return self._get_dir('reviews_dir', OutputsStructure.REVIEWS_DIR)
49
+
50
+ @property
51
+ def reports_dir(self):
52
+ return self._get_dir('reports_dir', OutputsStructure.REPORTS_DIR)
53
+
54
+ @property
55
+ def configs_dir(self):
56
+ return self._get_dir('configs_dir', OutputsStructure.CONFIGS_DIR)
57
+
58
+
59
+ def jsonl_to_list(jsonl_file):
60
+ """
61
+ Read jsonl file to list.
62
+
63
+ Args:
64
+ jsonl_file: jsonl file path.
65
+
66
+ Returns:
67
+ list: list of lines. Each line is a dict.
68
+ """
69
+ res_list = []
70
+ with jsonl.open(jsonl_file, mode='r') as reader:
71
+ for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
72
+ res_list.append(line)
73
+ return res_list
74
+
75
+
76
+ def jsonl_to_reader(jsonl_file):
77
+ """
78
+ Read jsonl file to reader object.
79
+
80
+ Args:
81
+ jsonl_file: jsonl file path.
82
+
83
+ Returns:
84
+ reader: jsonl reader object.
85
+ """
86
+ with jsonl.open(jsonl_file, mode='r') as reader:
87
+ return reader
88
+
89
+
90
+ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
91
+ """
92
+ Dump data to jsonl file.
93
+
94
+ Args:
95
+ data_list: data list to be dumped. [{'a': 'aaa'}, ...]
96
+ jsonl_file: jsonl file path.
97
+ dump_mode: dump mode. It can be 'overwrite' or 'append'.
98
+ """
99
+ if not jsonl_file:
100
+ raise ValueError('output file must be provided.')
101
+
102
+ jsonl_file = os.path.expanduser(jsonl_file)
103
+
104
+ if not isinstance(data_list, list):
105
+ data_list = [data_list]
106
+
107
+ if dump_mode == DumpMode.OVERWRITE:
108
+ dump_mode = 'w'
109
+ elif dump_mode == DumpMode.APPEND:
110
+ dump_mode = 'a'
111
+ with jsonl.open(jsonl_file, mode=dump_mode) as writer:
112
+ writer.write_all(data_list)
113
+
114
+
115
+ def jsonl_to_csv():
116
+ pass
117
+
118
+
119
+ def yaml_to_dict(yaml_file) -> dict:
120
+ """
121
+ Read yaml file to dict.
122
+ """
123
+ with open(yaml_file, 'r') as f:
124
+ try:
125
+ stream = yaml.safe_load(f)
126
+ except yaml.YAMLError as e:
127
+ logger.error(f'{e}')
128
+ raise e
129
+
130
+ return stream
131
+
132
+
133
+ def dict_to_yaml(d: dict, yaml_file: str):
134
+ """
135
+ Dump dict to yaml file.
136
+ """
137
+ with open(yaml_file, 'w') as f:
138
+ yaml.dump(d, f, default_flow_style=False)
139
+
140
+
141
+ def json_to_dict(json_file) -> dict:
142
+ """
143
+ Read json file to dict.
144
+ """
145
+ with open(json_file, 'r') as f:
146
+ try:
147
+ stream = json.load(f)
148
+ except json.JSONDecodeError as e:
149
+ logger.error(f'{e}')
150
+ raise e
151
+
152
+ return stream
153
+
154
+
155
+ def are_paths_same(path1, path2):
156
+ """
157
+ Check if two paths are the same.
158
+ """
159
+ real_path1 = os.path.realpath(os.path.abspath(os.path.expanduser(path1)))
160
+ real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))
161
+
162
+ return real_path1 == real_path2
evalscope/utils/logger.py CHANGED
@@ -75,6 +75,14 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
75
75
  return logger
76
76
 
77
77
 
78
+ def configure_logging(debug: bool, log_file: Optional[str] = None):
79
+ """Configure logging level based on the debug flag."""
80
+ if log_file:
81
+ get_logger(log_file=log_file, force=True)
82
+ if debug:
83
+ get_logger(log_level=logging.DEBUG, force=True)
84
+
85
+
78
86
  def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
79
87
  for handler in logger.handlers:
80
88
  if isinstance(handler, logging.FileHandler):
evalscope/utils/utils.py CHANGED
@@ -5,19 +5,13 @@ import functools
5
5
  import hashlib
6
6
  import importlib
7
7
  import importlib.util
8
- import json
9
- import jsonlines as jsonl
10
8
  import numpy as np
11
9
  import os
12
10
  import random
13
11
  import re
14
- import sys
15
12
  import torch
16
- import torch.nn.functional as F
17
- import yaml
18
13
  from typing import Any, Dict, List, Tuple, Union
19
14
 
20
- from evalscope.constants import DumpMode
21
15
  from evalscope.utils.logger import get_logger
22
16
 
23
17
  logger = get_logger()
@@ -36,102 +30,6 @@ def test_level_list():
36
30
  return TEST_LEVEL_LIST
37
31
 
38
32
 
39
- def jsonl_to_list(jsonl_file):
40
- """
41
- Read jsonl file to list.
42
-
43
- Args:
44
- jsonl_file: jsonl file path.
45
-
46
- Returns:
47
- list: list of lines. Each line is a dict.
48
- """
49
- res_list = []
50
- with jsonl.open(jsonl_file, mode='r') as reader:
51
- for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
52
- res_list.append(line)
53
- return res_list
54
-
55
-
56
- def jsonl_to_reader(jsonl_file):
57
- """
58
- Read jsonl file to reader object.
59
-
60
- Args:
61
- jsonl_file: jsonl file path.
62
-
63
- Returns:
64
- reader: jsonl reader object.
65
- """
66
- with jsonl.open(jsonl_file, mode='r') as reader:
67
- return reader
68
-
69
-
70
- def jsonl_to_csv():
71
- pass
72
-
73
-
74
- def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
75
- """
76
- Dump data to jsonl file.
77
-
78
- Args:
79
- data_list: data list to be dumped. [{'a': 'aaa'}, ...]
80
- jsonl_file: jsonl file path.
81
- dump_mode: dump mode. It can be 'overwrite' or 'append'.
82
- """
83
- if not jsonl_file:
84
- raise ValueError('output file must be provided.')
85
-
86
- jsonl_file = os.path.expanduser(jsonl_file)
87
-
88
- if not isinstance(data_list, list):
89
- data_list = [data_list]
90
-
91
- if dump_mode == DumpMode.OVERWRITE:
92
- dump_mode = 'w'
93
- elif dump_mode == DumpMode.APPEND:
94
- dump_mode = 'a'
95
- with jsonl.open(jsonl_file, mode=dump_mode) as writer:
96
- writer.write_all(data_list)
97
-
98
-
99
- def yaml_to_dict(yaml_file) -> dict:
100
- """
101
- Read yaml file to dict.
102
- """
103
- with open(yaml_file, 'r') as f:
104
- try:
105
- stream = yaml.safe_load(f)
106
- except yaml.YAMLError as e:
107
- logger.error(f'{e}')
108
- raise e
109
-
110
- return stream
111
-
112
-
113
- def dict_to_yaml(d: dict, yaml_file: str):
114
- """
115
- Dump dict to yaml file.
116
- """
117
- with open(yaml_file, 'w') as f:
118
- yaml.dump(d, f, default_flow_style=False)
119
-
120
-
121
- def json_to_dict(json_file) -> dict:
122
- """
123
- Read json file to dict.
124
- """
125
- with open(json_file, 'r') as f:
126
- try:
127
- stream = json.load(f)
128
- except json.JSONDecodeError as e:
129
- logger.error(f'{e}')
130
- raise e
131
-
132
- return stream
133
-
134
-
135
33
  def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
136
34
  module_name, spliter, cls_name = eval_class_ref.partition(':')
137
35
 
@@ -300,18 +198,6 @@ class ResponseParser:
300
198
  return ''
301
199
 
302
200
 
303
- def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
304
- if not model_id:
305
- model_id = 'default'
306
- model_id = model_id.replace('/', '_')
307
-
308
- if not model_revision:
309
- model_revision = 'default'
310
-
311
- outputs_dir = os.path.join(root_dir, model_id, model_revision, f"eval_{'-'.join(datasets)}")
312
-
313
- return outputs_dir
314
-
315
201
 
316
202
  def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
317
203
  """
@@ -355,67 +241,6 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
355
241
  return score
356
242
 
357
243
 
358
- def split_str_parts_by(text: str, delimiters: List[str]):
359
- """Split the text field into parts.
360
- Args:
361
- text: A text to be split.
362
- delimiters: The delimiters.
363
- Returns:
364
- The split text in list of dicts.
365
- """
366
- all_start_chars = [d[0] for d in delimiters]
367
- all_length = [len(d) for d in delimiters]
368
-
369
- text_list = []
370
- last_words = ''
371
-
372
- while len(text) > 0:
373
- for char_idx, char in enumerate(text):
374
- match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
375
- is_delimiter = False
376
- for index in match_index:
377
- if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
378
- if last_words:
379
- if text_list:
380
- text_list[-1]['content'] = last_words
381
- else:
382
- text_list.append({'key': '', 'content': last_words})
383
- last_words = ''
384
- text_list.append({'key': delimiters[index]})
385
- text = text[char_idx + all_length[index]:]
386
- is_delimiter = True
387
- break
388
- if not is_delimiter:
389
- last_words += char
390
- else:
391
- break
392
- if last_words == text:
393
- text = ''
394
-
395
- text_list[-1]['content'] = last_words
396
- return text_list
397
-
398
-
399
- def get_bucket_sizes(max_length: int) -> List[int]:
400
- return [max_length // 4 * (i + 1) for i in range(4)]
401
-
402
-
403
- def _get_closet_bucket(bucket_sizes, data_length):
404
- """Select the one from bucket_sizes that is closest in distance to
405
- data_length. This is required for TorchAcc.
406
- """
407
- cloest_length = sys.maxsize
408
- for b in bucket_sizes:
409
- if b == data_length or ((b < cloest_length) and (b > data_length)):
410
- cloest_length = b
411
-
412
- if cloest_length == sys.maxsize:
413
- bucket_sizes.append(data_length)
414
- cloest_length = data_length
415
-
416
- return cloest_length
417
-
418
-
419
244
  def is_module_installed(module_name):
420
245
  try:
421
246
  importlib.import_module(module_name)
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '0.8.0'
4
- __release_datetime__ = '2024-12-15 00:00:00'
3
+ __version__ = '0.8.2'
4
+ __release_datetime__ = '2024-12-26 20:00:00'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -84,7 +84,7 @@ Requires-Dist: transformers-stream-generator; extra == "all"
84
84
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
85
85
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
86
86
  Requires-Dist: mteb==1.19.4; extra == "all"
87
- Requires-Dist: ragas==0.2.7; extra == "all"
87
+ Requires-Dist: ragas==0.2.9; extra == "all"
88
88
  Requires-Dist: webdataset>0.2.0; extra == "all"
89
89
  Requires-Dist: aiohttp; extra == "all"
90
90
  Requires-Dist: fastapi; extra == "all"
@@ -129,7 +129,7 @@ Requires-Dist: transformers; extra == "perf"
129
129
  Requires-Dist: unicorn; extra == "perf"
130
130
  Provides-Extra: rag
131
131
  Requires-Dist: mteb==1.19.4; extra == "rag"
132
- Requires-Dist: ragas==0.2.7; extra == "rag"
132
+ Requires-Dist: ragas==0.2.9; extra == "rag"
133
133
  Requires-Dist: webdataset>0.2.0; extra == "rag"
134
134
  Provides-Extra: vlmeval
135
135
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -181,6 +181,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
181
181
  <br>EvalScope Framework.
182
182
  </p>
183
183
 
184
+ <details><summary>Framework Description</summary>
185
+
184
186
  The architecture includes the following modules:
185
187
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
186
188
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
@@ -194,6 +196,16 @@ The architecture includes the following modules:
194
196
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
195
197
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
196
198
 
199
+ </details>
200
+
201
+ ## ☎ User Groups
202
+
203
+ Please scan the QR code below to join our community groups:
204
+
205
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
206
+ :-------------------------:|:-------------------------:|:-------------------------:
207
+ <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
208
+
197
209
 
198
210
  ## 🎉 News
199
211
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.