evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of evalscope was flagged as potentially problematic.
Files changed (47)
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +35 -1
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +5 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/perf/arguments.py +2 -1
  16. evalscope/perf/benchmark.py +2 -2
  17. evalscope/perf/main.py +2 -5
  18. evalscope/perf/plugin/api/openai_api.py +2 -2
  19. evalscope/perf/plugin/registry.py +3 -3
  20. evalscope/perf/utils/benchmark_util.py +4 -4
  21. evalscope/perf/utils/db_util.py +66 -22
  22. evalscope/perf/utils/local_server.py +3 -1
  23. evalscope/run.py +45 -82
  24. evalscope/run_arena.py +2 -1
  25. evalscope/summarizer.py +14 -26
  26. evalscope/third_party/longbench_write/eval.py +2 -1
  27. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  28. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  29. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  30. evalscope/tools/combine_reports.py +2 -4
  31. evalscope/tools/rewrite_eval_results.py +1 -1
  32. evalscope/utils/__init__.py +1 -0
  33. evalscope/utils/chat_service.py +1 -1
  34. evalscope/utils/io_utils.py +162 -0
  35. evalscope/utils/logger.py +8 -0
  36. evalscope/utils/utils.py +0 -175
  37. evalscope/version.py +2 -2
  38. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
  40. tests/cli/test_run.py +11 -12
  41. tests/perf/test_perf.py +2 -1
  42. tests/vlm/test_vlmeval.py +3 -2
  43. evalscope/evaluator/humaneval_evaluator.py +0 -158
  44. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  45. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  46. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  47. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/utils/utils.py CHANGED
@@ -5,19 +5,13 @@ import functools
  import hashlib
  import importlib
  import importlib.util
- import json
- import jsonlines as jsonl
  import numpy as np
  import os
  import random
  import re
- import sys
  import torch
- import torch.nn.functional as F
- import yaml
  from typing import Any, Dict, List, Tuple, Union

- from evalscope.constants import DumpMode
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -36,102 +30,6 @@ def test_level_list():
      return TEST_LEVEL_LIST


- def jsonl_to_list(jsonl_file):
-     """
-     Read jsonl file to list.
-
-     Args:
-         jsonl_file: jsonl file path.
-
-     Returns:
-         list: list of lines. Each line is a dict.
-     """
-     res_list = []
-     with jsonl.open(jsonl_file, mode='r') as reader:
-         for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
-             res_list.append(line)
-     return res_list
-
-
- def jsonl_to_reader(jsonl_file):
-     """
-     Read jsonl file to reader object.
-
-     Args:
-         jsonl_file: jsonl file path.
-
-     Returns:
-         reader: jsonl reader object.
-     """
-     with jsonl.open(jsonl_file, mode='r') as reader:
-         return reader
-
-
- def jsonl_to_csv():
-     pass
-
-
- def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
-     """
-     Dump data to jsonl file.
-
-     Args:
-         data_list: data list to be dumped. [{'a': 'aaa'}, ...]
-         jsonl_file: jsonl file path.
-         dump_mode: dump mode. It can be 'overwrite' or 'append'.
-     """
-     if not jsonl_file:
-         raise ValueError('output file must be provided.')
-
-     jsonl_file = os.path.expanduser(jsonl_file)
-
-     if not isinstance(data_list, list):
-         data_list = [data_list]
-
-     if dump_mode == DumpMode.OVERWRITE:
-         dump_mode = 'w'
-     elif dump_mode == DumpMode.APPEND:
-         dump_mode = 'a'
-     with jsonl.open(jsonl_file, mode=dump_mode) as writer:
-         writer.write_all(data_list)
-
-
- def yaml_to_dict(yaml_file) -> dict:
-     """
-     Read yaml file to dict.
-     """
-     with open(yaml_file, 'r') as f:
-         try:
-             stream = yaml.safe_load(f)
-         except yaml.YAMLError as e:
-             logger.error(f'{e}')
-             raise e
-
-     return stream
-
-
- def dict_to_yaml(d: dict, yaml_file: str):
-     """
-     Dump dict to yaml file.
-     """
-     with open(yaml_file, 'w') as f:
-         yaml.dump(d, f, default_flow_style=False)
-
-
- def json_to_dict(json_file) -> dict:
-     """
-     Read json file to dict.
-     """
-     with open(json_file, 'r') as f:
-         try:
-             stream = json.load(f)
-         except json.JSONDecodeError as e:
-             logger.error(f'{e}')
-             raise e
-
-     return stream
-
-
  def get_obj_from_cfg(eval_class_ref: Any, *args, **kwargs) -> Any:
      module_name, spliter, cls_name = eval_class_ref.partition(':')

@@ -300,18 +198,6 @@ class ResponseParser:
          return ''


- def make_outputs_dir(root_dir: str, datasets: list, model_id: str, model_revision: str):
-     if not model_id:
-         model_id = 'default'
-     model_id = model_id.replace('/', '_')
-
-     if not model_revision:
-         model_revision = 'default'
-
-     outputs_dir = os.path.join(root_dir, model_id, model_revision, f"eval_{'-'.join(datasets)}")
-
-     return outputs_dir
-

  def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
      """
@@ -355,67 +241,6 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
      return score


- def split_str_parts_by(text: str, delimiters: List[str]):
-     """Split the text field into parts.
-     Args:
-         text: A text to be split.
-         delimiters: The delimiters.
-     Returns:
-         The split text in list of dicts.
-     """
-     all_start_chars = [d[0] for d in delimiters]
-     all_length = [len(d) for d in delimiters]
-
-     text_list = []
-     last_words = ''
-
-     while len(text) > 0:
-         for char_idx, char in enumerate(text):
-             match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
-             is_delimiter = False
-             for index in match_index:
-                 if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
-                     if last_words:
-                         if text_list:
-                             text_list[-1]['content'] = last_words
-                         else:
-                             text_list.append({'key': '', 'content': last_words})
-                     last_words = ''
-                     text_list.append({'key': delimiters[index]})
-                     text = text[char_idx + all_length[index]:]
-                     is_delimiter = True
-                     break
-             if not is_delimiter:
-                 last_words += char
-             else:
-                 break
-         if last_words == text:
-             text = ''
-
-     text_list[-1]['content'] = last_words
-     return text_list
-
-
- def get_bucket_sizes(max_length: int) -> List[int]:
-     return [max_length // 4 * (i + 1) for i in range(4)]
-
-
- def _get_closet_bucket(bucket_sizes, data_length):
-     """Select the one from bucket_sizes that is closest in distance to
-     data_length. This is required for TorchAcc.
-     """
-     cloest_length = sys.maxsize
-     for b in bucket_sizes:
-         if b == data_length or ((b < cloest_length) and (b > data_length)):
-             cloest_length = b
-
-     if cloest_length == sys.maxsize:
-         bucket_sizes.append(data_length)
-         cloest_length = data_length
-
-     return cloest_length
-
-
  def is_module_installed(module_name):
      try:
          importlib.import_module(module_name)
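Note: the JSONL/YAML/JSON helpers and make_outputs_dir are removed from evalscope/utils/utils.py, while the file list and RECORD show a new evalscope/utils/io_utils.py (+162 lines) and an updated evalscope/utils/__init__.py, so these helpers most likely moved rather than disappeared. A minimal sketch of how downstream code might adapt its imports, assuming the same function names are re-exported from io_utils (not shown in this diff, so verify against the 0.8.1 source):

# Sketch only; the io_utils export names are an assumption based on this diff.
try:
    # evalscope >= 0.8.1: assumed new location of the I/O helpers
    from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
except ImportError:
    # evalscope <= 0.8.0: old location, removed in this release
    from evalscope.utils.utils import dump_jsonl_data, jsonl_to_list

rows = [{'question': 'What is 2 + 2?', 'answer': '4'}]
dump_jsonl_data(rows, 'demo.jsonl')   # write a small JSONL file (default mode overwrites)
print(jsonl_to_list('demo.jsonl'))    # read it back as a list of dicts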
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.8.0'
- __release_datetime__ = '2024-12-15 00:00:00'
+ __version__ = '0.8.1'
+ __release_datetime__ = '2024-12-17 20:00:00'
{evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.8.0
+ Version: 0.8.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
{evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD CHANGED
@@ -1,13 +1,13 @@
  evalscope/__init__.py,sha256=RY0EjssSquqqsysRobElYm9Ix6E41uTXeaeh7lI7kqs,106
  evalscope/arguments.py,sha256=nozBnog45l77jxTFH_lyyJkj04ER3yyIpICepc2tC1Y,3783
- evalscope/config.py,sha256=KYS_O0RdAbruQhqP6mp3rQL0003Oaskx03IroZUGRps,6897
- evalscope/constants.py,sha256=D2MU7bs_qwmcHQ1ge05C5Ekk04XqMyiGxssvKwAecxI,4515
- evalscope/run.py,sha256=5cG81qfdpMN_GtPphvJ7BHboD6LBYHWyodX8ViR1XL4,8874
- evalscope/run_arena.py,sha256=Kmzak4TGdATbOhOCe_zLLRxDvgtkOfs6e4VaxOAzPKk,8550
- evalscope/summarizer.py,sha256=Eq7ZqGKuvrhWVeGriLxHCGupgnJmtvmIGqZYzRNaY8I,6480
- evalscope/version.py,sha256=Xha7v5_YH0Oppyh6iO7HrpSsmv1WCPdQPFtzYTJvG4A,118
+ evalscope/config.py,sha256=ZDN0XVCCXMSSD675Smzm57fNDOx-cZTsNvPboMtYVow,8407
+ evalscope/constants.py,sha256=M5qJ8b7kp-RF52IwBjx5EMjeuiH1e1jdollCsbIT-c4,3753
+ evalscope/run.py,sha256=s_qE1ukrt4HBfRVAPJjC1XiqD9k7rSH7lX8yysyf5do,7279
+ evalscope/run_arena.py,sha256=6nc_S8KL7B3V4SsnpIexfvczHN9kQwHR9R1GXb2sqgI,8586
+ evalscope/summarizer.py,sha256=FgdYz7LlNs5XpDMlj2ULkVQGIg5XVeeWdWJ1_OMweq0,5882
+ evalscope/version.py,sha256=OXwZDg6ML1mbsIw-CBhWRf4zVz2ArW2PFzzLK9FVAZk,118
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/backend/base.py,sha256=l7zUHXX2XToIfU_hkVeTSHT9wWURYumyohXCIgywZBI,1021
+ evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
  evalscope/backend/opencompass/api_meta_template.py,sha256=DaBJg15ZSIjxroXiygl3-4RdmIe_FD7xHbXvjSZmkQA,1706
  evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-oyH3uMGL7M3nPp1WiHU,10381
@@ -68,8 +68,8 @@ evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGy
  evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=B5ZETlQw5XTEDnO-VR5yXjSbbg1eUtjGts7M5msK2ik,5618
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_YF82SXLpkxoJ4nUurmdKSEoJ-qsLY,2129
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/backend/rag_eval/utils/clip.py,sha256=WZovQJGyPI33Y-9bUnanR6fIYJzrXgnjD4zVwUJSgCw,5002
- evalscope/backend/rag_eval/utils/embedding.py,sha256=XWI07YeWDALc2etP4DGluYqrid85nKz1tjM91JLZRmM,6252
+ evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
  evalscope/backend/rag_eval/utils/llm.py,sha256=619eP8pXUcwIBaktBrGNA17j53j9jfg_1JeFDYzMCIE,2582
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
@@ -122,23 +122,23 @@ evalscope/benchmarks/competition_math/__init__.py,sha256=CDK03RXT-X21WcIAlkrCs0r
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=cHWJ6LLIWvftFXjGrOidMlZ1RGUFxPgDjs4wmBPSm1Y,18862
  evalscope/benchmarks/general_qa/__init__.py,sha256=N2t-ehNrl9eVAarlSgJvRapm9yOjhfCWhNPPfcUUy-s,409
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=Y7_d6hmh94W2XbzUnDMX9_uKWcarK0zv4Q4mQWUfSZ8,5869
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=cSW0Mq9__-gh-tVoVXD9Rk6h3h2iZW-Fu3RQ16haJhQ,5878
  evalscope/benchmarks/gsm8k/__init__.py,sha256=CtcG_QM8m5zmvMs2N53d7kcm4_hIgsO2qYPyx-71aLw,313
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=gg65W_pz4mPOBUOwaYIgfUxGKzrmRZRuoEg5xtS8bYg,13830
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=KBI9t5F7XW1Cs44QUA7ultkfsXxLyucH9zNYe-jOQQk,13866
  evalscope/benchmarks/hellaswag/__init__.py,sha256=cY1kluaTqC7AvyzwlQYc3BF_kB3LD1gOpg6i7RDr0cI,415
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=7REJeC8vD8OVtmcqI5TP6cTn88-KOzBs5oOKEZEmESs,8459
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=IIesSMPw1Yya4-LjqJt1QVkpOx8RGKwBYTQtmc0VfaQ,8495
  evalscope/benchmarks/humaneval/__init__.py,sha256=lqSlAf1-8Nzhc1j89sj6yAcaLt9pGhqu15M84bmzamc,333
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=HxAjkIA-Wt5-wb8kNSDMzZRoHflgsNxIfa1BoeVzwog,1660
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=VAO7siedusq9z3b1J3ztFE4XDopYKqmwe2n-Numg7HY,9149
  evalscope/benchmarks/mmlu/__init__.py,sha256=OGiN1J80WDM72y242o7diYT9Rl-jkVEqTNntCl8Vt4M,385
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=8T-fN_Az0gWOyME9nHl3MvcD144TjWknFKcEOMHppAI,15494
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
  evalscope/benchmarks/race/__init__.py,sha256=HVda-CB-Q-N8RbwiVLADXYNY6VLUH-frJ8VCc3jm0Mk,385
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
- evalscope/benchmarks/race/race_adapter.py,sha256=Ppo7bttx15zB-m-UtguIwIXgqpEKAi_ClIOol0hPQiE,9805
+ evalscope/benchmarks/race/race_adapter.py,sha256=WgnWYSctc3VtWm2FAeVDTlxR2hwXsF2tala7n66f5mw,9841
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=eLMVC6tfwty5HqrQuGyWeAF2IhRNajWoO1SkLVemQj4,409
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
@@ -153,12 +153,11 @@ evalscope/cli/cli.py,sha256=yNL3ZeolBc-cVr5D4GByGZWKrmpKIK-48R6wXOXO7Y0,641
  evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,767
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
- evalscope/evaluator/__init__.py,sha256=h_EyZm7vDqBsGx6CkoQVLg0aMy0tE_IG5uEnheubb0s,174
- evalscope/evaluator/evaluator.py,sha256=MGkuJi9o5Hdbj_fN7qolDqP0B47i9i0ksGd1uc-TMn0,18365
- evalscope/evaluator/humaneval_evaluator.py,sha256=245XRxwulGQpjdapwU8CiYJn1xT0XKxl7hdWvzFxLG0,5964
- evalscope/evaluator/rating_eval.py,sha256=VuDIZcmSlsv1tc8znDGesz8ZwpQ7NvZJPv823Quvht0,5566
+ evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
+ evalscope/evaluator/evaluator.py,sha256=nRR6aaa9J8nRfB8QPZwexSrfKDvPkPSGQpFVpbWLeW0,18380
+ evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/evaluator/reviewer/auto_reviewer.py,sha256=YVTJAHK0uz9hNupsdeTXMM2PISECf8phXq0GYPr4law,16378
+ evalscope/evaluator/reviewer/auto_reviewer.py,sha256=nL8k-i92L1iMwjPOnNxzQyZICfukZKJul4ZBvOWkHGw,16414
  evalscope/metrics/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
  evalscope/metrics/math_accuracy.py,sha256=WqLfACuIeVFrX4q6_c2exnTLn2t10-rjv6sfxcqJJ14,1965
@@ -178,17 +177,17 @@ evalscope/models/api/openai_api.py,sha256=PiIvvDYJkn041SJkLoroXwl1B8TtwpB7licVfq
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/perf/arguments.py,sha256=_gW1tq7SbrAZd05N-FbY_oWrQB0Djs4KUaFdXSfFsr8,9112
- evalscope/perf/benchmark.py,sha256=ff9PFFMY5UucuUihcdo6lSf1X9XXoaOmrpBvjDk5Mrw,9599
+ evalscope/perf/arguments.py,sha256=J067vNJF-RObJNZ0oE2RBIBNjliCYcflWtt6aGAt40g,9205
+ evalscope/perf/benchmark.py,sha256=h151QXsVbg7lMe09aH_mxUdPRALIl1A35I9VO2zryEo,9615
  evalscope/perf/http_client.py,sha256=TfnQT9OaBlUCpGwi4ifSJBaaGsn3P2KVBPMGuw-Rqkk,7073
- evalscope/perf/main.py,sha256=-8NsvJZ7uyVfJT9N2lX36KfsHkVTy0r8OcsWPYoKms0,1316
+ evalscope/perf/main.py,sha256=2GrE9wHibprzaw4gmcovdc5ods_EHwoSwwmkFDLTUjQ,1257
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
- evalscope/perf/plugin/registry.py,sha256=PyK3E1AqQFuU4Bs9COvFFCJOaCtmHbfeQOVGtjVYh-I,1304
+ evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
  evalscope/perf/plugin/api/custom_api.py,sha256=IplmkCu8v9yQrY5CeqBEQDWdOfOp3vRkiDYUcvhw2yY,3775
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
- evalscope/perf/plugin/api/openai_api.py,sha256=KRN6EjObTG08mcI82kJD3dGK7DoVMUZzrUZ1AgoLEp0,7007
+ evalscope/perf/plugin/api/openai_api.py,sha256=raa4SaatEphNfWuK6_3ecfe49Vg4yftD6C-enhufJuE,7020
  evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -199,10 +198,10 @@ evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/perf/utils/analysis_result.py,sha256=ig0zPwbUODGh1GUr3GmnNF4lJJp9SQvW0awWiXEIkCI,1212
- evalscope/perf/utils/benchmark_util.py,sha256=xFZSSUoBoFpHRZC69-KS9cK2vqJlL7rIuCEz_MnpnGA,5564
- evalscope/perf/utils/db_util.py,sha256=A2K3otCrNw3K1SMwoYo8a6jekT5nAVvWJepqi31DH28,7479
+ evalscope/perf/utils/benchmark_util.py,sha256=T_pXpSCwCNLJgfzgv3IO7kG61ghTLthVMsXZhBCGP_4,5541
+ evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
- evalscope/perf/utils/local_server.py,sha256=31EQZ8S_SzgSiBFpc9zRU13GXm2jREvRmPDN5qWKgbg,4468
+ evalscope/perf/utils/local_server.py,sha256=A26gqBbxsnZA8CqQospyO50x3prVnD9XiT2l--ERxK0,4566
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
  evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -229,9 +228,9 @@ evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86J
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
  evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
  evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
- evalscope/third_party/longbench_write/eval.py,sha256=bZrpaKg9sPXv2VkUxLpfJiNqMIoIj7Pf3eFMqmDncyY,11229
+ evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
  evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
- evalscope/third_party/longbench_write/longbench_write.py,sha256=1caNiJvmZL2vwDU6oHUE4cdCViZGYE8yBo9EsMcA-Qw,3955
+ evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
  evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
  evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/third_party/longbench_write/resources/judge.txt,sha256=Go1ISY4bUBmEDXXY_DItjAmskuHSaRj5WTNMNH98FSk,1885
@@ -239,7 +238,7 @@ evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26
  evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
  evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
  evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/third_party/longbench_write/tools/data_etl.py,sha256=nmWKOrD-GeZi0ZGH5jLCGuW3qiLTui8ASSxI2z8l6ls,5962
+ evalscope/third_party/longbench_write/tools/data_etl.py,sha256=T7a-4PwZg5alZQh-oTi1zjMxjGmVVZYVwSR9-diZlF8,5971
  evalscope/third_party/toolbench_static/README.md,sha256=Osdnt0_K-setbmYwDPCPRp2LXxamGp2mE8KsOByPPOY,3944
  evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
  evalscope/third_party/toolbench_static/config_default.json,sha256=KrUzeHL2DNiM5FwY7cH3KZlxTwELCQZ6e39nilfUi0M,368
@@ -247,26 +246,27 @@ evalscope/third_party/toolbench_static/config_default.yaml,sha256=-6n6Zyg9eHN2ee
  evalscope/third_party/toolbench_static/eval.py,sha256=do_-lVi_vEoljeLYvt3b_AYSMqpdKzgYnTek9WLSKe8,8236
  evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo6Oo5b22mnHWBCZLDPs,9010
  evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
- evalscope/third_party/toolbench_static/toolbench_static.py,sha256=y4nC9WCBCgBg378aWYAdhmrFte_r_XOkigJs7XJ_iXQ,1930
+ evalscope/third_party/toolbench_static/toolbench_static.py,sha256=ABb9Gy09zMt30tY50AZGxSZ46k3NVEsvuDj6xlLOjeA,1966
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
  evalscope/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/tools/combine_reports.py,sha256=1BJ29IEUKoZLM3NAzg_IpU8B9uhljO9-b_hqAYi9RpA,5078
+ evalscope/tools/combine_reports.py,sha256=JFf3P_GJLPdlSqpv30D8ioPb7dup3tOTktsELmsKXLI,4900
  evalscope/tools/gen_mmlu_subject_mapping.py,sha256=CUmRdReEU7QfMyprh9I56KmHoRww_zUda_JuyxmCL1A,3277
- evalscope/tools/rewrite_eval_results.py,sha256=2lbDHfF_9abK1tUk2UYZZRwzO68eoiE36dXyh_b-mwg,2011
- evalscope/utils/__init__.py,sha256=hDS1xpoAxtVH4-ZQOXstdg7WYmjcGPQ62Kh54FIgkwU,87
+ evalscope/tools/rewrite_eval_results.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
+ evalscope/utils/__init__.py,sha256=ZOri8VHx8LpJBJS90uw8h0Z7gPhtxhjWlBPWuuZgoRE,121
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
- evalscope/utils/chat_service.py,sha256=N8lJPiVtzdqsHypa42wzb15T7hduXUrRPtU3Atf8yg4,8641
+ evalscope/utils/chat_service.py,sha256=VdNPXdFSf-4zxe0Ht74LBcdRNbpb9vzVi86HDEqfXHc,8647
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
- evalscope/utils/logger.py,sha256=IkY0oxkWSvfA0z1m79crioTiqQcnxulNF5HtJNlV0Fc,3174
+ evalscope/utils/io_utils.py,sha256=MnEi4llOYtXK81bUQ_XE_WP5qIsVrJ4MlKmWMH9vzFs,3993
+ evalscope/utils/logger.py,sha256=4OGlkBsut_wzq-1UcM2DKQKdKs1FRNYGHw538TGvypU,3440
  evalscope/utils/model_utils.py,sha256=zMS1YRu4CzU4CVLZS6e_lgfHIDBqv3YBTJbPF1R2M90,443
- evalscope/utils/utils.py,sha256=PVtpv3WAIm6Bs66Vz4KBDiAiXp8y6Oejxxr1LWHTRsI,15146
+ evalscope/utils/utils.py,sha256=lZl5lt4WqjoY5SEfsum8Sc-s_c9GSlmIZlkTAQkMnjE,10485
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- tests/cli/test_run.py,sha256=lXR35DDLQjdb-XGA6pKnQC9pJTfTOHjknAN7PEaw8G4,4334
+ tests/cli/test_run.py,sha256=pMZvI3b0Vs-UFfciDoPwCYFAaYJzocQjxEaMLFTxYSo,4289
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- tests/perf/test_perf.py,sha256=GD5nInXpQG7H1E8wI6dvy4DFSvTEddGDzv-Cu8YV1ts,2995
+ tests/perf/test_perf.py,sha256=AQB2QuMwJ1TnenHFPBF4YAtifbR0D0pSobP6xmDysqw,3023
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
  tests/rag/test_mteb.py,sha256=CaEJ0f1M06Z90c72FQb9z23IC_KZtkURWsc_oRMgQn8,4609
@@ -276,10 +276,10 @@ tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p
  tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- tests/vlm/test_vlmeval.py,sha256=21xi0nu4ghDB6_X-Pol7pTfK7aYkAYOp82TQ-MSQv-I,1757
- evalscope-0.8.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
- evalscope-0.8.0.dist-info/METADATA,sha256=5RKZaNBwuJj84sdAXlNmT11Bm8kGYha6EYnqszwZ1Qk,23190
- evalscope-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- evalscope-0.8.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
- evalscope-0.8.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
- evalscope-0.8.0.dist-info/RECORD,,
+ tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
+ evalscope-0.8.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+ evalscope-0.8.1.dist-info/METADATA,sha256=HydrEYb1OxbvVUMl11oLekV2sjvlgQQvtEpkcNAiW5A,23190
+ evalscope-0.8.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ evalscope-0.8.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+ evalscope-0.8.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+ evalscope-0.8.1.dist-info/RECORD,,
tests/cli/test_run.py CHANGED
@@ -70,7 +70,7 @@ class TestRun(unittest.TestCase):

      @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
      def test_run_task(self):
-         task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k'], 'limit': 2, 'debug': False}
+         task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['bbh', 'gsm8k', 'arc'], 'limit': 2, 'debug': False}
          run_task(task_cfg=task_cfg)


@@ -80,33 +80,32 @@

          task_cfg = TaskConfig(
              model='qwen/Qwen2-0.5B-Instruct',
-             datasets=['ceval'],  # data format; the multiple-choice format is fixed to 'ceval'
+             datasets=['ceval', 'general_qa'],  # data format; the multiple-choice format is fixed to 'ceval'
              dataset_args={
                  'ceval': {
                      'local_path': 'custom_eval/text/mcq',  # custom dataset path
                      'subset_list': [
                          'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
                      ]
+                 },
+                 'general_qa': {
+                     'local_path': 'custom_eval/text/qa',  # custom dataset path
+                     'subset_list': [
+                         'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
+                     ]
                  }
              },
          )
          run_task(task_cfg=task_cfg)

      @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-     def test_run_custom_qa(self):
+     def test_run_humaneval(self):
          from evalscope.config import TaskConfig

          task_cfg = TaskConfig(
              model='qwen/Qwen2-0.5B-Instruct',
-             datasets=['general_qa'],  # data format; the multiple-choice format is fixed to 'ceval'
-             dataset_args={
-                 'general_qa': {
-                     'local_path': 'custom_eval/text/qa',  # custom dataset path
-                     'subset_list': [
-                         'example'  # evaluation subset name, i.e. the * in the *_dev.csv above
-                     ]
-                 }
-             },
+             datasets=['humaneval'],
+             limit=2
          )

          run_task(task_cfg=task_cfg)
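The renamed test exercises the reworked HumanEval flow in this release (humaneval_adapter.py grows by roughly 186 lines and evaluator/humaneval_evaluator.py is deleted). A minimal sketch mirroring that test; the TaskConfig import appears in the diff, while the run_task import path (evalscope.run) is an assumption:

# Sketch mirroring tests/cli/test_run.py::test_run_humaneval above.
from evalscope.config import TaskConfig
from evalscope.run import run_task  # assumed import path, not shown in the hunk

task_cfg = TaskConfig(
    model='qwen/Qwen2-0.5B-Instruct',
    datasets=['humaneval'],  # handled by the reworked humaneval adapter in 0.8.1
    limit=2,                 # evaluate only two problems as a quick smoke test
)
run_task(task_cfg=task_cfg)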
tests/perf/test_perf.py CHANGED
@@ -25,6 +25,7 @@ class TestPerf(unittest.TestCase):
              'number': 15,
              'api': 'openai',
              'dataset': 'openqa',
+             'stream': True,
              'debug': True,
          }
          run_perf_benchmark(task_cfg)
@@ -46,7 +47,7 @@
      @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
      def test_run_perf_speed_benchmark(self):
          task_cfg = {
-             'url': 'http://127.0.0.1:8000/v1/completions',
+             'url': 'http://127.0.0.1:8801/v1/completions',
              'parallel': 1,
              'model': 'qwen2.5',
              'api': 'openai',
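The perf test now sets 'stream': True, which lines up with the evalscope/perf/arguments.py change in this release. A hedged sketch of a streaming perf run assembled from the two test configs above; the run_perf_benchmark import path (evalscope.perf.main) and the endpoint URL are assumptions:

# Sketch only; verify import path and endpoint against your local setup.
from evalscope.perf.main import run_perf_benchmark  # assumed import path

task_cfg = {
    'url': 'http://127.0.0.1:8801/v1/chat/completions',  # assumed local OpenAI-compatible server
    'parallel': 1,
    'model': 'qwen2.5',
    'number': 15,
    'api': 'openai',
    'dataset': 'openqa',
    'stream': True,   # new in the 0.8.1 test config: request streaming responses
    'debug': True,
}
run_perf_benchmark(task_cfg)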
tests/vlm/test_vlmeval.py CHANGED
@@ -40,8 +40,9 @@ class TestVLMEval(unittest.TestCase):
                  }],  # model name for VLMEval config
                  'nproc': 1,
                  'reuse': True,
-                 'work_dir': 'outputs'
-             }
+             },
+             'work_dir': 'outputs',
+             'use_cache': 'outputs/20241216_142838'
          }

          logger.info(f'>> Start to run task: {task_cfg}')
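In this test, work_dir moves out of the nested eval config and a top-level use_cache entry points at a previous run's output directory. A sketch of the resulting task_cfg shape implied by the hunk; the 'eval_backend' key and the elided model/data entries are assumptions for illustration:

task_cfg = {
    'eval_backend': 'VLMEvalKit',   # assumed backend selector, not shown in the hunk
    'eval_config': {
        # ... model/data settings as before ...
        'nproc': 1,
        'reuse': True,
    },
    'work_dir': 'outputs',                   # now a top-level key (was inside eval_config)
    'use_cache': 'outputs/20241216_142838',  # reuse results from an earlier run
}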