evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/report/utils.py ADDED
@@ -0,0 +1,133 @@
+ import json
+ import pandas as pd
+ from collections import defaultdict
+ from dataclasses import asdict, dataclass, field
+ from typing import Any, Dict, List
+
+ from evalscope.metrics import macro_mean, micro_mean
+ from evalscope.utils import normalize_score
+
+
+ @dataclass
+ class Subset:
+     name: str = 'default_subset'
+     score: float = 0.0
+     num: int = 0
+
+     def __post_init__(self):
+         self.score = normalize_score(self.score)
+
+
+ @dataclass
+ class Category:
+     name: tuple[str] = field(default_factory=tuple)
+     num: int = 0
+     score: float = 0.0
+     macro_score: float = 0.0
+     subsets: List[Subset] = field(default_factory=list)
+
+     def __post_init__(self):
+         if isinstance(self.name, str):
+             # ensure name is tuple format
+             self.name = (self.name, )
+         self.num = sum(subset.num for subset in self.subsets)
+         self.score = normalize_score(micro_mean(self.subsets))
+         self.macro_score = normalize_score(macro_mean(self.subsets))
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+         return cls(name=data['name'], subsets=subsets)
+
+
+ @dataclass
+ class Metric:
+     name: str = 'default_metric'
+     num: int = 0
+     score: float = 0.0
+     macro_score: float = 0.0
+     categories: List[Category] = field(default_factory=list)
+
+     def __post_init__(self):
+         self.num = sum(category.num for category in self.categories)
+         self.score = normalize_score(micro_mean(self.categories))
+         self.macro_score = normalize_score(macro_mean(self.categories))
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         categories = [Category.from_dict(category) for category in data.get('categories', [])]
+         return cls(name=data['name'], categories=categories)
+
+
+ class ReportKey:
+     model_name = 'Model'
+     dataset_name = 'Dataset'
+     metric_name = 'Metric'
+     category_name = 'Category'
+     category_prefix = 'Cat.'
+     subset_name = 'Subset'
+     num = 'Num'
+     score = 'Score'
+
+
+ @dataclass
+ class Report:
+     name: str = 'default_report'
+     dataset_name: str = 'default_dataset'
+     model_name: str = 'default_model'
+     score: float = 0.0
+     metrics: List[Metric] = field(default_factory=list)
+
+     def __post_init__(self):
+         self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+     def to_dict(self) -> Dict[str, Any]:
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+         return cls(
+             name=data['name'],
+             score=data['score'],
+             metrics=metrics,
+             dataset_name=data['dataset_name'],
+             model_name=data['model_name'])
+
+     @classmethod
+     def from_json(cls, json_file: str):
+         with open(json_file, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+         table = defaultdict(list)
+         for metric in self.metrics:
+             for category in metric.categories:
+                 for subset in category.subsets:
+                     table[ReportKey.model_name].append(self.model_name)
+                     table[ReportKey.dataset_name].append(self.dataset_name)
+                     table[ReportKey.metric_name].append(metric.name)
+                     table[ReportKey.category_name].append(category.name)
+                     table[ReportKey.subset_name].append(subset.name)
+                     table[ReportKey.num].append(subset.num)
+                     table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+             # NOTE: only flatten metrics if needed, use the first metric by default
+             if not flatten_metrics:
+                 break
+         df = pd.DataFrame.from_dict(table, orient='columns')
+         if flatten_categories:
+             df = self._flatten_categories(df)
+         return df
+
+     def _flatten_categories(self, df: pd.DataFrame):
+         # expand categories to multiple rows
+         df_categories = df.copy()
+         # multi-level aggregation for categories
+         max_depth = df_categories[ReportKey.category_name].apply(len).max()
+         for level in range(max_depth):
+             df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
+                 lambda x: x[level] if len(x) > level else None)
+
+         df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+         return df_categories
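The file above defines the new hierarchical report schema (Report → Metric → Category → Subset). A minimal usage sketch, assuming the file is evalscope/report/utils.py as listed in the table of changed files; the JSON path is illustrative, only from_json/to_dataframe and the ReportKey column names come from the code above:

```python
from evalscope.report.utils import Report, ReportKey

# Illustrative path to a report JSON produced by an evaluation run.
report = Report.from_json('outputs/reports/qwen2/gsm8k.json')

# One row per (metric, category, subset); column names come from ReportKey.
df = report.to_dataframe(flatten_metrics=True, flatten_categories=True)
print(df[[ReportKey.model_name, ReportKey.subset_name, ReportKey.num, ReportKey.score]])
```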
evalscope/run.py CHANGED
@@ -2,26 +2,21 @@
  """
  Run evaluation for LLMs.
  """
- import logging
  import os.path
- import torch
  from argparse import Namespace
  from datetime import datetime
- from typing import List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

- from evalscope.arguments import parse_args
  from evalscope.config import TaskConfig, parse_task_config
- from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
- from evalscope.evaluator import Evaluator
- from evalscope.models.custom import CustomModel
- from evalscope.utils import import_module_util, seed_everything
- from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+ from evalscope.constants import DataCollection, EvalBackend
+ from evalscope.utils import seed_everything
+ from evalscope.utils.io_utils import OutputsStructure
  from evalscope.utils.logger import configure_logging, get_logger

- logger = get_logger()
+ if TYPE_CHECKING:
+     from evalscope.models import LocalModel

- BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
- MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
+ logger = get_logger()


  def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
@@ -38,15 +33,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]

  def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      """Run a single evaluation task."""
-     seed_everything(task_cfg.seed)
+     if task_cfg.seed is not None:
+         seed_everything(task_cfg.seed)
      outputs = setup_work_directory(task_cfg, run_time)
      configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

-     task_cfg.dump_yaml(outputs.configs_dir)
-     logger.info(task_cfg)
-
      if task_cfg.eval_backend != EvalBackend.NATIVE:
-         return run_non_native_backend(task_cfg)
+         return run_non_native_backend(task_cfg, outputs)
      else:
          return evaluate_model(task_cfg, outputs)

@@ -56,8 +49,8 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
      if task_cfg.use_cache:
          task_cfg.work_dir = task_cfg.use_cache
          logger.info(f'Set resume from {task_cfg.work_dir}')
-     elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
-         task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+     task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

      outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

@@ -68,7 +61,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
      return outputs


- def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Run evaluation using a non-native backend."""
      eval_backend = task_cfg.eval_backend
      eval_config = task_cfg.eval_config
@@ -78,6 +71,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:

      backend_manager_class = get_backend_manager_class(eval_backend)
      backend_manager = backend_manager_class(config=eval_config)
+
+     task_cfg.dump_yaml(outputs.configs_dir)
+     logger.info(task_cfg)
+
      backend_manager.run()

      return dict()
@@ -100,78 +97,57 @@ def get_backend_manager_class(eval_backend: EvalBackend):

  def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Evaluate the model based on the provided task configuration."""
+     from evalscope.models import get_local_model
+
      # Initialize evaluator
      eval_results = {}
-
+     base_model = get_local_model(task_cfg)
+     evaluators = []
      for dataset_name in task_cfg.datasets:
-         evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+         evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+         evaluators.append(evaluator)
+
+     # dump task_cfg to outputs.configs_dir after creating evaluators
+     task_cfg.dump_yaml(outputs.configs_dir)
+     logger.info(task_cfg)
+
+     for evaluator in evaluators:
          res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
          eval_results[dataset_name] = res_dict

      return eval_results


- def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
      """Create an evaluator object for the specified dataset."""
-     imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-     model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
-
-     dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-     dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-     in_prompt_template = dataset_config.get('prompt_template', '')
-     few_shot_num = dataset_config.get('few_shot_num', None)
-     few_shot_random = dataset_config.get('few_shot_random', True)
-
-     data_adapter = imported_modules['DataAdapterClass'](
-         few_shot_num=few_shot_num,
-         few_shot_random=few_shot_random,
-         prompt_template=in_prompt_template,
-         outputs=outputs,
-     )
-     in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
+     from evalscope.benchmarks import Benchmark, BenchmarkMeta
+     from evalscope.evaluator import Evaluator
+     from evalscope.models import initialize_model_adapter
+
+     if dataset_name == DataCollection.NAME:
+         # EvaluatorCollection is a collection of evaluators
+         from evalscope.collections import EvaluatorCollection
+         return EvaluatorCollection(task_cfg, outputs)

-     logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+     benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+     # update task_cfg.dataset_args
+     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

      return Evaluator(
-         dataset_name_or_path=dataset_name_or_path,
-         subset_list=in_subset_list,
+         dataset_name_or_path=benchmark.dataset_id,
          data_adapter=data_adapter,
          model_adapter=model_adapter,
-         use_cache=task_cfg.use_cache,
          outputs=outputs,
-         datasets_dir=task_cfg.dataset_dir,
-         datasets_hub=task_cfg.dataset_hub,
-         stage=task_cfg.stage,
-         eval_type=task_cfg.eval_type,
-         overall_task_cfg=task_cfg,
+         task_cfg=task_cfg,
      )


- def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-     """Initialize the model adapter based on the task configuration."""
-     if task_cfg.dry_run:
-         from evalscope.models.dummy_chat_model import DummyChatModel
-         return DummyChatModel(model_cfg=dict())
-     elif task_cfg.eval_type == EvalType.CUSTOM:
-         if not isinstance(task_cfg.model, CustomModel):
-             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-         from evalscope.models.model_adapter import CustomModelAdapter
-         return CustomModelAdapter(custom_model=task_cfg.model)
-     else:
-         device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-         model_precision = task_cfg.model_args.get('precision', torch.float16)
-         if isinstance(model_precision, str) and model_precision != 'auto':
-             model_precision = eval(model_precision)
-         return imported_modules['ModelAdapterClass'](
-             model_id=task_cfg.model,
-             model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-             device_map=device_map,
-             torch_dtype=model_precision,
-             generation_config=task_cfg.generation_config,
-             chat_template=task_cfg.chat_template)
-
-
  def main():
+     from evalscope.arguments import parse_args
      args = parse_args()
      run_task(args)

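run.py now resolves each benchmark through the Benchmark registry, shares one LocalModel across all evaluators, and skips seeding when no seed is set. A minimal sketch of the native flow; the model id, dataset choice, and dataset_args keys are illustrative assumptions, while the field names datasets, dataset_args, limit, and seed all appear in the diff above:

```python
from evalscope.config import TaskConfig
from evalscope.run import run_task

# Illustrative values; only the field names are taken from this diff.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['gsm8k'],
    dataset_args={'gsm8k': {'few_shot_num': 4}},  # assumed to still accept few_shot_num
    limit=10,   # cap the number of evaluated samples for a quick smoke test
    seed=42,    # seeding is skipped entirely when seed is None
)

results = run_task(task_cfg)  # {dataset_name: result_dict}
print(results.keys())
```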
evalscope/run_arena.py CHANGED
@@ -10,7 +10,7 @@ from tqdm import tqdm

  from evalscope.constants import EvalConfigKeys
  from evalscope.evaluator.rating_eval import RatingEvaluate
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils import get_obj_from_cfg
  from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
  from evalscope.utils.logger import get_logger
evalscope/summarizer.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Union

  from evalscope.config import TaskConfig, parse_task_config
  from evalscope.constants import EvalBackend
- from evalscope.tools.combine_reports import gen_table
+ from evalscope.report import gen_table
  from evalscope.utils import csv_to_list, get_latest_folder_path
  from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger
evalscope/utils/__init__.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- from evalscope.constants import *
+ from evalscope.utils.model_utils import EvalBackend
  from evalscope.utils.utils import *
evalscope/utils/chat_service.py CHANGED
@@ -3,11 +3,9 @@ import time
  import torch
  from contextlib import contextmanager
  from functools import partial
- from modelscope import AutoModelForCausalLM, AutoTokenizer
  from pydantic import BaseModel, Field
  from threading import Thread
- from transformers import TextIteratorStreamer
- from typing import List, Literal, Optional, Union
+ from typing import Any, List, Literal, Optional, Union


  class Usage(BaseModel):
@@ -66,7 +64,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
  class ChatCompletionResponse(BaseModel):
      model: str
      object: Literal['chat.completion', 'chat.completion.chunk']
-     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+     choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
      created: Optional[int] = Field(default_factory=lambda: int(time.time()))
      usage: Optional[Usage]

@@ -96,6 +94,9 @@ class TextCompletionResponse(BaseModel):
  class ChatService:

      def __init__(self, model_path, attn_implementation):
+         from modelscope import AutoModelForCausalLM, AutoTokenizer
+         from transformers import TextIteratorStreamer
+
          self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
          self.model = AutoModelForCausalLM.from_pretrained(
              model_path,
evalscope/utils/io_utils.py CHANGED
@@ -160,3 +160,11 @@ def are_paths_same(path1, path2):
      real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))

      return real_path1 == real_path2
+
+
+ def dict_to_json(d: dict, json_file: str):
+     """
+     Dump dict to json file.
+     """
+     with open(json_file, 'w') as f:
+         json.dump(d, f, indent=4, ensure_ascii=False)
evalscope/utils/logger.py CHANGED
@@ -14,6 +14,11 @@ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else

  logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)

+ # disable datasets logging
+ logging.getLogger('datasets').setLevel(logging.WARNING)
+ logging.getLogger('modelscope').setLevel(logging.WARNING)
+ logging.getLogger('httpx').setLevel(logging.WARNING)
+

  def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
      """Get logging logger
evalscope/utils/model_utils.py CHANGED
@@ -1,7 +1,20 @@
- from transformers import GenerationConfig
+ from enum import Enum
+ from typing import TYPE_CHECKING

+ if TYPE_CHECKING:
+     from transformers import GenerationConfig

- def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
+
+ class EvalBackend(Enum):
+     # NOTE: compatible with ms-swfit v2.x
+     NATIVE = 'Native'
+     OPEN_COMPASS = 'OpenCompass'
+     VLM_EVAL_KIT = 'VLMEvalKit'
+     RAG_EVAL = 'RAGEval'
+     THIRD_PARTY = 'ThirdParty'
+
+
+ def fix_do_sample_warning(generation_config: 'GenerationConfig') -> None:
      # Use the default values of temperature/top_p/top_k in generation_config.
      if generation_config.temperature == 0:
          generation_config.do_sample = False
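With EvalBackend moved into evalscope.utils.model_utils (and re-exported via evalscope/utils/__init__.py above), importing it no longer pulls in transformers at module load time. A small sketch; the enum members and values are exactly those added above, the loop and formatting are illustrative:

```python
from evalscope.utils.model_utils import EvalBackend

# run.py dispatches on this enum: anything other than NATIVE is routed
# through a backend manager (OpenCompass, VLMEvalKit, RAGEval, ThirdParty).
for backend in EvalBackend:
    print(f'{backend.name:13s} -> {backend.value}')
# NATIVE        -> Native
# OPEN_COMPASS  -> OpenCompass
# ...
```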
evalscope/utils/utils.py CHANGED
@@ -121,7 +121,6 @@ class ResponseParser:
          f'([{options_concat}])\s?是正确答案',
          f'选项\s?([{options_concat}])\s?正确',
          f'所以答\s?([{options_concat}])',
-         f'1.\s?([{options_concat}])[.。$]?$',
          f'所以\s?([{options_concat}][.。$]?$)',
          f'所有\s?([{options_concat}][.。$]?$)',
          f'[\s,::,]([{options_concat}])[。,,\.]?$',
@@ -137,16 +136,15 @@ class ResponseParser:
          f'答案为(.*?)[{options_concat}]',
          f'固选(.*?)[{options_concat}]',
          f'答案应该是(.*?)[{options_concat}]',
-         f'[Tt]he answer is [{options_concat}]',
+         f'[Tt]he answer is \(?[{options_concat}]\)?',
          f'[Tt]he correct answer is [{options_concat}]',
          f'[Tt]he correct answer is:\n[{options_concat}]',
          f'(\s|^)[{options_concat}][\s。,,\.$]', # noqa
-         f'[{options_concat}]',
          f'^选项\s?([{options_concat}])',
          f'^([{options_concat}])\s?选?项',
          f'(\s|^)[{options_concat}][\s。,,::\.$]',
          f'(\s|^)[{options_concat}](\s|$)',
-         f'1.\s?(.*?)$',
+         f'[{options_concat}]',
      ]

      regexes = [re.compile(pattern) for pattern in patterns]
@@ -169,6 +167,7 @@ class ResponseParser:
      """
      patterns = [
          r'[Aa]nswer:\s*(\w+)',
+         r'answer is \(?(\w+)\)?',
          r'[Tt]he correct answer is:\s*(\w+)',
          r'[Tt]he correct answer is:\n\s*(\w+)',
          r'[Tt]he correct answer is:\n\n-\s*(\w+)',
@@ -199,27 +198,6 @@ class ResponseParser:



- def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
-     """
-     Import module utility function.
-
-     Args:
-         import_path_prefix: e.g. 'evalscope.benchmarks.'
-         module_name: The module name to import. e.g. 'mmlu'
-         members_to_import: The members to import.
-             e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']
-
-     Returns:
-         dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
-     """
-     imported_modules = {}
-     module = importlib.import_module(import_path_prefix + module_name)
-     for member_name in members_to_import:
-         imported_modules[member_name] = getattr(module, member_name)
-
-     return imported_modules
-
-
  def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
      """
      Normalize score.
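The ResponseParser changes above accept answers wrapped in parentheses ("The answer is (B)") and move the bare single-letter pattern to the end of the list, so it only applies when nothing more specific matches. A standalone sketch of the regex change using plain re; options_concat = 'ABCD' is an illustrative value and ResponseParser itself is not imported:

```python
import re

options_concat = 'ABCD'  # e.g. a four-option multiple-choice question

old_pattern = re.compile(f'[Tt]he answer is [{options_concat}]')
new_pattern = re.compile(rf'[Tt]he answer is \(?[{options_concat}]\)?')

response = 'The answer is (B).'
print(bool(old_pattern.search(response)))  # False: the parenthesis breaks the old pattern
print(bool(new_pattern.search(response)))  # True: the optional \(? ... \)? now matches
```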
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.8.2'
- __release_datetime__ = '2024-12-26 20:00:00'
+ __version__ = '0.10.0'
+ __release_datetime__ = '2025-01-20 20:00:00'