evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
tests/cli/test_run.py  CHANGED

@@ -4,6 +4,7 @@ import subprocess
 import torch
 import unittest
 
+from evalscope.constants import EvalType
 from evalscope.run import run_task
 from evalscope.utils import is_module_installed, test_level_list
 from evalscope.utils.logger import get_logger
@@ -70,7 +71,19 @@ class TestRun(unittest.TestCase):
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_task(self):
-        task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
+        task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
+                    'datasets': [
+                        'mmlu_pro',
+                        # 'bbh',
+                        'hellaswag',
+                        # 'gsm8k',
+                        # 'arc'
+                        # 'race',
+                        # 'truthful_qa',
+                        # 'trivia_qa',
+                    ],
+                    'limit': 20,
+                    'debug': True}
         run_task(task_cfg=task_cfg)
 
 
@@ -110,5 +123,43 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+    def test_run_server_model(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='Qwen2.5-7B-Instruct',
+            api_url='http://127.0.0.1:8801/v1/chat/completions',
+            api_key='EMPTY',
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'iquiz',
+                # 'ifeval',
+                # 'mmlu',
+                # 'mmlu_pro',
+                # 'race',
+                # 'trivia_qa',
+                # 'cmmlu',
+                # 'humaneval',
+                # 'competition_math',
+                # 'gsm8k',
+                # 'arc',
+                # 'ceval',
+                # 'bbh',
+                # 'hellaswag',
+            ],
+            dataset_args={
+                'ceval': {
+                    'subset_list': [
+                        'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
+                    ]
+                }
+            },
+            # limit=10
+        )
+
+        run_task(task_cfg=task_cfg)
+
+
 if __name__ == '__main__':
     unittest.main()
tests/rag/test_mteb.py  CHANGED

@@ -79,7 +79,7 @@ class TestMTEB(unittest.TestCase):
                 },
             },
             {
-                'model_name_or_path': '
+                'model_name_or_path': 'BAAI/bge-reranker-v2-m3',
                 'is_cross_encoder': True,
                 'max_seq_length': 512,
                 'prompt': '为这个问题生成一个检索用的表示',
@@ -94,7 +94,8 @@
                 'verbosity': 2,
                 'output_folder': 'outputs',
                 'overwrite_results': True,
-                'limits': 10,
+                # 'limits': 10,
+                'top_k': 10,
             },
         },
     }
evalscope/models/dummy_chat_model.py  DELETED

@@ -1,49 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import random
-import time
-
-from evalscope.models import ChatBaseModel
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class DummyChatModel(ChatBaseModel):
-
-    MODEL_ID = 'dummy_chat_model_0801'
-    REVISION = 'v1.0.0'
-
-    def __init__(self, model_cfg: dict, **kwargs):
-        model_cfg['model_id'] = self.MODEL_ID
-        model_cfg['revision'] = self.REVISION
-        super(DummyChatModel, self).__init__(model_cfg=model_cfg)
-
-    def predict(self, inputs: dict, **kwargs) -> dict:
-
-        debug: bool = False
-        if debug:
-            messages = inputs['messages']
-            history = inputs['history']
-
-            logger.info(f'** messages: {messages}')
-            logger.info(f'** history: {history}')
-
-        choice = random.choice(['A', 'B', 'C', 'D'])
-
-        # Build response
-        res = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': choice,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.MODEL_ID + '-' + self.REVISION,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res
evalscope/models/model_adapter.py  DELETED

@@ -1,525 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright (c) EleutherAI, Inc. and its affiliates.
-# flake8: noqa
-import numpy as np
-import os
-import sys
-import time
-import torch
-from abc import ABC, abstractmethod
-from copy import deepcopy
-from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
-from torch import dtype
-from typing import Any, Dict, List, Union
-
-from evalscope.constants import DEFAULT_MODEL_CACHE_DIR
-from evalscope.models.custom import CustomModel
-from evalscope.utils.chat_service import ChatMessage
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import fix_do_sample_warning
-
-logger = get_logger()
-
-
-class BaseModelAdapter(ABC):
-    """
-    Base class for model adapter.
-    """
-
-    def __init__(self, model, tokenizer, model_cfg: dict):
-        """
-        Args:
-            model: The model instance which is compatible with
-                AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers.
-            tokenizer: The tokenizer instance which is compatible with AutoTokenizer of transformers.
-            model_cfg:
-                Attributes: model_id, model_revision, device_map, torch_dtype
-        """
-        self.model = model
-        self.tokenizer = tokenizer
-        self.model_cfg = model_cfg
-
-    @abstractmethod
-    @torch.no_grad()
-    def predict(self, *args, **kwargs) -> Any:
-        """
-        Model prediction func.
-        """
-        raise NotImplementedError
-
-
-class MultiChoiceModelAdapter(BaseModelAdapter):
-    """ The multi-choice model adapter. """
-
-    _DEFAULT_MAX_LENGTH = 2048
-
-    def __init__(self,
-                 model_id: str,
-                 device_map: str = 'auto',
-                 torch_dtype: dtype = torch.bfloat16,
-                 model_revision: str = None,
-                 max_length: int = None,
-                 cache_dir: str = None,
-                 **kwargs):
-        """
-        Args:
-            model_id: The model id on ModelScope, or local model_dir. TODO: torch.nn.module to be supported.
-            device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: torch.bfloat16.
-            model_revision: The model revision on ModelScope. Default: None.
-            max_length: The max length of input sequence. Default: None.
-            **kwargs: Other args.
-        """
-        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-
-        self.model_id: str = model_id
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'Device: {self.device}')
-
-        torch_dtype = torch_dtype if torch_dtype is not None else 'auto'
-
-        model_cfg: dict = dict()
-        model_cfg['model_id'] = model_id
-        model_cfg['device_map'] = device_map
-        model_cfg['torch_dtype'] = str(torch_dtype)
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_id,  # self.model_id
-            revision=model_revision,
-            trust_remote_code=True,
-            cache_dir=model_cache_dir,
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id,  # self.model_id
-            revision=model_revision,
-            device_map=device_map,
-            trust_remote_code=True,
-            torch_dtype=torch_dtype,
-            cache_dir=model_cache_dir,
-        )
-
-        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)
-
-        self._max_length = max_length
-
-    @property
-    def max_length(self):
-        if self._max_length:
-            return self._max_length
-        seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, 'model_max_length'):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
-
-    @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
-        """
-        Multi-choice model prediction func.
-
-        Args:
-            inputs (dict): The inputs for a doc. Format:
-                {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
-
-            infer_cfg (dict): inference configuration.
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                      'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
-                      'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-              'model': 'gpt-3.5-turbo-0613',
-              'object': 'chat.completion',
-              'usage': {
-                  'completion_tokens': 17,
-                  'prompt_tokens': 57,
-                  'total_tokens': 74
-              }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-        self.model.generation_config.update(**infer_cfg)
-
-        input_data = inputs['data']
-        multi_choices = inputs['multi_choices']
-
-        output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
-        assert output.shape[0] == 1
-        logits = output.flatten()
-
-        choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
-        softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
-
-        if softval.dtype in {torch.bfloat16, torch.float16}:
-            softval = softval.to(dtype=torch.float32)
-        probs = softval.detach().cpu().numpy()
-        pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D
-
-        res_d = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': pred,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res_d
-
-    @staticmethod
-    def _get_logits(tokenizer, model, inputs: List[str]):
-        input_ids = tokenizer(inputs, padding=False)['input_ids']
-        input_ids = torch.tensor(input_ids, device=model.device)
-        tokens = {'input_ids': input_ids}
-
-        outputs = model(input_ids)['logits']
-        logits = outputs[:, -1, :]
-        log_probs = torch.nn.functional.softmax(logits, dim=-1)
-        return log_probs, {'tokens': tokens}
-
-
-class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
-
-    def __init__(self,
-                 model_id: str,
-                 device_map: str = 'auto',
-                 torch_dtype: dtype = torch.bfloat16,
-                 model_revision: str = None,
-                 cache_dir: str = None,
-                 **kwargs):
-        """
-        Continuation-logits model adapter.
-
-        Args:
-            model_id: The model id on ModelScope, or local model_dir.
-            device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: torch.bfloat16.
-            model_revision: The model revision on ModelScope. Default: None.
-            **kwargs: Other args.
-        """
-
-        super().__init__(
-            model_id=model_id,
-            device_map=device_map,
-            torch_dtype=torch_dtype,
-            model_revision=model_revision,
-            cache_dir=cache_dir,
-            **kwargs)
-
-    @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
-        """
-        Multi-choice model prediction func.
-        Args:
-            inputs (dict): The inputs for a doc. Format:
-                {'data': [(context, continuation), ...]}
-            infer_cfg (dict): inference configuration.
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                      'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
-                      'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-              'model': 'gpt-3.5-turbo-0613',
-              'object': 'chat.completion',
-              'usage': {
-                  'completion_tokens': 17,
-                  'prompt_tokens': 57,
-                  'total_tokens': 74
-              }
-            }
-        """
-        infer_cfg = infer_cfg or {}
-
-        pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
-
-        res_d = {
-            'choices': [{
-                'index': 0,
-                'message': {
-                    'content': pred_list,
-                    'role': 'assistant'
-                }
-            }],
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-        return res_d
-
-    def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
-        self.model.generation_config.update(**infer_cfg)
-        # To predict one doc
-        doc_ele_pred = []
-        for ctx, continuation in inputs:
-
-            # ctx_enc shape: [context_tok_len]  cont_enc shape: [continuation_tok_len]
-            ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
-
-            inputs_tokens = torch.tensor(
-                (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
-                dtype=torch.long,
-                device=self.model.device).unsqueeze(0)
-
-            logits = self.model(inputs_tokens)[0]
-            logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
-
-            logits = logits[:, -len(cont_enc):, :]
-            cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
-            logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
-
-            choice_score = float(logits.sum())
-            doc_ele_pred.append(choice_score)
-
-        # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
-        return doc_ele_pred
-
-    def _encode_pair(self, context, continuation):
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-
-        whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
-        whole_enc = torch.tensor(whole_enc, device=self.device)
-
-        context_enc = self.tokenizer(context, padding=False)['input_ids']
-        context_enc = torch.tensor(context_enc, device=self.device)
-
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-
-        return context_enc, continuation_enc
-
-
-class ChatGenerationModelAdapter(BaseModelAdapter):
-
-    def __init__(self,
-                 model_id: str,
-                 model_revision: str = 'master',
-                 device_map: str = 'auto',
-                 torch_dtype: dtype = 'auto',
-                 cache_dir: str = None,
-                 **kwargs):
-        """
-        Chat completion model adapter. Tasks of chat and generation are supported.
-
-        Args:
-            model_id: The model id on ModelScope, or local model_dir.
-            model_revision: The model revision on ModelScope. Default: None.
-            device_map: The device map for model inference.
-            torch_dtype: The torch dtype for model inference. Default: 'auto'.
-            **kwargs: Other args.
-        """
-
-        custom_generation_config = kwargs.pop('generation_config', None)
-        custom_chat_template = kwargs.pop('chat_template', None)
-        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-
-        self.model_id: str = model_id
-        self.model_revision: str = model_revision
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        logger.warning(f'Device: {self.device}')
-
-        torch_dtype = torch_dtype if torch_dtype is not None else 'auto'
-
-        model_cfg: dict = dict()
-        model_cfg['model_id'] = model_id
-        model_cfg['device_map'] = device_map
-        model_cfg['torch_dtype'] = str(torch_dtype)
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            self.model_id,
-            revision=model_revision,
-            trust_remote_code=True,
-            cache_dir=model_cache_dir,
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_id,
-            revision=model_revision,
-            device_map=device_map,
-            trust_remote_code=True,
-            torch_dtype=torch_dtype,
-            cache_dir=model_cache_dir,
-        )
-
-        self.generation_config = self._parse_generation_config(tokenizer, model)
-
-        if custom_generation_config:
-            logger.info('Updating generation config ...')
-            self.generation_config.update(**custom_generation_config)
-
-        if custom_chat_template:
-            tokenizer.chat_template = custom_chat_template
-            logger.info(f'Using custom chat template: {custom_chat_template}')
-
-        super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg)
-
-    def _parse_generation_config(self, tokenizer, model):
-        generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False))
-
-        try:
-            remote_config = GenerationConfig.from_pretrained(
-                self.model_id, revision=self.model_revision, trust_remote_code=True)
-            generation_config.update(**remote_config.to_dict())
-        except:
-            logger.warning(f'Failed to get generation config of {self.model_id} from model hub, use default.')
-
-        if isinstance(self.model_id, str) and os.path.exists(self.model_id):
-            logger.warning(f'Got local model dir: {self.model_id}')
-
-        if tokenizer.eos_token_id is not None:
-            generation_config.eos_token_id = tokenizer.eos_token_id
-        if tokenizer.pad_token_id is not None:
-            generation_config.pad_token_id = tokenizer.pad_token_id
-        if generation_config.max_new_tokens is None:
-            generation_config.max_new_tokens = 2048
-
-        return generation_config
-
-    def _model_generate(self, query: str, infer_cfg: dict) -> str:
-        messages = [ChatMessage(role='user', content=query)]
-        formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-        input_ids = inputs['input_ids']
-
-        # Process infer_cfg
-        if isinstance(infer_cfg.get('num_return_sequences'), int) and infer_cfg['num_return_sequences'] > 1:
-            infer_cfg['do_sample'] = True
-
-        # stop settings
-        stop = infer_cfg.get('stop', None)
-        eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0] \
-            if stop else self.tokenizer.eos_token_id
-
-        if eos_token_id is not None:
-            infer_cfg['eos_token_id'] = eos_token_id
-            infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token
-
-        self.generation_config.update(**infer_cfg)
-        fix_do_sample_warning(self.generation_config)
-
-        # Run inference
-        output_ids = self.model.generate(input_ids, generation_config=self.generation_config)
-
-        response = self.tokenizer.decode(output_ids[0, len(input_ids[0]):], skip_special_tokens=True)
-        return response
-
-    @torch.no_grad()
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = {}) -> dict:
-
-        # Process inputs
-        if isinstance(inputs, str):
-            query = inputs
-        elif isinstance(inputs, dict):
-            query = inputs['data'][0]
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-        else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
-
-        response = self._model_generate(query, infer_cfg)
-
-        choices_list = [{'index': 0, 'message': {'content': response, 'role': 'assistant'}}]
-
-        res_d = {
-            'choices': choices_list,
-            'created': time.time(),
-            'model': self.model_id,
-            'object': 'chat.completion',
-            'usage': {}
-        }
-
-        return res_d
-
-
-class CustomModelAdapter(BaseModelAdapter):
-
-    def __init__(self, custom_model: CustomModel, **kwargs):
-        """
-        Custom model adapter.
-
-        Args:
-            custom_model: The custom model instance.
-            **kwargs: Other args.
-        """
-        self.custom_model = custom_model
-        super(CustomModelAdapter, self).__init__(model=None, tokenizer=None, model_cfg=custom_model.config)
-
-    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
-        """
-        Model prediction func.
-
-        Args:
-            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
-                str: 'xxx'
-                dict: {'data': [full_prompt]}
-                list: ['xxx', 'yyy', 'zzz']
-            **kwargs: kwargs
-
-        Returns:
-            res (dict): The model prediction results. Format:
-            {
-              'choices': [
-                {
-                  'index': 0,
-                  'message': {
-                      'content': 'xxx',
-                      'role': 'assistant'
-                  }
-                }
-              ],
-              'created': 1677664795,
-              'model': 'gpt-3.5-turbo-0613',  # should be model_id
-              'object': 'chat.completion',
-              'usage': {
-                  'completion_tokens': 17,
-                  'prompt_tokens': 57,
-                  'total_tokens': 74
-              }
-            }
-        """
-        in_prompts = []
-
-        # Note: here we assume the inputs are all prompts for the benchmark.
-        for input_prompt in inputs:
-            if isinstance(input_prompt, str):
-                in_prompts.append(input_prompt)
-            elif isinstance(input_prompt, dict):
-                # TODO: to be supported for continuation list like truthful_qa
-                in_prompts.append(input_prompt['data'][0])
-            elif isinstance(input_prompt, list):
-                in_prompts.append('\n'.join(input_prompt))
-            else:
-                raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
-
-        return self.custom_model.predict(prompts=in_prompts, **kwargs)
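The monolithic `model_adapter.py` removed above is replaced in 0.10.0 by the per-adapter modules listed earlier (`base_adapter.py`, `chat_adapter.py`, `choice_adapter.py`, `custom_adapter.py`, `server_adapter.py`, `local_model.py`). For reference, the last-token multi-choice scoring implemented by the deleted `MultiChoiceModelAdapter.predict` can be sketched in isolation as follows; this is a rough standalone illustration under the same idea, not the new evalscope API, and the checkpoint name and prompt are placeholders:

```python
# Standalone sketch of last-token multi-choice scoring, as done by the removed
# MultiChoiceModelAdapter. The checkpoint id and prompt are placeholders.
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer

model_id = 'qwen/Qwen2-0.5B-Instruct'  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype='auto', device_map='auto', trust_remote_code=True)

prompt = 'Question: ...\nA. ...\nB. ...\nC. ...\nD. ...\nAnswer:'
choices = ['A', 'B', 'C', 'D']

with torch.no_grad():
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(model.device)
    last_logits = model(input_ids).logits[0, -1, :]  # logits predicting the next token

# Score each option by the logit of its final token id, then normalize and pick the best.
choice_ids = [tokenizer(c)['input_ids'][-1] for c in choices]
probs = torch.softmax(last_logits[choice_ids].float(), dim=0)
print(choices[int(torch.argmax(probs))])
```

Scoring only the final prompt position keeps evaluation to one forward pass per question, which is why each answer option is reduced to its last token id, exactly as in the deleted adapter.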