evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/evaluator/evaluator.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
-from evalscope.report import Report,
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -108,7 +108,6 @@ class Evaluator(object):
         return answer_d

     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
-        answers_list = []
         try:
             # get answer from model
             answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
@@ -117,10 +116,11 @@
             # if ignore_errors is True, continue to next input
             if self.task_cfg.ignore_errors:
                 logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return
+                return []
             else:
                 raise e
         # process answer
+        answers_list = []
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -399,8 +399,9 @@

         # Make table
         try:
-            report_table =
-            logger.info(f'{self.dataset_name_or_path} report table:
+            report_table = gen_table(report_list=[report_map], add_overall_metric=True)
+            logger.info(f'\n{self.dataset_name_or_path} report table:'
+                        f'\n{report_table} \n')
         except Exception:
             logger.error('Failed to generate report table.')

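The `_get_answer` change matters for callers that iterate over the returned answers: returning `[]` instead of bare `return` when `ignore_errors` drops a prompt keeps downstream loops safe. A minimal, self-contained sketch (not evalscope code) of that behaviour:

from typing import List


def get_answers_or_empty(ignore_errors: bool = True) -> List[dict]:
    # stand-in for Evaluator._get_answer's error path after this change
    try:
        raise RuntimeError('simulated model failure')
    except RuntimeError:
        if ignore_errors:
            return []  # previously None, which would break the loop below
        raise


for answer in get_answers_or_empty():
    print(answer)  # the loop simply runs zero times when the prompt was dropped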
evalscope/metrics/t2v_metrics/__init__.py
CHANGED
@@ -1,66 +1,52 @@
-from __future__ import absolute_import, division, print_function
-
-from .clipscore import CLIPScore, list_all_clipscore_models
-from .constants import CACHE_DIR
-from .itmscore import ITMScore, list_all_itmscore_models
-from .vqascore import VQAScore, list_all_vqascore_models
-
-
-def list_all_models():
-    return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
-
-
-def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
-    if model in list_all_vqascore_models():
-        return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_clipscore_models():
-        return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    elif model in list_all_itmscore_models():
-        return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
-    else:
-        raise NotImplementedError()
-
-
 def clip_flant5_score():
+    from .vqascore import VQAScore
     clip_flant5_score = VQAScore(model='clip-flant5-xxl')
     return clip_flant5_score


 def pick_score():
+    from .clipscore import CLIPScore
     pick_score = CLIPScore(model='pickscore-v1')
     return pick_score


 def clip_score():
+    from .clipscore import CLIPScore
     clip_score = CLIPScore(model='openai:ViT-L-14-336')
     return clip_score


 def blip2_score():
+    from .itmscore import ITMScore
     blip_itm_score = ITMScore(model='blip2-itm')
     return blip_itm_score


 def hpsv2_score():
+    from .clipscore import CLIPScore
     hpsv2_score = CLIPScore(model='hpsv2')
     return hpsv2_score


 def hpsv2_1_score():
+    from .clipscore import CLIPScore
     hpsv2_1_score = CLIPScore(model='hpsv2.1')
     return hpsv2_1_score


 def image_reward_score():
+    from .itmscore import ITMScore
     image_reward_score = ITMScore(model='image-reward-v1')
     return image_reward_score


 def fga_blip2_score():
+    from .itmscore import ITMScore
     fga_blip2_score = ITMScore(model='fga_blip2')
     return fga_blip2_score


 def mps_score():
+    from .clipscore import CLIPScore
     mps_score = CLIPScore(model='mps')
     return mps_score
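This rewrite moves the scorer imports inside each factory function, so importing `evalscope.metrics.t2v_metrics` no longer pulls in every backend up front. A small, self-contained sketch of the same lazy-import pattern (the module name passed in is a stand-in, not a real scorer backend):

import importlib


def make_scorer(backend_module: str, model_name: str) -> str:
    # the backend is resolved only when the factory is called,
    # mirroring `from .clipscore import CLIPScore` inside clip_score()
    backend = importlib.import_module(backend_module)
    return f'{backend.__name__} scorer for {model_name}'


# nothing heavy is imported at module load time; the cost is paid here instead
print(make_scorer('json', 'openai:ViT-L-14-336'))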
evalscope/models/adapters/__init__.py
CHANGED
@@ -1,4 +1,5 @@
 from .base_adapter import BaseModelAdapter, initialize_model_adapter
+from .bfcl_adapter import BFCLAdapter
 from .chat_adapter import ChatGenerationModelAdapter
 from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
 from .custom_adapter import CustomModelAdapter
@@ -13,5 +14,6 @@ __all__ = [
     'MultiChoiceModelAdapter',
     'CustomModelAdapter',
     'ServerModelAdapter',
+    'BFCLAdapter',
     'T2IModelAdapter',
 ]
evalscope/models/adapters/base_adapter.py
CHANGED
@@ -44,35 +44,39 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
         return CustomModelAdapter(custom_model=task_cfg.model)
-    elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
-        from evalscope.models import ServerModelAdapter
-
-        if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
-            logger.warning('Output type is set to logits. This is not supported for service evaluation. '
-                           'Setting output type to generation by default.')
-            benchmark.model_adapter = OutputType.GENERATION
-
-        return ServerModelAdapter(
-            api_url=task_cfg.api_url,
-            model_id=task_cfg.model,
-            api_key=task_cfg.api_key,
-            seed=task_cfg.seed,
-            timeout=task_cfg.timeout,
-            stream=task_cfg.stream,
-        )
     else:
         from ..register import get_model_adapter

-        #
+        # we need to determine the model adapter class based on the output type
        model_adapter_cls_str = benchmark.model_adapter
-        if model_adapter_cls_str not in benchmark.output_types:
-            logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}. '
-                           f'Using {benchmark.output_types[0]} instead.')
-            model_adapter_cls_str = benchmark.output_types[0]

-
-
-
-
-
-
+        if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
+
+            if 'server' not in model_adapter_cls_str:
+                model_adapter_cls_str = 'server'
+                logger.info(
+                    f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')
+
+            # init server model adapter
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+
+            return model_adapter_cls(
+                api_url=task_cfg.api_url,
+                model_id=task_cfg.model,
+                api_key=task_cfg.api_key,
+                seed=task_cfg.seed,
+                timeout=task_cfg.timeout,
+                stream=task_cfg.stream,
+            )
+        else:
+            if model_adapter_cls_str not in benchmark.output_types:
+                logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
+                               f'Using {benchmark.output_types[0]} instead.')
+                model_adapter_cls_str = benchmark.output_types[0]
+
+            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
+            return model_adapter_cls(
+                model=base_model,
+                generation_config=task_cfg.generation_config,
+                chat_template=task_cfg.chat_template,
+                task_cfg=task_cfg)
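The refactored `initialize_model_adapter` now routes API/service evaluation through the same adapter registry as local models, forcing a 'server'-style key unless the benchmark already requests one (as BFCL does). A simplified sketch of that dispatch order; the registry contents and keys below are illustrative stand-ins, not evalscope's real registry:

ADAPTER_REGISTRY = {
    'server': 'ServerModelAdapter',
    'bfcl_server': 'BFCLAdapter',
    'generation': 'ChatGenerationModelAdapter',
}


def resolve_adapter(eval_type: str, benchmark_adapter: str, supported_output_types: list) -> str:
    if eval_type == 'service':
        # keep a benchmark-specific server adapter, otherwise fall back to the generic one
        key = benchmark_adapter if 'server' in benchmark_adapter else 'server'
    else:
        # local models: fall back to the benchmark's first supported output type
        key = benchmark_adapter if benchmark_adapter in supported_output_types else supported_output_types[0]
    return ADAPTER_REGISTRY[key]


print(resolve_adapter('service', 'bfcl_server', []))            # BFCLAdapter
print(resolve_adapter('checkpoint', 'logits', ['generation']))  # ChatGenerationModelAdapter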
evalscope/models/adapters/bfcl_adapter.py
ADDED
@@ -0,0 +1,244 @@
+import json
+import time
+import uuid
+from typing import Any, List, Optional, Union
+
+from evalscope.utils.logger import get_logger
+from .server_adapter import ServerModelAdapter
+
+logger = get_logger()
+
+
+class BFCLAdapter(ServerModelAdapter):
+    """
+    BFCL model adapter to request remote API model and generate results for BFCL evaluation.
+    Support multi-turn and single-turn function calling tasks.
+    """
+
+    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+        """
+        Args:
+            api_url: The URL of the remote API model.
+            model_id: The ID of the remote API model.
+            api_key: The API key of the remote API model.
+        """
+        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
+
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+        """
+        Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
+        where each list is a follow up turn in the conversation
+        each turn is a List[List[Message]]
+
+        Args:
+            inputs (List[dict]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (List[dict]): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+        results = []
+
+        for input_item in inputs:
+            # This flag decides if we pass tools to the API or try tool calling via prompting
+            # Passing tools to the API means that we rely on the API to manage system prompt specifics
+            # and also expect parsed tool calls in the ChatCompletionMessage object
+            # This is how the is_fc_model=True benchmark is designed to work
+            # On the other hand, we try to manage
+            # tool calling via prompting and parse tool calls in the standard text response
+            # This is how the is_fc_model=False benchmark is designed to work
+            row = input_item.get('messages')
+            is_fc_model = row.get('is_fc_model', False)
+
+            if is_fc_model:
+                response = self.generate_turn_with_tools(row, infer_cfg)
+            else:
+                response = self.generate_turn(row, infer_cfg)
+
+            # wrap response with openai types
+            res_d = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': response,
+                        'role': 'assistant'
+                    }
+                }],
+                'created': time.time(),
+                'model': self.model_id,
+                'object': 'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }
+            results.append(res_d)
+
+        return results
+
+    def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+                                                          MAXIMUM_STEP_LIMIT)
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+        from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+        all_model_responses = []
+        current_messages = []
+        turns = row['turns']
+        for turn_idx, messages in enumerate(turns):
+            n_steps = 0
+            current_responses = []
+            current_messages += messages.copy()
+
+            if str(turn_idx) in row['missing_functions']:
+                assert len(messages) == 0, 'Holdout turn should not have user message.'
+                new_turn = [{
+                    'role':
+                    'user',
+                    'content':
+                    DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                        functions=row['missing_functions'][str(turn_idx)]),
+                }]
+                current_messages += new_turn
+
+            while True:
+                input_item = {
+                    'messages': current_messages,
+                }
+                responses = self.process_single_input(input_item, infer_cfg)
+                result = responses['choices'][0]['message']['content']
+
+                logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+                current_messages.append({
+                    'role': 'assistant',
+                    'content': result,
+                })
+                current_responses.append(result)
+
+                execute_tools = row.get('should_execute_tool_calls', False)
+                if execute_tools:
+                    try:
+                        tool_calls = default_decode_execute_prompting(result)
+                    except Exception:
+                        tool_calls = None
+
+                    if tool_calls is None:
+                        break
+
+                    tool_outputs, _ = execute_multi_turn_func_call(
+                        tool_calls,
+                        initial_config=row['initial_config'],
+                        involved_classes=row['involved_classes'],
+                        model_name='evaluator_loop',
+                        test_entry_id=row['id'],
+                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                        is_evaL_run=False,
+                    )
+                    # Append tool outputs to the current messages
+                    tool_results = []
+                    for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                        tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                    current_messages.append({
+                        'role': 'user',
+                        'content': repr(tool_results),
+                    })
+                else:
+                    break
+
+                n_steps += 1
+                if n_steps > MAXIMUM_STEP_LIMIT:
+                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                    break
+
+            all_model_responses.append(current_responses)
+
+        return all_model_responses
+
+    def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
+        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                                                          MAXIMUM_STEP_LIMIT)
+        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+        from bfcl_eval.model_handler.utils import convert_to_function_call
+
+        all_model_responses = []
+        current_messages = []
+        turns = row['turns']
+        for turn_idx, messages in enumerate(turns):
+            n_steps = 0
+            current_responses = []
+            current_messages += messages.copy()
+            tools = row['tools']
+
+            if str(turn_idx) in row['missing_functions']:
+                assert len(messages) == 0, 'Holdout turn should not have user message.'
+                # inject new functions on the fly
+                new_tools = row['missing_functions'][str(turn_idx)]
+                for new_tool in new_tools:
+                    tools.append({
+                        'type': 'function',
+                        'function': new_tool[0],
+                    })
+                new_turn = [{
+                    'role': 'user',
+                    'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+                }]
+                current_messages += new_turn
+
+            while True:
+                input_item = {
+                    'messages': current_messages,
+                    'tools': tools,
+                }
+                responses = self.process_single_input(input_item, infer_cfg)
+                message = responses['choices'][0]['message']
+
+                current_messages.append(message)
+                if isinstance(message, str):
+                    model_responses = [message]
+                    tool_call_strs = None
+                elif message.get('tool_calls'):
+                    model_responses = [{
+                        tc['function']['name']: tc['function']['arguments']
+                    } for tc in message['tool_calls']]
+                    try:
+                        tool_call_strs = convert_to_function_call(model_responses)
+                    except Exception as e:
+                        logger.error(f'Error converting tool calls to function call strings: {e}')
+                        tool_call_strs = None
+                else:
+                    model_responses = [message['content']]
+                    tool_call_strs = None
+
+                current_responses.extend(model_responses)
+
+                execute_tools = row.get('should_execute_tool_calls', False)
+                if execute_tools and tool_call_strs is not None:
+                    tool_outputs, _ = execute_multi_turn_func_call(
+                        tool_call_strs,
+                        initial_config=row['initial_config'],
+                        involved_classes=row['involved_classes'],
+                        model_name='evaluator_loop',
+                        test_entry_id=row['id'],
+                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                        is_evaL_run=False,
+                    )
+
+                    for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
+                        current_messages.append({
+                            'role': 'tool',
+                            'tool_call_id': tc['id'],
+                            'content': json.dumps({'response': tool_output}),
+                        })
+                else:
+                    break
+
+                n_steps += 1
+                if n_steps > MAXIMUM_STEP_LIMIT:
+                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                    break
+
+            all_model_responses.append(current_responses)
+
+        return all_model_responses
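A hedged usage sketch of the new BFCLAdapter; in practice the BFCL benchmark adapter builds these rows, and the prompt-based path additionally requires the bfcl_eval package. The endpoint, model id, and row fields below are illustrative assumptions, not values taken from this release:

from evalscope.models.adapters import BFCLAdapter

adapter = BFCLAdapter(
    api_url='http://127.0.0.1:8801/v1/chat/completions',  # assumed OpenAI-compatible endpoint
    model_id='my-model',                                   # assumed model id
    api_key='EMPTY',
)

# A minimal single-turn row; real BFCL rows also carry tool and execution metadata.
row = {
    'turns': [[{'role': 'user', 'content': 'List the files in the current directory.'}]],
    'missing_functions': {},
    'is_fc_model': False,                 # prompt-based path -> generate_turn()
    'should_execute_tool_calls': False,   # skip local tool execution in this sketch
}

results = adapter.predict(inputs=[{'messages': row}], infer_cfg={'temperature': 0.0})
print(results[0]['choices'][0]['message']['content'])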
evalscope/models/adapters/server_adapter.py
CHANGED
@@ -1,3 +1,4 @@
+import copy
 import openai
 from collections import defaultdict
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
@@ -61,18 +62,17 @@ class ServerModelAdapter(BaseModelAdapter):

     def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
         """Process a single input item."""
-
-            content = input_item['messages']
-        else:
-            content = self.make_request_content(input_item)
-        request_json = self.make_request(content, infer_cfg)
+        request_json = self.make_request(input_item, infer_cfg)
         response = self.send_request(request_json)
         return response

-    def
+    def make_request_messages(self, input_item: dict) -> list:
         """
-        Make request
+        Make request messages for OpenAI API.
         """
+        if input_item.get('messages', None):
+            return input_item['messages']
+
         data: list = input_item['data']
         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
             query = '\n'.join(''.join(item) for item in data)
@@ -89,18 +89,28 @@

         return messages

-    def make_request(self,
+    def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
         """Make request to remote API."""
+        messages = self.make_request_messages(input_item)
         # Format request JSON according to OpenAI API format
-        request_json = {'model': self.model_id, 'messages':
+        request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}

         if self.timeout:
             request_json['timeout'] = self.timeout

+        request_json['stream'] = self.stream
         if self.stream:
-            request_json['stream'] = self.stream
             request_json['stream_options'] = {'include_usage': True}

+        if input_item.get('tools', None):
+            tools_copy = copy.deepcopy(input_item.get('tools'))
+            # Remove the "responses" from the functions, as that doesn't
+            # need to go to the model
+            for tool in tools_copy:
+                if 'function' in tool and 'response' in tool['function']:
+                    del tool['function']['response']
+            request_json['tools'] = tools_copy
+
         logger.debug(f'Request to remote API: {request_json}')

         return request_json
@@ -135,19 +145,65 @@
         collected_chunks = []
         collected_messages = defaultdict(list)
         collected_reasoning = defaultdict(list)
+        collected_tool_calls = defaultdict(dict)

         for chunk in response_stream:
             collected_chunks.append(chunk)
             for choice in chunk.choices:
+                # Handle reasoning content
                 if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
                     collected_reasoning[choice.index].append(choice.delta.reasoning_content)
+
+                # Handle regular content
                 if choice.delta.content is not None:
                     collected_messages[choice.index].append(choice.delta.content)

+                # Handle tool calls
+                if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
+                    for tool_call in choice.delta.tool_calls:
+                        tool_id = tool_call.index
+
+                        # Initialize tool call if not present
+                        if tool_id not in collected_tool_calls[choice.index]:
+                            collected_tool_calls[choice.index][tool_id] = {
+                                'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
+                                'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
+                                'function': {
+                                    'name': '',
+                                    'arguments': ''
+                                }
+                            }
+
+                        # Update tool call with new chunks
+                        if hasattr(tool_call, 'function'):
+                            if hasattr(tool_call.function, 'name') and tool_call.function.name:
+                                collected_tool_calls[
+                                    choice.index][tool_id]['function']['name'] = tool_call.function.name
+
+                            if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
+                                collected_tool_calls[
+                                    choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments
+
+                        # Update ID if it was received later
+                        if hasattr(tool_call, 'id') and tool_call.id:
+                            collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id
+
+        # Get all unique choice indices from all collections
+        all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(
+            collected_tool_calls.keys())
+
         choices = []
-        for index
-        full_reply_content = ''.join(
-        reasoning = ''.join(collected_reasoning[
+        for index in all_indices:
+            full_reply_content = ''.join(collected_messages.get(index, []))
+            reasoning = ''.join(collected_reasoning.get(index, []))
+
+            # Process tool_calls for this choice if any exists
+            tool_calls_list = None
+            if index in collected_tool_calls and collected_tool_calls[index]:
+                tool_calls_list = list(collected_tool_calls[index].values())
+                # Filter out any tool calls with None id (incomplete tool calls)
+                tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]
+
             # use the finish_reason from the last chunk that generated this choice
             finish_reason = None
             for chunk in reversed(collected_chunks):
@@ -155,11 +211,16 @@
                     finish_reason = chunk.choices[0].finish_reason
                     break

+            message_kwargs = {'role': 'assistant', 'content': full_reply_content}
+
+            if reasoning:
+                message_kwargs['reasoning_content'] = reasoning
+
+            if tool_calls_list:
+                message_kwargs['tool_calls'] = tool_calls_list
+
             choice = Choice(
-                finish_reason=finish_reason or 'stop',
-                index=index,
-                message=ChatCompletionMessage(
-                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
+                finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
             choices.append(choice)

         # build the final completion object
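The streaming changes above accumulate partial tool-call deltas keyed by `tool_call.index` and keep only calls that eventually received an id. A self-contained sketch of that accumulation logic over simplified stand-in deltas (not real OpenAI chunk objects):

from collections import defaultdict

# (index, id, name_fragment, arguments_fragment) per streamed delta
deltas = [
    (0, 'call_1', 'get_weather', '{"city": "Par'),
    (0, None, '', 'is"}'),
]

collected = defaultdict(lambda: {'id': None, 'type': 'function', 'function': {'name': '', 'arguments': ''}})
for index, call_id, name, args in deltas:
    if call_id:
        collected[index]['id'] = call_id          # id may arrive on a later chunk
    if name:
        collected[index]['function']['name'] = name
    if args:
        collected[index]['function']['arguments'] += args  # argument string is stitched together

# drop incomplete tool calls that never received an id
tool_calls = [tc for tc in collected.values() if tc['id'] is not None]
print(tool_calls)
# [{'id': 'call_1', 'type': 'function', 'function': {'name': 'get_weather', 'arguments': '{"city": "Paris"}'}}]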
evalscope/models/custom/custom_model.py
CHANGED
@@ -10,9 +10,6 @@ class CustomModel(ABC):
         self.config = config
         self.kwargs = kwargs

-        if config.get('model_id', None) is None:
-            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')
-
     @abstractmethod
     @torch.no_grad()
     def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]: