evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
--- a/evalscope/metrics/completion_parsers.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# flake8: noqa
-
-import ast
-import re
-
-# from . import utils as ann_utils
-from evalscope.constants import ArenaWinner
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
-one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
-
-
-# modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
-# does not work with batched completions
-def lmsys_parser(completion, output_format):
-    if output_format == '[[rating]]':
-        match = re.search(one_score_pattern, completion)
-        if not match:
-            match = re.search(one_score_pattern_backup, completion)
-
-        if match:
-            rating = ast.literal_eval(match.groups()[0])
-        else:
-            logger.error(f'Content: {completion}\n'
-                         'You must manually fix the score.')
-            rating = -1
-
-        return rating
-    if output_format == '[[rating_a,rating_b]]':
-        try:
-            score_pair = completion.split('\n')[0]
-            score_pair = score_pair.replace(',', ' ')
-            sp = score_pair.split(' ')
-            if len(sp) == 2:
-                score_1 = float(sp[0])
-                score_2 = float(sp[1])
-                if score_1 > score_2:
-                    winner = ArenaWinner.MODEL_A
-                elif score_1 < score_2:
-                    winner = ArenaWinner.MODEL_B
-                else:
-                    if score_1 == score_1 == -1:
-                        winner = ArenaWinner.UNKNOWN
-                    winner = ArenaWinner.TIE
-                return winner, [score_1, score_2]
-            else:
-                raise Exception('Invalid score pair.')
-        except Exception as e:
-            logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
-            return ArenaWinner.UNKNOWN, [-1, -1]
-    elif output_format == '[[A]]':
-        if '[[A]]' in completion:
-            winner = ArenaWinner.MODEL_A
-        elif '[[B]]' in completion:
-            winner = ArenaWinner.MODEL_B
-        elif '[[C]]' in completion:
-            winner = ArenaWinner.TIE
-        else:
-            logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
-            winner = ArenaWinner.UNKNOWN
-        return winner
-
-
-def ranking_parser(completion, **kwargs):
-    try:
-        if isinstance(completion, str):
-            ordered_completions = ast.literal_eval(completion)
-        else:
-            ordered_completions = completion
-
-        rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
-        assert rank in [1, 2]
-
-        return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
-    except Exception as e:
-        logger.error(f'{e}\nContent: {completion}\n'
-                     'You must manually fix the score pair.')
-        return ArenaWinner.UNKNOWN
-
-
-class ResponseParser:
-
-    @staticmethod
-    def parse_first_capital(text: str, options: list[str]) -> str:
-        for t in text:
-            if t.isupper() and (t in options):
-                return t
-        return ''
-
-    @staticmethod
-    def parse_last_capital(text: str, options: list[str]) -> str:
-        for t in text[::-1]:
-            if t.isupper() and (t in options):
-                return t
-        return ''
-
-    @staticmethod
-    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
-        """
-        Find first valid option for text.
-
-        Args:
-            text: The text to parse.
-            options: The options to find. e.g. ['A', 'B', 'C', 'D']
-        """
-        options_concat = ResponseParser.process_options(options)
-
-        patterns = [
-            rf'答案是?\s?([{options_concat}])',
-            rf'答案是?\s?:([{options_concat}])',
-            rf'答案是?\s?:([{options_concat}])',
-            rf'答案应该?是\s?([{options_concat}])',
-            rf'答案应该?选\s?([{options_concat}])',
-            rf'答案为\s?([{options_concat}])',
-            rf'答案选\s?([{options_concat}])',
-            rf'选择?\s?([{options_concat}])',
-            rf'故选?\s?([{options_concat}])'
-            rf'只有选?项?\s?([{options_concat}])\s?是?对',
-            rf'只有选?项?\s?([{options_concat}])\s?是?错',
-            rf'只有选?项?\s?([{options_concat}])\s?不?正确',
-            rf'只有选?项?\s?([{options_concat}])\s?错误',
-            rf'说法不?对选?项?的?是\s?([{options_concat}])',
-            rf'说法不?正确选?项?的?是\s?([{options_concat}])',
-            rf'说法错误选?项?的?是\s?([{options_concat}])',
-            rf'([{options_concat}])\s?是正确的',
-            rf'([{options_concat}])\s?是正确答案',
-            rf'选项\s?([{options_concat}])\s?正确',
-            rf'所以答\s?([{options_concat}])',
-            rf'所以\s?([{options_concat}][.。$]?$)',
-            rf'所有\s?([{options_concat}][.。$]?$)',
-            rf'[\s,::,]([{options_concat}])[。,,\.]?$',
-            rf'[\s,,::][故即]([{options_concat}])[。\.]?$',
-            rf'[\s,,::]因此([{options_concat}])[。\.]?$',
-            rf'[是为。]\s?([{options_concat}])[。\.]?$',
-            rf'因此\s?([{options_concat}])[。\.]?$',
-            rf'显然\s?([{options_concat}])[。\.]?$',
-            rf'答案是\s?(\S+)(?:。|$)',
-            rf'答案应该是\s?(\S+)(?:。|$)',
-            rf'答案为\s?(\S+)(?:。|$)',
-            rf'答案是(.*?)[{options_concat}]',
-            rf'答案为(.*?)[{options_concat}]',
-            rf'固选(.*?)[{options_concat}]',
-            rf'答案应该是(.*?)[{options_concat}]',
-            rf'[Tt]he answer is \(?[{options_concat}]\)?',
-            rf'[Tt]he correct answer is [{options_concat}]',
-            rf'[Tt]he correct answer is:\n[{options_concat}]',
-            rf'(\s|^)[{options_concat}][\s。,,\.$]', # noqa
-            rf'^选项\s?([{options_concat}])',
-            rf'^([{options_concat}])\s?选?项',
-            rf'(\s|^)[{options_concat}][\s。,,::\.$]',
-            rf'(\s|^)[{options_concat}](\s|$)',
-            rf'[{options_concat}]',
-        ]
-
-        regexes = [re.compile(pattern) for pattern in patterns]
-        for regex in regexes:
-            match = regex.search(text)
-            if match:
-                outputs = match.group(0)
-                for i in options:
-                    if i in outputs:
-                        return i
-        # If no match found, try to find the last capital letter in the text
-        last_capital = ResponseParser.parse_last_capital(text, options)
-        if last_capital:
-            return last_capital
-        return 'No valid option found'
-
-    @staticmethod
-    def parse_first_option(text: str, options: list[str]) -> str:
-        """
-        Find first valid option for text.
-
-        Args:
-            text: The text to parse.
-        """
-        options_pattern = ResponseParser.process_options(options)
-
-        patterns = [
-            rf'[Aa]nswer:\s*({options_pattern})',
-            rf'ANSWER:\s*({options_pattern})',
-            rf'answer is \(?({options_pattern})\)?',
-            rf'[Tt]he correct answer is:\s*({options_pattern})',
-            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
-            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
-            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
-            rf'[Tt]he answer is \s*({options_pattern})',
-        ]
-
-        regexes = [re.compile(pattern) for pattern in patterns]
-        for regex in regexes:
-            matches = regex.search(text)
-            if matches:
-                return matches.group(1)
-        # If no match found, try to find the last capital letter in the text
-        last_capital = ResponseParser.parse_last_capital(text, options)
-        if last_capital:
-            return last_capital
-        return 'No valid option found'
-
-    @staticmethod
-    def parse_bracketed_answer(text: str, options: list[str]) -> str:
-        options = ResponseParser.process_options(options)
-        # Match the first occurrence of the options in angle brackets
-        match = re.search(rf'<({options})>', text)
-        if match:
-            return match.group(1)
-        return 'No valid option found'
-
-    @staticmethod
-    def process_options(options: list[str]) -> str:
-        # Escape each option to ensure special characters in options are treated literally
-        escaped_options = [re.escape(option) for option in options]
-        # Join options into a regex pattern separated by '|', to match any of the options
-        options_pattern = '|'.join(escaped_options)
-        return options_pattern
-
-
-if __name__ == '__main__':
-    result = '**Answer: A **Answer: C**'
-    options = ['A', 'B', 'C', 'D']
-    parsed_result = ResponseParser.parse_first_option(result, options)
-    print(f'Parsed result: {parsed_result}') # Should print 'C'
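The bulk of the removed module above is the `ResponseParser` helper that the 0.17.x multiple-choice adapters used to pull an option letter out of free-form model output (answer extraction appears to move into the new `evalscope/filters/extraction.py` and `evalscope/utils/multi_choices.py` listed above). A minimal usage sketch against the removed module, with illustrative inputs:

```python
# Usage sketch for the removed ResponseParser (assumes evalscope 0.17.x is installed);
# the inputs are illustrative, not taken from any benchmark.
from evalscope.metrics.completion_parsers import ResponseParser

options = ['A', 'B', 'C', 'D']

# English completions: the 'Answer:' / 'answer is (X)' regexes are tried in order.
ResponseParser.parse_first_option('The correct answer is: B', options)         # -> 'B'

# Chinese completions go through the larger pattern list in parse_first_option_with_choices.
ResponseParser.parse_first_option_with_choices('综上所述,答案是C。', options)    # -> 'C'

# If no pattern matches, both parsers fall back to the last capital letter found in the text,
# and finally to the literal string 'No valid option found'.
ResponseParser.parse_first_option('no option letters here', options)           # -> 'No valid option found'
```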
--- a/evalscope/metrics/named_metrics.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from dataclasses import dataclass, field
-from functools import partial
-from typing import Callable, Dict
-
-from evalscope.metrics.metrics import mean, pass_at_k, weighted_mean
-from evalscope.metrics.t2v_metrics import (blip2_score, clip_flant5_score, clip_score, fga_blip2_score, hpsv2_1_score,
-                                           hpsv2_score, image_reward_score, mps_score, pick_score)
-
-
-@dataclass
-class Metric:
-    name: str = 'default_metric'
-    object: Callable = field(default_factory=lambda: mean)
-
-
-class MetricRegistry:
-
-    def __init__(self):
-        self.metrics: Dict[str, Metric] = {}
-
-    def register(self, metric: Metric):
-        self.metrics[metric.name] = metric
-
-    def get(self, name: str) -> Metric:
-        try:
-            return self.metrics[name]
-        except KeyError:
-            raise KeyError(f'Metric {name} not found in the registry. Available metrics: {self.list_metrics()}')
-
-    def list_metrics(self):
-        return list(self.metrics.keys())
-
-
-metric_registry = MetricRegistry()
-
-# Register metrics
-metric_registry.register(Metric(name='AverageAccuracy', object=mean))
-metric_registry.register(Metric(name='WeightedAverageAccuracy', object=weighted_mean))
-metric_registry.register(Metric(name='AverageBLEU', object=mean))
-metric_registry.register(Metric(name='AverageRouge', object=mean))
-metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
-metric_registry.register(Metric(name='AveragePass@1', object=mean))
-for k in range(1, 17):
-    metric_registry.register(Metric(name=f'Pass@{k}', object=partial(pass_at_k, k=k)))
-
-# t2v_metrics
-metric_registry.register(Metric(name='VQAScore', object=clip_flant5_score))
-metric_registry.register(Metric(name='PickScore', object=pick_score))
-metric_registry.register(Metric(name='CLIPScore', object=clip_score))
-metric_registry.register(Metric(name='BLIPv2Score', object=blip2_score))
-metric_registry.register(Metric(name='HPSv2Score', object=hpsv2_score))
-metric_registry.register(Metric(name='HPSv2.1Score', object=hpsv2_1_score))
-metric_registry.register(Metric(name='ImageRewardScore', object=image_reward_score))
-metric_registry.register(Metric(name='FGA_BLIP2Score', object=fga_blip2_score))
-metric_registry.register(Metric(name='MPS', object=mps_score))
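The registry above was the 0.17.x mapping from a metric name string to an aggregation callable (apparently superseded by the new `evalscope/metrics/metric.py` and `evalscope/api/metric/` modules in the file list). A short lookup sketch, assuming evalscope 0.17.x and its t2v extras are installed:

```python
# Lookup sketch for the removed registry (assumes evalscope 0.17.x with its t2v dependencies installed).
from evalscope.metrics.named_metrics import Metric, metric_registry

acc = metric_registry.get('AverageAccuracy')   # Metric(name='AverageAccuracy', object=mean)
acc.object([1, 0, 1, 1])                       # aggregate per-sample scores with the bound callable

metric_registry.get('Pass@4')                  # partial(pass_at_k, k=4), registered in the loop above
metric_registry.list_metrics()                 # all registered names, e.g. 'AverageBLEU', 'CLIPScore', ...

# Unknown names raise a KeyError that lists the available metrics:
# metric_registry.get('NotARealMetric')

# Custom metrics were added the same way the built-ins are registered above.
metric_registry.register(Metric(name='MyMetric', object=max))
```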
--- a/evalscope/models/adapters/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from .base_adapter import BaseModelAdapter, initialize_model_adapter
-from .bfcl_adapter import BFCLAdapter
-from .chat_adapter import ChatGenerationModelAdapter
-from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdapter
-from .custom_adapter import CustomModelAdapter
-from .server_adapter import ServerModelAdapter
-from .t2i_adapter import T2IModelAdapter
-from .tau_bench_adapter import TauBenchAdapter
-
-__all__ = [
-    'initialize_model_adapter', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'ContinuationLogitsModelAdapter',
-    'MultiChoiceModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', 'BFCLAdapter', 'T2IModelAdapter',
-    'TauBenchAdapter'
-]
--- a/evalscope/models/adapters/base_adapter.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import torch
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, List, Optional, Union
-
-from evalscope.constants import EvalType, OutputType
-from evalscope.utils.logger import get_logger
-from ..custom import CustomModel
-from ..local_model import LocalModel
-
-logger = get_logger()
-
-if TYPE_CHECKING:
-    from evalscope.benchmarks import DataAdapter
-    from evalscope.config import TaskConfig
-
-
-class BaseModelAdapter(ABC):
-
-    def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
-        if model is None:
-            self.model_cfg = kwargs.get('model_cfg', None)
-        elif isinstance(model, LocalModel):
-            self.model = model.model
-            self.model_id = model.model_id
-            self.model_revision = model.model_revision
-            self.device = model.device
-            self.tokenizer = model.tokenizer
-            self.model_cfg = model.model_cfg
-        elif isinstance(model, CustomModel):
-            self.model_cfg = model.config
-        else:
-            raise ValueError(f'Unsupported model type: {type(model)}')
-
-    @abstractmethod
-    @torch.no_grad()
-    def predict(self, *args, **kwargs) -> Any:
-        raise NotImplementedError
-
-
-def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', base_model: 'LocalModel'):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        from ..register import get_model_adapter
-
-        # we need to determine the model adapter class based on the output type
-        model_adapter_cls_str = benchmark.model_adapter
-
-        if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
-
-            if 'server' not in model_adapter_cls_str:
-                logger.warning(f'Output type {model_adapter_cls_str} is not supported for service evaluation. '
-                               f'Using server model adapter instead.')
-                model_adapter_cls_str = 'server'
-                benchmark.model_adapter = model_adapter_cls_str
-
-            # init server model adapter
-            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
-
-            return model_adapter_cls(
-                api_url=task_cfg.api_url,
-                model_id=task_cfg.model,
-                api_key=task_cfg.api_key,
-                seed=task_cfg.seed,
-                timeout=task_cfg.timeout,
-                stream=task_cfg.stream,
-            )
-        else:
-            if model_adapter_cls_str not in benchmark.output_types:
-                logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
-                               f'Using {benchmark.output_types[0]} instead.')
-                model_adapter_cls_str = benchmark.output_types[0]
-                benchmark.model_adapter = model_adapter_cls_str
-
-            model_adapter_cls = get_model_adapter(model_adapter_cls_str)
-            return model_adapter_cls(
-                model=base_model,
-                generation_config=task_cfg.generation_config,
-                chat_template=task_cfg.chat_template,
-                task_cfg=task_cfg)
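Every concrete 0.17.x adapter (chat, choice, server, t2i, ...) followed the contract above: wrap a `LocalModel`/`CustomModel` (or `None` plus a `model_cfg`) and implement `predict`. The toy subclass below is hypothetical and only illustrates the removed interface; it assumes evalscope 0.17.x is installed:

```python
# Hypothetical adapter against the removed 0.17.x base class; EchoAdapter is not part of evalscope.
from typing import Any, List, Optional

from evalscope.models.adapters import BaseModelAdapter


class EchoAdapter(BaseModelAdapter):
    """Toy adapter that echoes its inputs, illustrating the predict() contract."""

    def __init__(self, **kwargs):
        # Passing model=None keeps only the model_cfg from kwargs (see __init__ above).
        super().__init__(model=None, **kwargs)

    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[Any]:
        # Real adapters returned OpenAI-style chat.completion dicts, one per input item.
        return [{'choices': [{'message': {'role': 'assistant', 'content': str(item)}}]} for item in inputs]


adapter = EchoAdapter(model_cfg={'model_id': 'echo'})
adapter.predict([{'data': 'hello'}])
```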
--- a/evalscope/models/adapters/bfcl_adapter.py
+++ /dev/null
@@ -1,246 +0,0 @@
-import json
-import time
-import uuid
-from typing import Any, List, Optional, Union
-
-from evalscope.utils.logger import get_logger
-from ..register import register_model_adapter
-from .server_adapter import ServerModelAdapter
-
-logger = get_logger()
-
-
-@register_model_adapter(name='bfcl_server')
-class BFCLAdapter(ServerModelAdapter):
-    """
-    BFCL model adapter to request remote API model and generate results for BFCL evaluation.
-    Support multi-turn and single-turn function calling tasks.
-    """
-
-    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
-        """
-        Args:
-            api_url: The URL of the remote API model.
-            model_id: The ID of the remote API model.
-            api_key: The API key of the remote API model.
-        """
-        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
-
-    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
-        """
-        Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
-        where each list is a follow up turn in the conversation
-        each turn is a List[List[Message]]
-
-        Args:
-            inputs (List[dict]): The input data.
-            infer_cfg (dict): Inference configuration.
-
-        Returns:
-            res (List[dict]): The model prediction results.
-        """
-        infer_cfg = infer_cfg or {}
-        results = []
-
-        for input_item in inputs:
-            # This flag decides if we pass tools to the API or try tool calling via prompting
-            # Passing tools to the API means that we rely on the API to manage system prompt specifics
-            # and also expect parsed tool calls in the ChatCompletionMessage object
-            # This is how the is_fc_model=True benchmark is designed to work
-            # On the other hand, we try to manage
-            # tool calling via prompting and parse tool calls in the standard text response
-            # This is how the is_fc_model=False benchmark is designed to work
-            row = input_item.get('messages')
-            is_fc_model = row.get('is_fc_model', False)
-
-            if is_fc_model:
-                response = self.generate_turn_with_tools(row, infer_cfg)
-            else:
-                response = self.generate_turn(row, infer_cfg)
-
-            # wrap response with openai types
-            res_d = {
-                'choices': [{
-                    'index': 0,
-                    'message': {
-                        'content': response,
-                        'role': 'assistant'
-                    }
-                }],
-                'created': time.time(),
-                'model': self.model_id,
-                'object': 'chat.completion',
-                'usage': {
-                    'completion_tokens': 0,
-                    'prompt_tokens': 0,
-                    'total_tokens': 0
-                }
-            }
-            results.append(res_d)
-
-        return results
-
-    def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
-        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
-                                                         MAXIMUM_STEP_LIMIT)
-        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
-        from bfcl_eval.model_handler.utils import default_decode_execute_prompting
-
-        all_model_responses = []
-        current_messages = []
-        turns = row['turns']
-        for turn_idx, messages in enumerate(turns):
-            n_steps = 0
-            current_responses = []
-            current_messages += messages.copy()
-
-            if str(turn_idx) in row['missing_functions']:
-                assert len(messages) == 0, 'Holdout turn should not have user message.'
-                new_turn = [{
-                    'role':
-                    'user',
-                    'content':
-                    DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
-                        functions=row['missing_functions'][str(turn_idx)]),
-                }]
-                current_messages += new_turn
-
-            while True:
-                input_item = {
-                    'messages': current_messages,
-                }
-                responses = self.process_single_input(input_item, infer_cfg)
-                result = responses['choices'][0]['message']['content']
-
-                logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
-                current_messages.append({
-                    'role': 'assistant',
-                    'content': result,
-                })
-                current_responses.append(result)
-
-                execute_tools = row.get('should_execute_tool_calls', False)
-                if execute_tools:
-                    try:
-                        tool_calls = default_decode_execute_prompting(result)
-                    except Exception:
-                        tool_calls = None
-
-                    if tool_calls is None:
-                        break
-
-                    tool_outputs, _ = execute_multi_turn_func_call(
-                        tool_calls,
-                        initial_config=row['initial_config'],
-                        involved_classes=row['involved_classes'],
-                        model_name='evaluator_loop',
-                        test_entry_id=row['id'],
-                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
-                        is_evaL_run=False,
-                    )
-                    # Append tool outputs to the current messages
-                    tool_results = []
-                    for tool_output, tool_call in zip(tool_outputs, tool_calls):
-                        tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
-                    current_messages.append({
-                        'role': 'user',
-                        'content': repr(tool_results),
-                    })
-                else:
-                    break
-
-                n_steps += 1
-                if n_steps > MAXIMUM_STEP_LIMIT:
-                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
-                    break
-
-            all_model_responses.append(current_responses)
-
-        return all_model_responses
-
-    def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
-        from bfcl_eval.constants.default_prompts import (DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
-                                                         MAXIMUM_STEP_LIMIT)
-        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
-        from bfcl_eval.model_handler.utils import convert_to_function_call
-
-        all_model_responses = []
-        current_messages = []
-        turns = row['turns']
-        for turn_idx, messages in enumerate(turns):
-            n_steps = 0
-            current_responses = []
-            current_messages += messages.copy()
-            tools = row['tools']
-
-            if str(turn_idx) in row['missing_functions']:
-                assert len(messages) == 0, 'Holdout turn should not have user message.'
-                # inject new functions on the fly
-                new_tools = row['missing_functions'][str(turn_idx)]
-                for new_tool in new_tools:
-                    tools.append({
-                        'type': 'function',
-                        'function': new_tool[0],
-                    })
-                new_turn = [{
-                    'role': 'user',
-                    'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
-                }]
-                current_messages += new_turn
-
-            while True:
-                input_item = {
-                    'messages': current_messages,
-                    'tools': tools,
-                }
-                responses = self.process_single_input(input_item, infer_cfg)
-                message = responses['choices'][0]['message']
-
-                current_messages.append(message)
-                if isinstance(message, str):
-                    model_responses = [message]
-                    tool_call_strs = None
-                elif message.get('tool_calls'):
-                    model_responses = [{
-                        tc['function']['name']: tc['function']['arguments']
-                    } for tc in message['tool_calls']]
-                    try:
-                        tool_call_strs = convert_to_function_call(model_responses)
-                    except Exception as e:
-                        logger.error(f'Error converting tool calls to function call strings: {e}')
-                        tool_call_strs = None
-                else:
-                    model_responses = [message['content']]
-                    tool_call_strs = None
-
-                current_responses.extend(model_responses)
-
-                execute_tools = row.get('should_execute_tool_calls', False)
-                if execute_tools and tool_call_strs is not None:
-                    tool_outputs, _ = execute_multi_turn_func_call(
-                        tool_call_strs,
-                        initial_config=row['initial_config'],
-                        involved_classes=row['involved_classes'],
-                        model_name='evaluator_loop',
-                        test_entry_id=row['id'],
-                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
-                        is_evaL_run=False,
-                    )
-
-                    for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
-                        current_messages.append({
-                            'role': 'tool',
-                            'tool_call_id': tc['id'],
-                            'content': json.dumps({'response': tool_output}),
-                        })
-                else:
-                    break
-
-                n_steps += 1
-                if n_steps > MAXIMUM_STEP_LIMIT:
-                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
-                    break
-
-            all_model_responses.append(current_responses)
-
-        return all_model_responses
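For orientation, the removed adapter received each BFCL sample with the whole row tucked under the `messages` key; the field names in the sketch below are the ones read by the code above, while the values are illustrative placeholders:

```python
# Illustrative shape of one BFCL sample consumed by the removed adapter; values are placeholders.
sample = {
    'messages': {                       # the full BFCL row travels under the 'messages' key
        'id': 'multi_turn_base_0',
        'is_fc_model': True,            # True -> generate_turn_with_tools, False -> generate_turn
        'turns': [                      # one list of user messages per conversation turn
            [{'role': 'user', 'content': 'Book a flight to Beijing.'}],
            [],                         # empty turn: its functions arrive via missing_functions
        ],
        'tools': [{'type': 'function', 'function': {'name': 'book_flight', 'parameters': {}}}],
        'missing_functions': {'1': [[{'name': 'cancel_flight', 'parameters': {}}]]},
        'should_execute_tool_calls': True,
        'initial_config': {},
        'involved_classes': [],
        'test_category': 'multi_turn_base',
    }
}

# adapter = BFCLAdapter(api_url='http://127.0.0.1:8000/v1/chat/completions', model_id='my-model')
# adapter.predict([sample])  # -> one chat.completion dict whose content is the per-turn response lists
```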