evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/models/adapters/server_adapter.py (deleted)
@@ -1,236 +0,0 @@
import copy
import openai
from collections import defaultdict
from openai.types.chat import ChatCompletion, ChatCompletionChunk
from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
from typing import List, Optional, Union

from evalscope.utils.argument_utils import get_supported_params
from evalscope.utils.logger import get_logger
from ..register import register_model_adapter
from .base_adapter import BaseModelAdapter

logger = get_logger()


@register_model_adapter(name='server')
class ServerModelAdapter(BaseModelAdapter):
    """
    Server model adapter to request remote API model and generate results.
    """

    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
        """
        Args:
            api_url: The URL of the remote API model.
            model_id: The ID of the remote API model.
            api_key: The API key of the remote API model.
        """
        self.api_url = api_url.rstrip('/').rsplit('/chat/completions', 1)[0]
        self.model_id = model_id
        self.api_key = api_key

        self.client = openai.OpenAI(
            api_key=self.api_key,
            base_url=self.api_url,
        )
        self.supported_params = get_supported_params(self.client.chat.completions.create)

        self.seed = kwargs.get('seed', None)
        self.timeout = kwargs.get('timeout', 60)
        self.stream = kwargs.get('stream', False)
        self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
        super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
        """
        Model prediction func.

        Args:
            inputs (List[dict]): The input data.
            infer_cfg (dict): Inference configuration.

        Returns:
            res (List[dict]): The model prediction results.
        """
        infer_cfg = infer_cfg or {}
        results = []

        for input_item in inputs:
            response = self.process_single_input(input_item, infer_cfg)
            results.append(response)

        return results

    def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
        """Process a single input item."""
        request_json = self.make_request(input_item, infer_cfg)
        response = self.send_request(request_json)
        return response

    def make_request_messages(self, input_item: dict) -> list:
        """
        Make request messages for OpenAI API.
        """
        if input_item.get('messages', None):
            return input_item['messages']

        data: list = input_item['data']
        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
            query = '\n'.join(''.join(item) for item in data)
            system_prompt = input_item.get('system_prompt', None)
        else:
            query = data[0]
            system_prompt = input_item.get('system_prompt', None)

        messages = []
        if system_prompt:
            messages.append({'role': 'system', 'content': system_prompt})

        messages.append({'role': 'user', 'content': query})

        return messages

    def make_request(self, input_item: dict, infer_cfg: dict) -> dict:
        """Make request to remote API."""
        messages = self.make_request_messages(input_item)
        # Format request JSON according to OpenAI API format
        request_json = {'model': self.model_id, 'messages': messages, **infer_cfg}

        if self.timeout:
            request_json['timeout'] = self.timeout

        request_json['stream'] = self.stream
        if self.stream:
            request_json['stream_options'] = {'include_usage': True}

        if input_item.get('tools', None):
            tools_copy = copy.deepcopy(input_item.get('tools'))
            # Remove the "responses" from the functions, as that doesn't
            # need to go to the model
            for tool in tools_copy:
                if 'function' in tool and 'response' in tool['function']:
                    del tool['function']['response']
            request_json['tools'] = tools_copy

        logger.debug(f'Request to remote API: {request_json}')

        return request_json

    def _parse_extra_params(self, request_json):
        api_params = {}
        extra_body = {}
        for key, value in request_json.items():
            if key in self.supported_params:
                api_params[key] = value
            else:
                extra_body[key] = value

        if extra_body:
            api_params['extra_body'] = extra_body
        return api_params

    def send_request(self, request_json: dict) -> dict:
        try:
            parsed_request = self._parse_extra_params(request_json)
            response = self.client.chat.completions.create(**parsed_request)

            if response and self.stream:
                response = self._collect_stream_response(response)

            return response.model_dump(exclude_unset=True)
        except Exception as e:
            logger.error(f'Error when calling remote API: {str(e)}')
            raise e

    def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
        collected_chunks = []
        collected_messages = defaultdict(list)
        collected_reasoning = defaultdict(list)
        collected_tool_calls = defaultdict(dict)

        for chunk in response_stream:
            collected_chunks.append(chunk)
            for choice in chunk.choices:
                # Handle reasoning content
                if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
                    collected_reasoning[choice.index].append(choice.delta.reasoning_content)

                # Handle regular content
                if choice.delta.content is not None:
                    collected_messages[choice.index].append(choice.delta.content)

                # Handle tool calls
                if hasattr(choice.delta, 'tool_calls') and choice.delta.tool_calls:
                    for tool_call in choice.delta.tool_calls:
                        tool_id = tool_call.index

                        # Initialize tool call if not present
                        if tool_id not in collected_tool_calls[choice.index]:
                            collected_tool_calls[choice.index][tool_id] = {
                                'id': tool_call.id if hasattr(tool_call, 'id') and tool_call.id else None,
                                'type': tool_call.type if hasattr(tool_call, 'type') and tool_call.type else None,
                                'function': {
                                    'name': '',
                                    'arguments': ''
                                }
                            }

                        # Update tool call with new chunks
                        if hasattr(tool_call, 'function'):
                            if hasattr(tool_call.function, 'name') and tool_call.function.name:
                                collected_tool_calls[choice.index][tool_id]['function']['name'] = tool_call.function.name

                            if hasattr(tool_call.function, 'arguments') and tool_call.function.arguments:
                                collected_tool_calls[choice.index][tool_id]['function']['arguments'] += tool_call.function.arguments

                        # Update ID if it was received later
                        if hasattr(tool_call, 'id') and tool_call.id:
                            collected_tool_calls[choice.index][tool_id]['id'] = tool_call.id

        # Get all unique choice indices from all collections
        all_indices = set(collected_messages.keys()) | set(collected_reasoning.keys()) | set(collected_tool_calls.keys())

        choices = []
        for index in all_indices:
            full_reply_content = ''.join(collected_messages.get(index, []))
            reasoning = ''.join(collected_reasoning.get(index, []))

            # Process tool_calls for this choice if any exists
            tool_calls_list = None
            if index in collected_tool_calls and collected_tool_calls[index]:
                tool_calls_list = list(collected_tool_calls[index].values())
                # Filter out any tool calls with None id (incomplete tool calls)
                tool_calls_list = [tc for tc in tool_calls_list if tc['id'] is not None]

            # use the finish_reason from the last chunk that generated this choice
            finish_reason = None
            for chunk in reversed(collected_chunks):
                if chunk.choices and chunk.choices[0].index == index:
                    finish_reason = chunk.choices[0].finish_reason
                    break

            message_kwargs = {'role': 'assistant', 'content': full_reply_content}

            if reasoning:
                message_kwargs['reasoning_content'] = reasoning

            if tool_calls_list:
                message_kwargs['tool_calls'] = tool_calls_list

            choice = Choice(
                finish_reason=finish_reason or 'stop', index=index, message=ChatCompletionMessage(**message_kwargs))
            choices.append(choice)

        # build the final completion object
        return ChatCompletion(
            id=collected_chunks[0].id,
            choices=choices,
            created=collected_chunks[0].created,
            model=collected_chunks[0].model,
            object='chat.completion',
            usage=collected_chunks[-1].usage  # use the usage from the last chunk
        )
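For context, a minimal sketch of how this removed adapter was typically driven: the endpoint URL, model name, prompt, and generation settings below are illustrative placeholders, not values taken from the diff.

# Hypothetical usage of the removed ServerModelAdapter against an
# OpenAI-compatible endpoint; all concrete values here are placeholders.
adapter = ServerModelAdapter(
    api_url='http://127.0.0.1:8000/v1/chat/completions',
    model_id='my-model',
    api_key='EMPTY',
    stream=False,
)
responses = adapter.predict(
    inputs=[{'data': ['What is 1 + 1?'], 'system_prompt': 'You are a helpful assistant.'}],
    infer_cfg={'temperature': 0.0, 'max_tokens': 64},
)
# Each entry is an OpenAI-style chat completion dict.
print(responses[0]['choices'][0]['message']['content'])

In 1.0.0 this adapter is gone; the file list above adds evalscope/models/openai_compatible.py and the evalscope.api.model package, which appear to take over this role.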
evalscope/models/adapters/t2i_adapter.py (deleted)
@@ -1,79 +0,0 @@
import os
import time
import torch
from typing import Any, Dict, List, Optional, Tuple, Union

from evalscope.constants import OutputType
from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
from evalscope.utils.io_utils import OutputsStructure
from evalscope.utils.logger import get_logger
from ..local_model import LocalModel
from ..register import register_model_adapter
from .base_adapter import BaseModelAdapter

logger = get_logger()


@register_model_adapter(name=OutputType.IMAGE_GENERATION)
class T2IModelAdapter(BaseModelAdapter):
    """
    Text to image model adapter.
    """

    def __init__(self, model: LocalModel, **kwargs):
        super().__init__(model)

        self.task_config = kwargs.get('task_cfg', None)
        assert self.task_config is not None, 'Task config is required for T2I model adapter.'

        self.save_path = os.path.join(self.task_config.work_dir, OutputsStructure.PREDICTIONS_DIR,
                                      self.task_config.model_id, 'images')
        os.makedirs(self.save_path, exist_ok=True)

    def _model_generate(self, prompt, infer_cfg=None) -> List:
        """
        Generate images from the model.
        Args:
            prompt: The input prompt.
            infer_cfg: The inference configuration.
        Returns:
            The generated images.
        """
        infer_cfg = infer_cfg or {}

        sample = self.model(prompt=prompt, **infer_cfg).images
        return sample

    @torch.no_grad()
    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
        """
        Args:
            inputs: The input data.
            infer_cfg: The inference configuration.
        Returns:
            The prediction results.
        """
        results = []
        for input_item in inputs:
            prompt = input_item['data'][0]
            image_id = input_item.get('id') or input_item.get('index')

            samples = self._model_generate(prompt, infer_cfg)

            choices_list = []
            for index, sample in enumerate(samples):
                image_file_path = os.path.join(self.save_path, f'{image_id}_{index}.jpeg')
                sample.save(image_file_path)
                logger.debug(f'Saved image to {image_file_path}')

                choice = ChatCompletionResponseChoice(
                    index=index, message=ChatMessage(content=image_file_path, role='assistant'), finish_reason='stop')
                choices_list.append(choice)

            res_d = ChatCompletionResponse(
                model=self.model_id, choices=choices_list, object='images.generations',
                created=int(time.time())).model_dump(exclude_unset=True)

            results.append(res_d)

        return results
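For reference, each entry returned by the removed T2IModelAdapter.predict is an OpenAI-style completion whose message content is the path of the image saved under the run's predictions directory; a rough sketch of the shape, with all field values illustrative:

# Illustrative only: approximate shape of one T2IModelAdapter.predict result.
example_result = {
    'model': 'my-t2i-model',            # placeholder model_id
    'object': 'images.generations',
    'created': 1700000000,              # placeholder timestamp
    'choices': [{
        'index': 0,
        'finish_reason': 'stop',
        'message': {
            'role': 'assistant',
            # placeholder path following the f'{image_id}_{index}.jpeg' pattern above
            'content': '<work_dir>/<predictions_dir>/my-t2i-model/images/0_0.jpeg',
        },
    }],
}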
evalscope/models/adapters/tau_bench_adapter.py (deleted)
@@ -1,189 +0,0 @@
import json
import time
from typing import Any, Dict, List, Optional, Union

from evalscope.utils.logger import get_logger
from ..register import register_model_adapter
from .server_adapter import ServerModelAdapter

logger = get_logger()


@register_model_adapter(name='tau_bench_server')
class TauBenchAdapter(ServerModelAdapter):
    """
    TauBench model adapter to request remote API model and generate results for TauBench evaluation.
    Support multi-turn and single-turn function calling tasks.
    """

    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
        """
        Args:
            api_url: The URL of the remote API model.
            model_id: The ID of the remote API model.
            api_key: The API key of the remote API model.
        """
        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)

        self._patch_agent_solve()

    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
        """
        Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
        where each list is a follow up turn in the conversation
        each turn is a List[List[Message]]

        Args:
            inputs (List[dict]): The input data.
            infer_cfg (dict): Inference configuration.

        Returns:
            res (List[dict]): The model prediction results.
        """
        infer_cfg = infer_cfg or {}
        results = []

        for input_item in inputs:
            raw_input = input_item.get('raw_input')

            res_d = self.solve(env_name=raw_input['env_name'], task_index=raw_input['task_index'], infer_cfg=infer_cfg)

            wrapper_res = {
                'choices': [{
                    'index': 0,
                    'message': {
                        'content': json.dumps(res_d, ensure_ascii=False),
                        'role': 'assistant'
                    }
                }],
                'created': time.time(),
                'model': self.model_id,
                'object': 'chat.completion',
                'usage': {
                    'completion_tokens': 0,
                    'prompt_tokens': 0,
                    'total_tokens': 0
                }
            }

            results.append(wrapper_res)

        return results

    def _patch_agent_solve(self):
        """Patch ToolCallingAgent.solve method to use custom model configuration"""
        from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
        from tau_bench.envs.base import Env
        from tau_bench.types import RESPOND_ACTION_NAME, SolveResult
        from typing import List, Optional

        def patched_solve(self,
                          env: Env,
                          task_index: Optional[int] = None,
                          max_num_steps: int = 30,
                          infer_cfg: Optional[dict] = {}) -> SolveResult:
            env_reset_res = env.reset(task_index=task_index)
            obs = env_reset_res.observation
            info = env_reset_res.info.model_dump()
            reward = 0.0
            messages: List[Dict[str, Any]] = [
                {
                    'role': 'system',
                    'content': self.wiki
                },
                {
                    'role': 'user',
                    'content': obs
                },
            ]

            for step_index in range(max_num_steps):
                # Use adapter's model configuration instead of agent's
                request_json = adapter_instance.make_request(
                    input_item={
                        'messages': messages,
                        'tools': self.tools_info
                    }, infer_cfg=infer_cfg)
                res = adapter_instance.send_request(request_json)

                next_message = res['choices'][0]['message']
                action = message_to_action(next_message)
                env_response = env.step(action)
                reward = env_response.reward
                info = {**info, **env_response.info.model_dump()}

                if action.name != RESPOND_ACTION_NAME:
                    next_message['tool_calls'] = next_message['tool_calls'][:1]
                    messages.extend([
                        next_message,
                        {
                            'role': 'tool',
                            'tool_call_id': next_message['tool_calls'][0]['id'],
                            'name': next_message['tool_calls'][0]['function']['name'],
                            'content': env_response.observation,
                        },
                    ])
                else:
                    messages.extend([
                        next_message,
                        {
                            'role': 'user',
                            'content': env_response.observation
                        },
                    ])
                logger.debug(f'Task: {task_index} Step: {step_index} finished')

                if env_response.done:
                    break

            return SolveResult(
                reward=reward,
                info=info,
                messages=messages,
                total_cost=0,
            )

        adapter_instance = self

        ToolCallingAgent.solve = patched_solve

        return 'ToolCallingAgent.solve patched successfully'

    def solve(self, env_name, task_index, infer_cfg, **kwargs):
        """
        Solve a specific task in the TauBench environment.

        Args:
            env_name (str): The name of the TauBench environment.
            task_index (int): The index of the task to solve.
            **kwargs: Additional arguments for the task.

        Returns:
            dict: The result of the task.
        """
        from tau_bench.agents.tool_calling_agent import ToolCallingAgent
        from tau_bench.envs import get_env

        # This method can be implemented to solve specific tasks in the TauBench environment
        isolated_env = get_env(
            env_name=env_name,
            user_strategy='llm',
            user_model='dummy',  # Use dummy model to prevent errors
            user_provider='openai',  # Use dummy provider to prevent errors
            task_split='test',
            task_index=task_index,
        )
        agent = ToolCallingAgent(
            tools_info=isolated_env.tools_info,
            wiki=isolated_env.wiki,
            model='dummy',  # Use dummy model to prevent errors
            provider='dummy',  # Use dummy provider to prevent errors
            temperature=0,  # dummy temperature to prevent errors
        )

        res = agent.solve(env=isolated_env, task_index=task_index, infer_cfg=infer_cfg)

        return res.model_dump()
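Similarly, a rough sketch of how the removed TauBenchAdapter was exercised; the endpoint, model name, environment name, and task index are placeholders, and the tau_bench package must be installed:

# Hypothetical driver for the removed TauBenchAdapter; values are placeholders.
adapter = TauBenchAdapter(
    api_url='http://127.0.0.1:8000/v1/chat/completions',
    model_id='my-model',
)
results = adapter.predict(
    inputs=[{'raw_input': {'env_name': 'retail', 'task_index': 0}}],
    infer_cfg={'temperature': 0.0},
)
# Each result wraps the JSON-serialized TauBench SolveResult as assistant content.
print(results[0]['choices'][0]['message']['content'])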
evalscope/models/custom/custom_model.py (deleted)
@@ -1,50 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Union


class CustomModel(ABC):

    def __init__(self, config: dict, **kwargs):
        self.config = config
        self.kwargs = kwargs

    @abstractmethod
    @torch.no_grad()
    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        """
        Model prediction function for batch inputs.

        Args:
            prompts (str): The input batch of prompts to predict.

            **kwargs: kwargs

        Returns:
            res (dict): The model prediction results (batch). Format:
            [
                {
                    'choices': [
                        {
                            'index': 0,
                            'message': {
                                'content': 'xxx',
                                'role': 'assistant'
                            }
                        }
                    ],
                    'created': 1677664795,
                    'model': 'gpt-3.5-turbo-0613',  # should be model_id
                    'object': 'chat.completion',
                    'usage': {
                        'completion_tokens': 17,
                        'prompt_tokens': 57,
                        'total_tokens': 74
                    }
                },
                ...
            ]
        """
        raise NotImplementedError
evalscope/models/custom/dummy_model.py (deleted)
@@ -1,99 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import time
from typing import List

from evalscope.models import CustomModel
from evalscope.utils.logger import get_logger

logger = get_logger()


class DummyCustomModel(CustomModel):

    def __init__(self, config: dict = {}, **kwargs):
        super(DummyCustomModel, self).__init__(config=config, **kwargs)

    def make_request_messages(self, input_item: dict) -> list:
        """
        Make request messages for OpenAI API.
        """
        if input_item.get('messages', None):
            return input_item['messages']

        data: list = input_item['data']
        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
            query = '\n'.join(''.join(item) for item in data)
            system_prompt = input_item.get('system_prompt', None)
        else:
            query = data[0]
            system_prompt = input_item.get('system_prompt', None)

        messages = []
        if system_prompt:
            messages.append({'role': 'system', 'content': system_prompt})

        messages.append({'role': 'user', 'content': query})

        return messages

    def predict(self, prompts: List[dict], **kwargs):
        original_inputs = kwargs.get('origin_inputs', None)
        infer_cfg = kwargs.get('infer_cfg', None)

        logger.debug(f'** Prompts: {prompts}')
        if original_inputs is not None:
            logger.debug(f'** Original inputs: {original_inputs}')
        if infer_cfg is not None:
            logger.debug(f'** Inference config: {infer_cfg}')

        # Simulate a response based on the prompts
        # Must return a list of dicts with the same format as the OpenAI API.
        responses = []
        for input_item in original_inputs:
            # message = self.make_request_messages(input_item)
            # response = f'Dummy response for prompt: {message}'

            res_d = {
                'choices': [{
                    'index': 0,
                    'message': {
                        'content': '*PlaceHolder*',
                        'role': 'assistant'
                    }
                }],
                'created': time.time(),
                'model': self.config.get('model_id'),
                'object': 'chat.completion',
                'usage': {
                    'completion_tokens': 0,
                    'prompt_tokens': 0,
                    'total_tokens': 0
                }
            }

            responses.append(res_d)

        return responses


if __name__ == '__main__':
    from evalscope import TaskConfig, run_task

    dummy_model = DummyCustomModel()
    task_config = TaskConfig(
        model=dummy_model,
        model_id='evalscope-model-dummy',
        datasets=['gsm8k'],
        eval_type='custom',  # must be custom for custom model evaluation
        generation_config={
            'max_new_tokens': 100,
            'temperature': 0.0,
            'top_p': 1.0,
            'top_k': 50,
            'repetition_penalty': 1.0
        },
        debug=True,
        limit=5,
    )

    eval_results = run_task(task_cfg=task_config)