evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/bfcl/generation.py (added, +222 -0)

@@ -0,0 +1,222 @@
+import json
+import time
+from typing import Any
+
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import dict_to_chat_message
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput, ModelUsage
+from evalscope.api.tool.tool_info import ToolInfo
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def predict(model: Model, sample: Sample) -> ModelOutput:
+    """Main prediction function for BFCL using the new API framework."""
+    # Extract the row data from sample metadata
+    row = sample.metadata
+    is_fc_model = row.get('is_fc_model', False)
+
+    if is_fc_model:
+        response, model_usage = generate_turn_with_tools(model, row)
+    else:
+        response, model_usage = generate_turn(model, row)
+
+    sample.metadata['generation'] = response
+    # wrap response with openai types
+    return ModelOutput(
+        model=model.name,
+        choices=[ChatCompletionChoice.from_content(json.dumps(response, ensure_ascii=False, indent=2))],
+        model_usage=model_usage,
+        time=time.time()
+    )
+
+
+def generate_turn(model: Model, row: dict[str, Any]):
+    from bfcl_eval.constants.default_prompts import (
+        DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
+        MAXIMUM_STEP_LIMIT,
+    )
+    from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+    from bfcl_eval.model_handler.utils import default_decode_execute_prompting
+
+    all_model_responses = []
+    current_messages = []
+    turns = row['turns']
+    model_usage = ModelUsage()
+
+    for turn_idx, messages in enumerate(turns):
+        n_steps = 0
+        current_responses = []
+        current_messages += messages.copy()
+
+        if str(turn_idx) in row['missing_functions']:
+            assert len(messages) == 0, 'Holdout turn should not have user message.'
+            new_turn = [{
+                'role':
+                'user',
+                'content':
+                DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
+                    functions=row['missing_functions'][str(turn_idx)]
+                ),
+            }]
+            current_messages += new_turn
+
+        while True:
+            # Create a sample for the current messages
+            from evalscope.api.messages.chat_message import dict_to_chat_message
+            chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+
+            # Get model response using generate method
+            model_output = model.generate(chat_messages)
+
+            # Handle the response based on the model output structure
+            message = model_output.message
+            model_usage += model_output.usage
+
+            current_messages.append(message)
+            if isinstance(message, str):
+                result = message
+            else:
+                result = message.text
+
+            logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
+            current_responses.append(result)
+
+            execute_tools = row.get('should_execute_tool_calls', False)
+            if execute_tools:
+                try:
+                    tool_calls = default_decode_execute_prompting(result)
+                except Exception:
+                    tool_calls = None
+
+                if tool_calls is None:
+                    break
+
+                tool_outputs, _ = execute_multi_turn_func_call(
+                    tool_calls,
+                    initial_config=row['initial_config'],
+                    involved_classes=row['involved_classes'],
+                    model_name='evaluator_loop',
+                    test_entry_id=row['id'],
+                    long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                    is_evaL_run=False,
+                )
+                # Append tool outputs to the current messages
+                tool_results = []
+                for tool_output, tool_call in zip(tool_outputs, tool_calls):
+                    tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
+                current_messages.append({
+                    'role': 'user',
+                    'content': repr(tool_results),
+                })
+            else:
+                break
+
+            n_steps += 1
+            if n_steps > MAXIMUM_STEP_LIMIT:
+                logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                break
+
+        all_model_responses.append(current_responses)
+
+    return all_model_responses, model_usage
+
+
+def generate_turn_with_tools(model: Model, row: dict[str, Any]):
+    from bfcl_eval.constants.default_prompts import DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC, MAXIMUM_STEP_LIMIT
+    from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
+    from bfcl_eval.model_handler.utils import convert_to_function_call
+
+    all_model_responses = []
+    current_messages = []
+    turns = row['turns']
+    model_usage = ModelUsage()
+
+    for turn_idx, messages in enumerate(turns):
+        n_steps = 0
+        current_responses = []
+        current_messages += messages.copy()
+        tools = row['tools']
+
+        if str(turn_idx) in row['missing_functions']:
+            assert len(messages) == 0, 'Holdout turn should not have user message.'
+            # inject new functions on the fly
+            new_tools = row['missing_functions'][str(turn_idx)]
+            for new_tool in new_tools:
+                cur_tool = new_tool[0]
+                # change type to object
+                if cur_tool['parameters']['type'] != 'object':
+                    cur_tool['parameters']['type'] = 'object'
+                tools.append({
+                    'type': 'function',
+                    'function': cur_tool,
+                })
+            new_turn = [{
+                'role': 'user',
+                'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
+            }]
+            current_messages += new_turn
+
+        while True:
+            # Create a sample for the current messages with tools
+            chat_messages = [dict_to_chat_message(msg) for msg in current_messages]
+            current_sample = Sample(
+                input=chat_messages,
+                target='',
+                tools=[ToolInfo.model_validate(tool['function']) for tool in tools],
+            )
+
+            # Get model response
+            model_output = model.generate(current_sample.input, tools=current_sample.tools)
+
+            # Handle the response based on the model output structure
+            message = model_output.message
+            model_usage += model_output.usage
+
+            current_messages.append(message)
+            if isinstance(message, str):
+                model_responses = [message]
+                tool_call_strs = None
+            elif message.tool_calls:
+                model_responses = [{tc.function.name: tc.function.arguments} for tc in message.tool_calls]
+                try:
+                    tool_call_strs = convert_to_function_call(model_responses)
+                except Exception as e:
+                    logger.error(f'Error converting tool calls to function call strings: {e}')
+                    tool_call_strs = None
+            else:
+                model_responses = [message.text]
+                tool_call_strs = None
+
+            current_responses.extend(model_responses)
+
+            execute_tools = row.get('should_execute_tool_calls', False)
+            if execute_tools and tool_call_strs is not None:
+                tool_outputs, _ = execute_multi_turn_func_call(
+                    tool_call_strs,
+                    initial_config=row['initial_config'],
+                    involved_classes=row['involved_classes'],
+                    model_name='evaluator_loop',
+                    test_entry_id=row['id'],
+                    long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
+                    is_evaL_run=False,
+                )
+
+                for tc, tool_output in zip(message.tool_calls, tool_outputs, strict=False):
+                    current_messages.append({
+                        'role': 'tool',
+                        'tool_call_id': tc.id,
+                        'content': json.dumps({'response': tool_output}),
+                    })
+            else:
+                break
+
+            n_steps += 1
+            if n_steps > MAXIMUM_STEP_LIMIT:
+                logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
+                break
+
+        all_model_responses.append(current_responses)
+
+    return all_model_responses, model_usage
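Note: both `generate_turn` and `generate_turn_with_tools` above follow the same agentic turn loop: append the model's message, decode and execute any tool calls, feed the tool output back, and stop once the model produces no tool call or bfcl_eval's MAXIMUM_STEP_LIMIT is exceeded. The following is a minimal standalone sketch of that control flow only; `call_model`, `run_tools`, and `MAX_STEPS` are hypothetical stand-ins, not evalscope or bfcl_eval APIs.

from typing import Callable, List, Optional

MAX_STEPS = 20  # hypothetical stand-in for bfcl_eval's MAXIMUM_STEP_LIMIT


def run_turn(
    messages: List[dict],
    call_model: Callable[[List[dict]], dict],
    run_tools: Callable[[dict], Optional[List[dict]]],
) -> List[dict]:
    """Step the model until it stops calling tools or the step limit is hit."""
    responses = []
    for _step in range(MAX_STEPS):
        reply = call_model(messages)       # one model generation step
        messages.append(reply)
        responses.append(reply)
        tool_results = run_tools(reply)    # decode + execute tool calls, if any
        if not tool_results:               # no tool call -> this turn is finished
            break
        messages.extend(tool_results)      # feed tool output back to the model
    return responses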
evalscope/benchmarks/ceval/ceval_adapter.py (+93 -162)

@@ -1,73 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
-import os
-from collections import defaultdict
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.metrics import exact_match
-from evalscope.metrics.completion_parsers import ResponseParser
-from evalscope.utils.io_utils import csv_to_list
-from evalscope.utils.logger import get_logger

-
+from typing import Any, Dict

-
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

-
-    'computer_network',
-    'operating_system',
-    'computer_architecture',
-    'college_programming',
-    'college_physics',
-    'college_chemistry',
-    'advanced_mathematics',
-    'probability_and_statistics',
-    'discrete_mathematics',
-    'electrical_engineer',
-    'metrology_engineer',
-    'high_school_mathematics',
-    'high_school_physics',
-    'high_school_chemistry',
-    'high_school_biology',
-    'middle_school_mathematics',
-    'middle_school_biology',
-    'middle_school_physics',
-    'middle_school_chemistry',
-    'veterinary_medicine',
-    'college_economics',
-    'business_administration',
-    'marxism',
-    'mao_zedong_thought',
-    'education_science',
-    'teacher_qualification',
-    'high_school_politics',
-    'high_school_geography',
-    'middle_school_politics',
-    'middle_school_geography',
-    'modern_chinese_history',
-    'ideological_and_moral_cultivation',
-    'logic',
-    'law',
-    'chinese_language_and_literature',
-    'art_studies',
-    'professional_tour_guide',
-    'legal_professional',
-    'high_school_chinese',
-    'high_school_history',
-    'middle_school_history',
-    'civil_servant',
-    'sports_science',
-    'plant_protection',
-    'basic_medicine',
-    'clinical_medicine',
-    'urban_and_rural_planner',
-    'accountant',
-    'fire_engineer',
-    'environmental_impact_assessment_engineer',
-    'tax_accountant',
-    'physician',
-]
+logger = get_logger()

 SUBJECT_MAPPING = {
     'computer_network': ['Computer Network', '计算机网络', 'STEM'],
@@ -124,115 +65,105 @@ SUBJECT_MAPPING = {
     'physician': ['Physician', '医师资格', 'Other']
 }

- [17 removed lines (old 127-143); their content was not preserved in the page extract]
+# Based on the prompt template for Chinese evaluation
+USER_PROMPT_TEMPLATE = """以下是中国关于{subject}的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 A、B、C、D 中的一个。
+
+问题:{question}
+选项:
+{choices}
+""".lstrip()  # noqa: E501
+
+FEWSHOT_TEMPLATE = """以下是一些示例问题:
+
+{fewshot}
+
+""".lstrip()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ceval',
+        pretty_name='C-Eval',
+        tags=[Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE, Tags.CHINESE],
+        description=
+        'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.',  # noqa: E501
+        dataset_id='evalscope/ceval',
+        subset_list=list(SUBJECT_MAPPING.keys()),
+        metric_list=['acc'],
+        few_shot_num=5,
+        train_split='dev',
+        eval_split='val',
+        prompt_template=USER_PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
 )
-class CEVALAdapter(
+class CEVALAdapter(MultiChoiceAdapter):

     def __init__(self, **kwargs):

-        few_shot_num = kwargs.get('few_shot_num', 0)
-        if few_shot_num > 5:
-            logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-            kwargs['few_shot_num'] = 5
         super().__init__(**kwargs)

         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
-        self.choices = ['A', 'B', 'C', 'D']
-
-    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict = defaultdict(dict)
-        for subset_name in subset_list:
-            for split_name in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
-                if os.path.exists(file_path):
-                    data_dict[subset_name][split_name] = csv_to_list(file_path)
-
-        return data_dict
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for C-Eval benchmark.
-
-        Args:
-            input_d (dict): The raw input. A single data format of the C-Eval:
-
-            {'id': 0,
-            'question': '下列关于税法基本原则的表述中,不正确的是____。',
-            'A': '税收法定原则包括税收要件法定原则和税务合法性原则',
-            'B': '税收公平原则源于法律上的平等性原则',
-            'C': '税收效率原则包含经济效率和行政效率两个方面',
-            'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定',
-            'answer': 'D',
-            'explanation': ''}
-
-        Returns:
-            {'data': ['prompt ...']}
-        """
-
-        few_shot_prompts = [self._format_example(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-        if len(few_shot_prompts) > 0:
-            context: str = '\n'.join(few_shot_prompts) + '\n'
-        else:
-            context = ''
-
-        query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
-
-        subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-        full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

- [7 removed lines (old 203-209); their content was not preserved in the page extract]
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        # Build choices list from A, B, C, D fields
+        choices = [record['A'], record['B'], record['C'], record['D']]
+        subset = self.current_subset_name
+
+        return Sample(
+            input=record['question'],
+            choices=choices,
+            target=record['answer'],
+            metadata={
+                'id': record.get('id', ''),
+                'explanation': record.get('explanation', ''),
+                'subject': subset
+            },
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        q_str = f"""问题:{sample.input}"""
+        choices = sample.choices if sample.choices is not None else []
+        opt_str_list = []
+        for i, choice in enumerate(choices):
+            opt_str_list.append(f"""{chr(65 + i)}. {choice}""")
+        opt_str = '\n'.join(opt_str_list)
+        opt_str = f"""选项:\n{opt_str}"""
+        exp_str = f"""解析:{sample.metadata.get('explanation', '')}"""
+        ans_str = f"""答案:{sample.target}"""
+        final_str = '\n'.join([q_str, opt_str, exp_str, ans_str])
+
+        return final_str
+
+    def format_fewshot_template(self, fewshot, sample):
+        fewshot_str = FEWSHOT_TEMPLATE.format(fewshot=fewshot)
+        prompt_str = self.format_prompt_template(sample)
+        return fewshot_str + '\n' + prompt_str
+
+    def format_prompt_template(self, sample):
+        subject_name = SUBJECT_MAPPING.get(sample.metadata['subject'])[1]
+        choices = sample.choices if sample.choices is not None else []
+        choices_str = '\n'.join([f'{chr(65 + i)}. {choice}' for i, choice in enumerate(choices)])
+
+        return USER_PROMPT_TEMPLATE.format(subject=subject_name, question=sample.input, choices=choices_str)
+
+    def extract_answer(self, prediction, task_state) -> str:
         """
-
+        Extract the answer from the prediction based on the task state.

         Args:
-
-
-            eval_type: `checkpoint` or `service` or `custom`. Default is `checkpoint`.
+            prediction (str): The model's prediction string
+            task_state (dict): The current task state containing metadata

         Returns:
-            The
+            str: The extracted answer from the prediction
         """
-
-            return result
-        else:
-            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
-
-    def match(self, gold: str, pred: str) -> float:
-        return exact_match(gold=gold, pred=pred)
-
-    def _format_example(self, input_d: dict, include_answer=True):
-        example = '问题:' + input_d['question']
-        for choice in self.choices:
-            example += f'\n{choice}. {input_d[f"{choice}"]}'
+        import re

-
-
+        # Use regex to find the answer in the format "答案:LETTER"
+        match = re.search(r'答案:([A-D])', prediction)
+        if match:
+            return match.group(1)
         else:
-
-
+            logger.warning(f'No valid answer found in prediction: {prediction}')
+            return ''