evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
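The listing shows the adapter and model layers being rebuilt around a new `evalscope/api` package, while the core runner files (`evalscope/run.py`, `evalscope/config.py`, `evalscope/arguments.py`) receive only modest changes, so the documented entry points appear to carry over. Below is a minimal sketch of driving an evaluation against 1.0.0, assuming `TaskConfig` and `run_task` keep their 0.17.x public signatures; the model ID, dataset names, and `limit` value are illustrative and not taken from this diff. The full diffs for two of the rewritten benchmark adapters follow.

```python
# Minimal sketch, assuming TaskConfig/run_task keep their 0.17.x public signatures in 1.0.0.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',    # illustrative model ID
    datasets=['trivia_qa', 'tool_bench'],  # benchmarks whose adapters are diffed below
    limit=5,                               # small smoke-test run
)
run_task(task_cfg=task_cfg)
```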
--- a/evalscope/benchmarks/tool_bench/tool_bench_adapter.py
+++ b/evalscope/benchmarks/tool_bench/tool_bench_adapter.py
@@ -1,81 +1,102 @@
 import json
-from typing import
-
-from evalscope.
-from evalscope.
-from evalscope.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from typing import Any, Dict
+
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessage, dict_to_chat_message
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='tool_bench',
+        pretty_name='ToolBench-Static',
+        tags=[Tags.REASONING, Tags.FUNCTION_CALLING],
+        description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+        'It includes various subsets such as in-domain and out-of-domain, '
+        'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)',
+        dataset_id='AI-ModelScope/ToolBench-Static',
+        subset_list=['in_domain', 'out_of_domain'],
+        metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
+        eval_split='test',
+    )
 )
-class ToolBenchAdapter(
+class ToolBenchAdapter(DefaultDataAdapter):
+    """
+    ToolBench adapter using the new data processing framework.
+    """

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-
-
-
-
-
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from input data.
-        """
-        messages = input_d['messages']
-        # use prepared messages and remove the name field
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        messages = record['messages']
+
+        # Process messages and remove the name field, convert function messages
+        processed_messages = []
         for message in messages:
-
-
-
-
-
-
-
-
-
-
-
-
-
-            return
-
-
-
-
-
-
-
-
-
-
-
+            msg_dict = message.copy()
+            if 'name' in msg_dict:
+                del msg_dict['name']
+            if 'role' in msg_dict:
+                if msg_dict['role'] == 'function':
+                    content = json.dumps(msg_dict, ensure_ascii=False)
+                    msg_dict['role'] = 'user'
+                    msg_dict['content'] = content
+
+            # Convert to ChatMessage object
+            chat_msg = dict_to_chat_message(msg_dict)
+            processed_messages.append(chat_msg)
+
+        return Sample(
+            input=processed_messages,
+            target='',  # Store the full record as target for evaluation
+            metadata={
+                'target': record['target'],
+                'tools': record['tools'],
+                'messages': record['messages']
+            }
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
         from .utils import calculate_metrics

-
-
-
-
-
-
-
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        doc = task_state.metadata
+
+        try:
+            data = {
+                'target': doc['target'],
+                'predictions': filtered_prediction,
+                'tools': doc['tools'],
+            }
+            metrics = calculate_metrics(data)
+
+            score.value = metrics
+            score.explanation = f'Metrics: {metrics}'
+            score.metadata = {'target': doc['target'], 'tools': doc['tools'], 'detailed_metrics': metrics}
+            # Set the main score (you can choose the most important metric)
+            score.main_score_name = 'F1'

-
-
-
+        except Exception as e:
+            # Handle evaluation errors
+            score.value = {'Act.EM': 0.0, 'Plan.EM': 0.0, 'F1': 0.0, 'HalluRate': 1.0, 'Rouge-L': 0.0}
+            score.explanation = f'Evaluation failed: {str(e)}'
+            score.metadata = {'error': str(e)}
+            score.main_score_name = 'F1'

-        return
+        return score
--- a/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
+++ b/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
@@ -1,142 +1,74 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI Inc, and its affiliates.
-import csv
-import os

-from
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.constants import EvalType, OutputType
-from evalscope.utils import get_logger
+from typing import Any, Dict

-
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

 logger = get_logger()

+PROMPT_TEMPLATE = """
+Read the content and answer the following question.

-
-    name='trivia_qa',
-    pretty_name='TriviaQA',
-    tags=['QA', 'Reading Comprehension'],
-    description=
-    'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.',  # noqa: E501
-    dataset_id='modelscope/trivia_qa',
-    subset_list=['default'],
-    metric_list=['AverageAccuracy'],
-    few_shot_num=5,
-    train_split='dev',
-    eval_split='test',
-)
-class TriviaQaAdapter(DataAdapter):
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
+Content: {content}

-
-        data_dict = {}
-        for subset_name in subset_list:
-            data_dict[subset_name] = {}
-            for split in [self.train_split, self.eval_split]:
-                if os.path.exists(dataset_name_or_path):
-                    file_path = os.path.join(dataset_name_or_path, f'trivia-{split}.qa.csv')
-                else:
-                    file_path = os.path.join(work_dir, dataset_name_or_path, f'trivia-{split}.qa.csv')
-                if os.path.exists(file_path):
-                    with open(file_path, 'r', encoding='utf-8') as f:
-                        reader = csv.reader(f, delimiter='\t')
-                        split_data = []
-                        for row in reader:
-                            assert len(row) == 2
-                            question = row[0]
-                            answers = eval(row[1])
-                            split_data.append({
-                                'input': [{
-                                    'role': 'system',
-                                    'content': 'Follow the given examples and answer the question.'
-                                }, {
-                                    'role': 'user',
-                                    'content': question
-                                }],
-                                'ideal':
-                                answers
-                            })
-                        data_dict[subset_name][split] = split_data
+Question: {question}

-
+Keep your The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+""".lstrip()  # noqa: E501

-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        """
-        Generate model prompt from raw input, unify the prompt format for TriviaQA benchmark.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                "sunset bulevard",
-                "west sunset boulevard",
-                "sunset blvd"
-            ]
+@register_benchmark(
+    BenchmarkMeta(
+        name='trivia_qa',
+        pretty_name='TriviaQA',
+        dataset_id='evalscope/trivia_qa',
+        tags=[Tags.QA, Tags.READING_COMPREHENSION],
+        description=
+        'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.',  # noqa: E501
+        subset_list=['rc.wikipedia'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='validation',
+        metric_list=[{
+            'acc': {
+                'allow_inclusion': True
             }
+        }],
+        prompt_template=PROMPT_TEMPLATE,
+    )
+)
+class TriviaQaAdapter(DefaultDataAdapter):

-
-
-        """
-
-        def get_sys_prompt(inp: dict) -> str:
-            return inp['input'][0]['content']
-
-        if self.few_shot_num > 0:
-            sys_prompt = get_sys_prompt(input_d)
-        else:
-            sys_prompt = None
-        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt = context
-
-        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)
-
-    def get_gold_answer(self, input_d: dict) -> list:
-        # Get the gold choice
-        ans: list = input_d.get('ideal', [])
-        return ans
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        """
-        Parse the model output to get the answer.
-
-        Args:
-            result: Predicted answer from the model. A list of loglikelihood values for inputs pairs.
-            raw_input_d: The raw input. A single data format of the TriviaQA:
-            eval_type: The type of evaluation, e.g. 'checkpoint' or 'service' or 'custom'.
-
-        Returns:
-            The predicted answer.
-        """
-        return result
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)

-    def
-
-
-
-        return
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        answers = record['answer']['aliases'] + record['answer']['normalized_aliases']
+        content = record['entity_pages']['wiki_context']
+        return Sample(
+            input=question, target=answers, metadata={
+                'question_id': record['question_id'],
+                'content': content
+            }
+        )

-
-
+    def format_prompt_template(self, sample):
+        return self.prompt_template.format(content=sample.metadata['content'], question=sample.input)

-
-
-
+    def extract_answer(self, prediction: str, task_state: TaskState):
+        # use regex to extract the answer from the prediction
+        import re

-
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()