evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/trivia_qa/trivia_qa.py
DELETED
@@ -1,89 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import datasets
-import json
-import os
-import pandas as pd
-
-_CITATION = """\
-@article{2017arXivtriviaqa,
-       author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
-                 Daniel and {Zettlemoyer}, Luke},
-        title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
-      journal = {arXiv e-prints},
-         year = 2017,
-          eid = {arXiv:1705.03551},
-        pages = {arXiv:1705.03551},
-archivePrefix = {arXiv},
-       eprint = {1705.03551},
-}
-"""
-
-_DESCRIPTION = """\
-TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
-"""
-
-_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'
-
-_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'
-
-task_list = ['default']
-
-
-class TriviaQAConfig(datasets.BuilderConfig):
-
-    def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
-class TriviaQA(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [TriviaQAConfig(name=task_name, ) for task_name in task_list]
-
-    def _info(self):
-        features = datasets.Features({
-            'input': [{
-                'role': datasets.features.Value('string'),
-                'content': datasets.features.Value('string'),
-            }],
-            'ideal': [datasets.Value('string')],
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split('dev'),
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        with open(filepath, encoding='utf-8') as f:
-            contents = [json.loads(line) for line in f.readlines()]
-        for i, instance in enumerate(contents):
-            yield i, instance
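The deleted `trivia_qa.py` above was a Hugging Face `datasets` loading script. As a hedged sketch (not part of the package), this is roughly how such a script-based builder used to be consumed; the script path and the `trust_remote_code` flag depend on the installed `datasets` version, and in 1.0.0 the benchmark is presumably served through the new `evalscope/api` adapters instead, so this is illustrative only:

```python
# Illustrative sketch only: loading the (now removed) script-based builder with
# an older `datasets` release that still supports local loading scripts.
from datasets import load_dataset

ds = load_dataset(
    'evalscope/benchmarks/trivia_qa/trivia_qa.py',  # path to the deleted script
    name='default',          # the single config declared in task_list
    split='test',            # the script also exposes a 'dev' split
    trust_remote_code=True,  # required by recent datasets releases for script builders
)
for record in ds.select(range(2)):
    # Each record has chat-style 'input' messages and a list of 'ideal' answers.
    print(record['input'], record['ideal'])
```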

evalscope/benchmarks/truthful_qa/truthful_qa.py
DELETED
@@ -1,163 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""TruthfulQA dataset."""
-# flake8: noqa
-
-import csv
-import datasets
-import json
-
-_CITATION = """\
-@misc{lin2021truthfulqa,
-    title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
-    author={Stephanie Lin and Jacob Hilton and Owain Evans},
-    year={2021},
-    eprint={2109.07958},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
-}
-"""
-
-_DESCRIPTION = """\
-TruthfulQA is a benchmark to measure whether a language model is truthful in
-generating answers to questions. The benchmark comprises 817 questions that
-span 38 categories, including health, law, finance and politics. Questions are
-crafted so that some humans would answer falsely due to a false belief or
-misconception. To perform well, models must avoid generating false answers
-learned from imitating human texts.
-"""
-
-_HOMEPAGE = 'https://github.com/sylinrl/TruthfulQA'
-
-_LICENSE = 'Apache License 2.0'
-
-
-class TruthfulQaConfig(datasets.BuilderConfig):
-    """BuilderConfig for TruthfulQA."""
-
-    def __init__(self, url, features, **kwargs):
-        """BuilderConfig for TruthfulQA.
-        Args:
-          url: *string*, the url to the configuration's data.
-          features: *list[string]*, list of features that'll appear in the feature dict.
-          **kwargs: keyword arguments forwarded to super.
-        """
-        super().__init__(version=datasets.Version('1.1.0'), **kwargs)
-        self.url = url
-        self.features = features
-
-
-class TruthfulQa(datasets.GeneratorBasedBuilder):
-    """TruthfulQA is a benchmark to measure whether a language model is truthful in generating answers to questions."""
-
-    BUILDER_CONFIGS = [
-        TruthfulQaConfig(
-            name='generation',
-            # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
-            url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/TruthfulQA.csv',
-            features=datasets.Features({
-                'type': datasets.Value('string'),
-                'category': datasets.Value('string'),
-                'question': datasets.Value('string'),
-                'best_answer': datasets.Value('string'),
-                'correct_answers': datasets.features.Sequence(datasets.Value('string')),
-                'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
-                'source': datasets.Value('string'),
-            }),
-            description=
-            "The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
-        ),
-        TruthfulQaConfig(
-            name='multiple_choice',
-            # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
-            url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/mc_task.json',
-            features=datasets.Features({
-                'question': datasets.Value('string'),
-                'mc1_targets': {
-                    'choices': datasets.features.Sequence(datasets.Value('string')),
-                    'labels': datasets.features.Sequence(datasets.Value('int32')),
-                },
-                'mc2_targets': {
-                    'choices': datasets.features.Sequence(datasets.Value('string')),
-                    'labels': datasets.features.Sequence(datasets.Value('int32')),
-                },
-            }),
-            description=
-            "The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
-        ),
-    ]
-
-    def _info(self):
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=self.config.features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download(self.config.url)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    'filepath': data_dir,
-                },
-            ),
-        ]
-
-    def _split_csv_list(self, csv_list: str, delimiter: str = ';') -> str:
-        """
-        Splits a csv list field, delimited by `delimiter` (';'), into a list
-        of strings.
-        """
-        csv_list = csv_list.strip().split(delimiter)
-        return [item.strip() for item in csv_list]
-
-    def _generate_examples(self, filepath):
-        if self.config.name == 'multiple_choice':
-            # Multiple choice data is in a `JSON` file.
-            with open(filepath, encoding='utf-8') as f:
-                contents = json.load(f)
-            for key, row in enumerate(contents):
-                yield key, {
-                    'question': row['question'],
-                    'mc1_targets': {
-                        'choices': list(row['mc1_targets'].keys()),
-                        'labels': list(row['mc1_targets'].values()),
-                    },
-                    'mc2_targets': {
-                        'choices': list(row['mc2_targets'].keys()),
-                        'labels': list(row['mc2_targets'].values()),
-                    },
-                }
-        else:
-            # Generation data is in a `CSV` file.
-            with open(filepath, newline='', encoding='utf-8-sig') as f:
-                contents = csv.DictReader(f)
-                for key, row in enumerate(contents):
-                    # Ensure that references exist.
-                    if not row['Correct Answers'] or not row['Incorrect Answers']:
-                        continue
-                    yield key, {
-                        'type': row['Type'],
-                        'category': row['Category'],
-                        'question': row['Question'],
-                        'best_answer': row['Best Answer'],
-                        'correct_answers': self._split_csv_list(row['Correct Answers']),
-                        'incorrect_answers': self._split_csv_list(row['Incorrect Answers']),
-                        'source': row['Source'],
-                    }
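The deleted `truthful_qa.py` builder performed two small transformations worth noting: semicolon-separated answer lists were split into Python lists, and each `mc_task.json` row's target dict was flattened into parallel `choices`/`labels` sequences. A self-contained sketch of both (the sample values are invented for illustration):

```python
# Stand-alone sketch of the two row transformations from the deleted builder.
def split_csv_list(csv_list: str, delimiter: str = ';') -> list:
    # Mirrors TruthfulQa._split_csv_list: split on ';' and strip whitespace.
    return [item.strip() for item in csv_list.strip().split(delimiter)]

print(split_csv_list('Nothing happens; You digest the seeds'))
# -> ['Nothing happens', 'You digest the seeds']

# Hypothetical mc1_targets entry: answer text -> 0/1 label, as in mc_task.json.
mc1_targets = {'The seeds pass through you.': 1, 'You grow a watermelon.': 0}
record = {'choices': list(mc1_targets.keys()), 'labels': list(mc1_targets.values())}
print(record)
# -> {'choices': ['The seeds pass through you.', 'You grow a watermelon.'], 'labels': [1, 0]}
```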

evalscope/benchmarks/utils.py
DELETED
@@ -1,60 +0,0 @@
-from dataclasses import asdict, dataclass
-from functools import wraps
-from typing import Dict, List, Optional, Union
-
-from .filters import Filter
-
-
-@dataclass
-class PromptData:
-    data: List[str]
-    index: Optional[Union[int, str]] = 0
-    system_prompt: Optional[str] = None
-    multi_choices: Optional[List[str]] = None
-    id: Optional[str] = None
-    messages: Optional[List[dict]] = None
-    extra_data: Optional[Dict] = None
-
-    def to_dict(self) -> Dict:
-        return {k: v for k, v in asdict(self).items() if v is not None}
-
-
-def preprocess_decorator(func):
-
-    @wraps(func)
-    def wrapper(self, result: str, raw_input_d: dict = None, **kwargs):
-        if result is None:
-            result = ''
-        filters = self.config_kwargs.get('filters', None)
-        if filters:
-            # Apply filters to the result
-            for filter_name, filter_value in filters.items():
-                result = Filter.apply(filter_name, result, filter_value)
-        return func(self, result, raw_input_d, **kwargs)
-
-    return wrapper
-
-
-def load_file_with_extension(file_path: Union[str, List[str]]) -> List[dict]:
-    """
-    Load a file with a specific extension and return its content as a list of dictionaries.
-    """
-    import json
-    import os
-
-    if isinstance(file_path, str):
-        file_path = [file_path]
-
-    data = []
-    for path in file_path:
-        if not os.path.exists(path):
-            raise FileNotFoundError(f'The file {path} does not exist.')
-
-        with open(path, 'r', encoding='utf-8') as f:
-            if path.endswith('.json'):
-                data.extend(json.load(f))
-            elif path.endswith('.jsonl'):
-                data.extend([json.loads(line) for line in f])
-            elif path.endswith('.txt'):
-                data.extend([{'text': f.read()}])
-    return data
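`PromptData.to_dict()` above drops unset fields before serialization, which is the behaviour callers relied on. A trimmed, self-contained sketch (only a subset of the original fields is reproduced here):

```python
# Minimal reproduction of the None-dropping serialization from the deleted PromptData.
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional

@dataclass
class PromptData:
    data: List[str]
    index: Optional[int] = 0
    system_prompt: Optional[str] = None

    def to_dict(self) -> Dict:
        # Keep only fields that were actually set (None values are omitted).
        return {k: v for k, v in asdict(self).items() if v is not None}

print(PromptData(data=['What is 2 + 2?']).to_dict())
# -> {'data': ['What is 2 + 2?'], 'index': 0}
```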

evalscope/collections/evaluator.py
DELETED
@@ -1,375 +0,0 @@
-import json
-import os
-import pandas as pd
-import random
-from collections import defaultdict
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from copy import deepcopy
-from tabulate import tabulate
-from tqdm import tqdm
-from typing import Any, Dict, List
-
-from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.collections.sampler import DatasetEntry
-from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
-from evalscope.evaluator import Evaluator
-from evalscope.models import initialize_model_adapter
-from evalscope.report import ReportGenerator
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class SimpleEvaluator(Evaluator):
-
-    def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
-        super().__init__(
-            dataset_name_or_path=dataset_name,
-            data_adapter=data_adapter,
-            model_adapter=model_adapter,
-            task_cfg=task_cfg,
-            outputs=outputs)
-
-    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
-        input_prompts = [sample.prompt for sample in samples]
-        subset_name = samples[0].subset_name
-        try:
-            # get answer from model
-            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
-        except Exception as e:
-            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
-            # if ignore_errors is True, continue to next input
-            if self.task_cfg.ignore_errors:
-                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
-                return [None] * len(samples), samples
-            else:
-                raise e
-        # process answers
-        answers_list = []
-        for answer_d, input_prompt in zip(answer_ds, input_prompts):
-            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-            answers_list.append(processed_answer)
-        return answers_list, samples
-
-    def get_review(self, answer_d) -> dict:
-        review_id, reviewer_spec = self._generate_review_id(answer_d)
-        review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
-        return review_d
-
-    def get_score(self, review_d) -> float:
-        metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-        return metric_score
-
-
-class EvaluatorCollection:
-
-    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure, base_model):
-        self.task_cfg = task_cfg
-        self.data_adapter = data_adapter
-        self.outputs = outputs
-        self.model = base_model
-
-        self.dataset, self.dataset_name = self.load()
-        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
-        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
-        self.evaluators = self._initialize_evaluators()
-
-    def load(self) -> tuple[List[DatasetEntry], str]:
-        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
-        raw_dataset = self.data_adapter.load()
-        # random limit the dataset
-        limit = len(raw_dataset)
-        if self.task_cfg.limit is not None:
-            if isinstance(self.task_cfg.limit, int):
-                limit = self.task_cfg.limit
-            elif isinstance(self.task_cfg.limit, float):
-                limit = int(len(raw_dataset) * self.task_cfg.limit)
-        raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
-        # index dataset
-        datasets = []
-        for sample in raw_dataset:
-            sample['prompt'].update({'index': sample['index']})
-            datasets.append(DatasetEntry(**sample))
-
-        return datasets, dataset_name
-
-    @staticmethod
-    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
-        dataset_name_map = defaultdict(lambda: defaultdict(list))
-        for sample in dataset:
-            dataset_name, subset_name = sample.dataset_name, sample.subset_name
-            dataset_name_map[dataset_name][subset_name].append(sample.index)
-        return dataset_name_map
-
-    @staticmethod
-    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
-        dataset_id_map = {}
-        for sample in dataset:
-            dataset_id_map[sample.index] = sample
-        return dataset_id_map
-
-    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
-        evaluators = {}
-        # load dataset args
-        dataset_args = deepcopy(self.task_cfg.dataset_args)
-        common_args = dataset_args.get(DataCollection.NAME, {})
-        for dataset_name in self.dataset_name_map.keys():
-            benchmark = Benchmark.get(dataset_name)
-            model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
-            # update dataset args
-            cur_dataset_args = dataset_args.get(dataset_name, {})
-            cur_dataset_args.update(common_args)
-            # get data adapter
-            data_adapter = benchmark.get_data_adapter(cur_dataset_args)
-            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
-                                                       self.outputs)
-        return evaluators
-
-    def get_report(self, scores):
-        if not scores:
-            return
-
-        def get_dataframe(scores):
-            data = []
-            for dataset_name, data_map in self.dataset_name_map.items():
-                for subset_name, ids in data_map.items():
-                    for _id in ids:
-                        row_data: DatasetEntry = self.dataset_id_map[_id]
-                        for metric in scores[_id]:
-                            data.append(
-                                dict(
-                                    task_type=row_data.task_type,
-                                    categories=tuple(row_data.categories),
-                                    dataset_name=dataset_name,
-                                    subset_name=subset_name,
-                                    tags=row_data.tags,
-                                    metric=metric['metric_name'],
-                                    score=metric['score']))
-            return pd.DataFrame(data)
-
-        def aggregate_and_sort(df, group_by_cols):
-            # aggregate by group_by_cols, and calculate average_score and count
-            report_df = df.groupby(group_by_cols) \
-                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
-                .reset_index()
-            report_df['average_score'] = report_df['average_score'].round(4)
-            report_df = report_df.sort_values(by='count', ascending=False) \
-                .to_dict(orient='records')
-            return report_df
-
-        df = get_dataframe(scores)
-
-        # multi-level aggregation
-        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
-        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
-        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
-
-        # explode tags to multiple rows
-        df_exploded_tags = df.explode('tags')
-        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
-
-        # process multi-level categories
-        df_categories = df.copy()
-        # multi-level aggregation for categories
-        max_depth = df_categories['categories'].apply(len).max()
-        for level in range(max_depth):
-            df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
-                                                                                  if len(x) > level else '')
-        category_report_df = aggregate_and_sort(df_categories,
-                                                [f'category{level}' for level in range(max_depth)] + ['metric'])
-
-        # convert to dict format
-        report_dict = {
-            'subset_level': subset_report_df,
-            'dataset_level': dataset_report_df,
-            'task_level': task_report_df,
-            'tag_level': tag_report_df,
-            'category_level': category_report_df,
-        }
-
-        # record report
-        for level, data in report_dict.items():
-            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
-            logger.info(f'{level} Report:\n{table}')
-
-        report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
-        # Make report analysis
-        if self.task_cfg.analysis_report:
-            logger.info('Generating report analysis, please wait ...')
-            analysis = report.generate_analysis(self.task_cfg.judge_model_args)
-            logger.info('Report analysis:\n%s', analysis)
-        else:
-            logger.info('Skipping report analysis (`analysis_report=False`).')
-
-        # save report to JSON file
-        report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-        report.to_json(report_file_path)
-
-        logger.info(f'Report saved to {report_file_path}')
-        return report
-
-    def _filter_answer(self, pred_file_path):
-        answer_dict = defaultdict(dict)
-        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
-            answers_list = jsonl_to_list(pred_file_path)
-            # Create a set of sample indices for which we have answers
-            indices = set()
-            for answer in answers_list:
-                index = answer.get(AnswerKeys.INDEX)
-                answer_dict[index] = answer
-                indices.add(index)
-
-            # Filter dataset to only include samples that don't have answers
-            data = [sample for sample in self.dataset if sample.index not in indices]
-
-            # Initialize name map for the filtered dataset
-            data_map = self._init_name_map(data)
-
-            logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
-            return answer_dict, data, data_map
-        else:
-            # If cache isn't enabled or file doesn't exist, return the full dataset
-            return answer_dict, self.dataset, self.dataset_name_map
-
-    def get_answers(self):
-        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
-                                      f'{self.dataset_name}.jsonl')
-        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-
-        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
-
-        eval_batch_size = self.task_cfg.eval_batch_size
-        # Process samples and get answers
-        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
-            if self.task_cfg.eval_type == EvalType.SERVICE:
-                # Create a thread pool for parallel processing
-                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
-                    futures = []
-                    for sample in dataset:
-                        evaluator = self.evaluators[sample.dataset_name]
-                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
-                    # Process completed tasks
-                    for future in as_completed(futures):
-                        answer_list, samples = future.result()
-                        for answer_d, sample in zip(answer_list, samples):
-                            if answer_d is None:
-                                continue
-                            answers[sample.index] = answer_d
-                            dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
-                        pbar.update(1)
-            else:
-                for dataset_name, data_map in dataset_name_map.items():
-                    # get evaluator for the dataset
-                    evaluator = self.evaluators[dataset_name]
-                    for subset_name, ids in data_map.items():
-                        for i in range(0, len(ids), eval_batch_size):
-                            # get batch samples
-                            batch_ids = ids[i:i + eval_batch_size]
-                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                            answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
-                            # update answers
-                            for answer_d, sample in zip(answer_list, samples):
-                                if answer_d is None:
-                                    continue
-                                answers[sample.index] = answer_d
-                                dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
-                            pbar.update(1)
-        return answers
-
-    def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
-        """
-        Retrieve or generate reviews for given answers.
-
-        Args:
-            answers: Dictionary of answers indexed by sample index.
-
-        Returns:
-            Dictionary of reviews indexed by sample index.
-        """
-        # Set up the review file path
-        review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
-        os.makedirs(review_file_path, exist_ok=True)
-
-        review_history_map = defaultdict(dict)
-
-        # Handle caching logic
-        if os.path.exists(review_file_path):
-            if not self.task_cfg.use_cache:
-                # Clear existing reviews if not using cache
-                self._clear_review_files(review_file_path)
-            else:
-                # Load existing reviews if using cache
-                self._load_existing_reviews(review_file_path, review_history_map)
-
-        reviews = {}
-        for sample in tqdm(self.dataset, desc='Getting reviews'):
-            try:
-                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
-
-                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
-                    # Use cached review if available
-                    review_d = review_history_map[file_name][sample.index]
-                else:
-                    # Generate new review
-                    evaluator = self.evaluators[sample.dataset_name]
-                    review_d = evaluator.get_review(answers[sample.index])
-                    # Only save the review if it's not in the cache
-                    self._save_review(review_file_path, file_name, review_d)
-
-                reviews[sample.index] = review_d
-            except Exception as e:
-                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')
-
-        return reviews
-
-    def _clear_review_files(self, review_file_path: str) -> None:
-        """Clear existing review files."""
-        if os.path.isdir(review_file_path):
-            for filename in os.listdir(review_file_path):
-                file_path = os.path.join(review_file_path, filename)
-                try:
-                    if os.path.isfile(file_path):
-                        os.remove(file_path)
-                except Exception as e:
-                    logger.error(f'Error deleting file {file_path}: {e}')
-        else:
-            os.remove(review_file_path)
-
-    def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
-        """Load existing reviews from files."""
-        logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
-        if os.path.isdir(review_file_path):
-            for filename in os.listdir(review_file_path):
-                if '.ipynb_checkpoints' in filename:
-                    continue
-                file_path = os.path.join(review_file_path, filename)
-                with open(file_path, 'r') as f:
-                    review_history = [json.loads(line.strip()) for line in f]
-                review_history_map[filename] = {item['index']: item for item in review_history}
-
-    def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
-        """Save a single review to file."""
-        file_path = os.path.join(review_file_path, file_name)
-        dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
-
-    def get_scores(self, reviews) -> float:
-        scores = defaultdict(dict)
-        for sample in tqdm(self.dataset, desc='Getting scores'):
-            evaluator = self.evaluators[sample.dataset_name]
-            if sample.index not in reviews:
-                continue
-            review_d = reviews[sample.index]
-            score = evaluator.get_score(review_d)
-            scores[sample.index] = score
-
-        return scores
-
-    def eval(self, **kwargs):
-        answers = self.get_answers()
-        reviews = self.get_reviews(answers)
-        scores = self.get_scores(reviews)
-        report = self.get_report(scores)
-        return report
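The removed `EvaluatorCollection.get_report` built its subset/dataset/task/tag/category tables with the small `aggregate_and_sort` pandas helper shown above. A toy reproduction on hypothetical scores (the dataset and metric names are invented for illustration; the real frame is produced by `get_dataframe`):

```python
# Toy version of aggregate_and_sort() from the deleted collections/evaluator.py.
import pandas as pd

df = pd.DataFrame([
    {'task_type': 'qa', 'metric': 'acc', 'dataset_name': 'arc', 'score': 1.0},
    {'task_type': 'qa', 'metric': 'acc', 'dataset_name': 'arc', 'score': 0.0},
    {'task_type': 'math', 'metric': 'acc', 'dataset_name': 'gsm8k', 'score': 1.0},
])

report = (
    df.groupby(['task_type', 'metric'])
    .agg(average_score=('score', 'mean'), count=('score', 'size'))  # mean score + sample count per group
    .reset_index()
    .sort_values(by='count', ascending=False)                        # largest groups first
    .to_dict(orient='records')
)
print(report)
# -> [{'task_type': 'qa', 'metric': 'acc', 'average_score': 0.5, 'count': 2},
#     {'task_type': 'math', 'metric': 'acc', 'average_score': 1.0, 'count': 1}]
```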