evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
# flake8: noqa
|
|
16
|
-
|
|
17
|
-
import datasets
|
|
18
|
-
import os
|
|
19
|
-
import pandas as pd
|
|
20
|
-
|
|
21
|
-
# BibTeX entry for the CMMLU paper.
_CITATION = """\
@misc{li2023cmmlu,
title={CMMLU: Measuring massive multitask language understanding in Chinese},
author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
year={2023},
eprint={2306.09212},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Short human-readable summary of the benchmark.
_DESCRIPTION = """\
CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context.
"""

# Dataset landing page on ModelScope.
_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/cmmlu/summary'

# Archive that is downloaded and extracted by the builder.
# Previous HuggingFace location, kept for reference:
#   https://huggingface.co/datasets/haonan-li/cmmlu/resolve/main/cmmlu_v1_0_1.zip
_URL = r'https://modelscope.cn/api/v1/datasets/modelscope/cmmlu/repo?Revision=master&FilePath=cmmlu_v1_0_1.zip'

# Names of the 67 sub-tasks; each name is also the stem of a per-task CSV file.
task_list = [
    'agronomy',
    'anatomy',
    'ancient_chinese',
    'arts',
    'astronomy',
    'business_ethics',
    'chinese_civil_service_exam',
    'chinese_driving_rule',
    'chinese_food_culture',
    'chinese_foreign_policy',
    'chinese_history',
    'chinese_literature',
    'chinese_teacher_qualification',
    'clinical_knowledge',
    'college_actuarial_science',
    'college_education',
    'college_engineering_hydrology',
    'college_law',
    'college_mathematics',
    'college_medical_statistics',
    'college_medicine',
    'computer_science',
    'computer_security',
    'conceptual_physics',
    'construction_project_management',
    'economics',
    'education',
    'electrical_engineering',
    'elementary_chinese',
    'elementary_commonsense',
    'elementary_information_and_technology',
    'elementary_mathematics',
    'ethnology',
    'food_science',
    'genetics',
    'global_facts',
    'high_school_biology',
    'high_school_chemistry',
    'high_school_geography',
    'high_school_mathematics',
    'high_school_physics',
    'high_school_politics',
    'human_sexuality',
    'international_law',
    'journalism',
    'jurisprudence',
    'legal_and_moral_basis',
    'logical',
    'machine_learning',
    'management',
    'marketing',
    'marxist_theory',
    'modern_chinese',
    'nutrition',
    'philosophy',
    'professional_accounting',
    'professional_law',
    'professional_medicine',
    'professional_psychology',
    'public_relations',
    'security_study',
    'sociology',
    'sports_science',
    'traditional_chinese_medicine',
    'virology',
    'world_history',
    'world_religions',
]
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
class CMMLUConfig(datasets.BuilderConfig):
    """Builder configuration for a CMMLU sub-task, pinned to version 1.0.1.

    Version history:
        1.0.1 -- fix: one comma was missing in world_religions.csv
        1.0.0 -- initial version
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``datasets.BuilderConfig`` with the version fixed to 1.0.1."""
        pinned_version = datasets.Version('1.0.1')
        super().__init__(version=pinned_version, **kwargs)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
class CMMLU(datasets.GeneratorBasedBuilder):
    """Dataset builder that exposes every entry of ``task_list`` as its own configuration."""

    # One configuration per sub-task name.
    BUILDER_CONFIGS = [CMMLUConfig(name=subject) for subject in task_list]

    def _info(self):
        """Return the dataset metadata; every column is declared as a string."""
        column_names = ('Question', 'A', 'B', 'C', 'D', 'Answer')
        schema = datasets.Features({column: datasets.Value('string') for column in column_names})
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=schema,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then return the ``test`` and ``dev`` splits.

        Each split reads ``<split>/<config name>.csv`` below the extracted directory.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name

        test_csv = os.path.join(data_dir, f'test/{task_name}.csv')
        dev_csv = os.path.join(data_dir, f'dev/{task_name}.csv')

        test_split = datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={'filepath': test_csv},
        )
        dev_split = datasets.SplitGenerator(
            name=datasets.Split('dev'),
            gen_kwargs={'filepath': dev_csv},
        )
        return [test_split, dev_split]

    def _generate_examples(self, filepath):
        """Yield ``(row_position, row_dict)`` pairs for each row of the CSV at ``filepath``.

        The first CSV row is used as the header and the first column as the
        index, so that column does not appear in the yielded dicts.
        """
        frame = pd.read_csv(filepath, header=0, index_col=0, encoding='utf-8')
        records = frame.to_dict(orient='records')
        yield from enumerate(records)
|
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
{'input': '毛毛骑在牛背上过河,他共有甲、乙、丙、丁4头牛,甲过河要20分钟,乙过河要30分钟,丙过河要40分钟,丁过河要50分钟。毛毛每次只能赶2头牛过河,要把4头牛都赶到对岸去,最少要多少分钟?', 'A': '190', 'B': '180', 'C': '170', 'D': '160', 'target': 'D'}
|
|
2
|
-
{'input': '下列关于重力的说法正确的是', 'A': '在地球周围的物体都要受到重力作用,与其运动状态无关', 'B': '对某一物体而言,重力的大小是一个恒量,不随物体的地理位置而改变', 'C': '重力就是地球对物体的吸引力,重力的方向总是竖直向下', 'D': '在地球表面各处的重力方向都是相同的', 'target': 'A'}
|
|
3
|
-
{'input': '心脏的静脉血回心的主要途径是', 'A': '心小静脉', 'B': '冠状窦', 'C': '心中静脉', 'D': '心前静脉', 'target': 'B'}
|
|
4
|
-
{'input': "以西蒙为代表的决策理论学派提出的决策准则是", 'A': '最优化', 'B': '公平', 'C': '民主化', 'D': '满意', 'target': 'D'}
|
|
5
|
-
{'input': '20世纪初,英国首相阿斯奎斯说:“我们现在有一个牢固确立了两百年的传统,即归根到底,王位的占有者接受其大臣的建议并据此行事。”这一传统的确立,使一个以小农业和手工业生产为主的国家变成了一个典型的资本主义国家,成为欧洲各国效仿的对象。各国效仿的理由是', 'A': '英国“光荣革命”宣告了欧洲新社会政治制度的诞生', 'B': '殖民主义深刻影响了英国“世界工厂”的地位', 'C': '英国经济上的成就得益于其制度设计', 'D': '英国启蒙思想奠定了资产阶级民主主义政治的理论基础', 'target': 'C'}
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
"""Mathematics Aptitude Test of Heuristics (MATH) dataset."""
|
|
3
|
-
|
|
4
|
-
import datasets
|
|
5
|
-
import json
|
|
6
|
-
import os
|
|
7
|
-
|
|
8
|
-
# BibTeX entry for the MATH dataset paper.
_CITATION = """\
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks
and Collin Burns
and Saurav Kadavath
and Akul Arora
and Steven Basart
and Eric Tang
and Dawn Song
and Jacob Steinhardt},
journal={arXiv preprint arXiv:2103.03874},
year={2021}
}
"""

# Short human-readable summary of the dataset.
_DESCRIPTION = """\
The Mathematics Aptitude Test of Heuristics (MATH) dataset consists of problems
from mathematics competitions, including the AMC 10, AMC 12, AIME, and more.
Each problem in MATH has a full step-by-step solution, which can be used to teach
models to generate answer derivations and explanations.
"""

# Upstream project page and its license file.
_HOMEPAGE = 'https://github.com/hendrycks/math'

_LICENSE = 'https://github.com/hendrycks/math/blob/main/LICENSE'

# Archive that is downloaded and extracted by the builder.
# Original data URL: "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"
_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/math/MATH.zip'
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class CompetitionMathDataset(datasets.GeneratorBasedBuilder):
    """Builder for the Mathematics Aptitude Test of Heuristics (MATH) dataset."""

    VERSION = datasets.Version('1.0.0')

    def _info(self):
        """Return the dataset metadata; every field is declared as a string."""
        field_names = ('problem', 'level', 'type', 'solution')
        schema = datasets.Features({field: datasets.Value('string') for field in field_names})
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=schema,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then return the train and test splits.

        Each split receives an iterator over the files found below
        ``MATH/train`` or ``MATH/test`` in the extracted directory.
        """
        download_dir = dl_manager.download_and_extract(_URL)

        def files_under(split_folder):
            # Iterate over every file below MATH/<split_folder>.
            return dl_manager.iter_files(os.path.join(download_dir, 'MATH', split_folder))

        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={'data_dir': files_under('train')},
        )
        test_split = datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={'data_dir': files_under('test')},
        )
        return [train_split, test_split]

    def _generate_examples(self, data_dir):
        """Yield ``(index, example)`` pairs, one per JSON file in ``data_dir``.

        ``data_dir`` is the iterable of file paths built in ``_split_generators``;
        the index is the position of the file in that iterable.
        """
        for index, json_path in enumerate(data_dir):
            with open(json_path, 'rb') as handle:
                record = json.load(handle)
            yield index, record
|