evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
--- a/evalscope/benchmarks/gsm8k/gsm8k.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-"""Grade School Math 8k dataset."""
-
-import datasets
-import json
-import textwrap
-
-_CITATION = """\
-@misc{cobbe2021training,
-    title={Training Verifiers to Solve Math Word Problems},
-    author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
-    year={2021},
-    eprint={2110.14168},
-    archivePrefix={arXiv},
-    primaryClass={cs.LG}
-}
-"""
-
-_DESCRIPTION = """\
-GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality
-linguistically diverse grade school math word problems. The
-dataset was created to support the task of question answering
-on basic mathematical problems that require multi-step reasoning.
-"""
-
-_HOMEPAGE = 'https://openai.com/blog/grade-school-math'
-_MODELSCOPE_PAGE = 'https://modelscope.cn/datasets/modelscope/gsm8k/summary'
-
-_LICENSE = 'MIT'
-
-# _BASE_URL = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/"
-TRAIN_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/train.jsonl'
-TEST_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/test.jsonl'
-
-
-class Gsm8kConfig(datasets.BuilderConfig):
-    """BuilderConfig for GSM8K."""
-
-    def __init__(self, urls, **kwargs):
-        """BuilderConfig for GSM8K.
-        Args:
-            urls: *dict[string]*, the urls for each split of the GSM8k set.
-        """
-        super().__init__(version=datasets.Version('1.1.0'), **kwargs)
-        self.urls = urls
-
-
-class Gsm8k(datasets.GeneratorBasedBuilder):
-    """Grade School Math 8k (GSM8K)"""
-
-    BUILDER_CONFIGS = [
-        Gsm8kConfig(
-            name='main',
-            description=textwrap.dedent(
-                """
-                It is segmented into 7.5K training problems and 1K test problems.
-                These problems take between 2 and 8 steps to solve, and solutions
-                primarily involve performing a sequence of elementary calculations
-                using basic arithmetic operations (+ - / *) to reach the final
-                answer. A bright middle school student should be able to solve
-                every problem.
-                """, ),
-            urls={
-                'train': TRAIN_URL,
-                'test': TEST_URL,
-            },
-        ),
-    ]
-
-    def _info(self):
-        features = datasets.Features({
-            'question': datasets.Value('string'),
-            'answer': datasets.Value('string'),
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(self.config.urls)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    'filepath': data_dir['train'],
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': data_dir['test'],
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        with open(filepath, encoding='utf-8') as f:
-            for key, row in enumerate(f):
-                data = json.loads(row)
-                yield key, {
-                    'question': data['question'],
-                    'answer': data['answer'],
-                }
--- a/evalscope/benchmarks/hellaswag/hellaswag.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-"""HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI.
-A paper was published at ACL2019.
-"""
-"""DO NOT EDIT."""
-
-import datasets
-import json
-
-# flake8: noqa
-
-# HomePage: https://rowanzellers.com/hellaswag/
-# GitHub: https://github.com/rowanz/hellaswag
-
-_CITATION = """\
-@inproceedings{zellers2019hellaswag,
-    title={HellaSwag: Can a Machine Really Finish Your Sentence?},
-    author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
-    booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
-    year={2019}
-}
-"""
-
-_DESCRIPTION = """
-HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.
-"""
-_URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/hellaswag/'
-_URLS = {
-    'train': _URL + 'hellaswag_train.jsonl',
-    'test': _URL + 'hellaswag_test.jsonl',
-    'dev': _URL + 'hellaswag_val.jsonl',
-}
-
-
-class Hellaswag(datasets.GeneratorBasedBuilder):
-    """TODO(hellaswag): Short description of my dataset."""
-
-    # TODO(hellaswag): Set up version.
-    VERSION = datasets.Version('0.1.0')
-
-    def _info(self):
-        # TODO(hellaswag): Specifies the datasets.DatasetInfo object
-        return datasets.DatasetInfo(
-            # This is the description that will appear on the datasets page.
-            description=_DESCRIPTION,
-            # datasets.features.FeatureConnectors
-            features=datasets.Features({
-                # These are the features of your dataset like images, labels ...
-                'ind': datasets.Value('int32'),
-                'activity_label': datasets.Value('string'),
-                'ctx_a': datasets.Value('string'),
-                'ctx_b': datasets.Value('string'),
-                'ctx': datasets.Value('string'),
-                'endings': datasets.features.Sequence(datasets.Value('string')),
-                'source_id': datasets.Value('string'),
-                'split': datasets.Value('string'),
-                'split_type': datasets.Value('string'),
-                'label': datasets.Value('string'),
-            }),
-            # If there's a common (input, target) tuple from the features,
-            # specify them here. They'll be used if as_supervised=True in
-            # builder.as_dataset.
-            supervised_keys=None,
-            # Homepage of the dataset for documentation
-            homepage='https://rowanzellers.com/hellaswag/',
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        # TODO(hellaswag): Downloads the data and defines the splits
-        # dl_manager is a datasets.download.DownloadManager that can be used to
-        # download and extract URLs
-        urls_to_download = _URLS
-        dl_dir = dl_manager.download_and_extract(urls_to_download)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={'filepath': dl_dir['train']},
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={'filepath': dl_dir['test']},
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={'filepath': dl_dir['dev']},
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        # TODO(hellaswag): Yields (key, example) tuples from the dataset
-        with open(filepath, encoding='utf-8') as f:
-            for id_, row in enumerate(f):
-                data = json.loads(row)
-                yield id_, {
-                    'ind': int(data['ind']),
-                    'activity_label': data['activity_label'],
-                    'ctx_a': data.get('ctx_a', ''),
-                    'ctx_b': data.get('ctx_b', ''),
-                    'ctx': data['ctx'],
-                    'endings': data.get('endings', []),
-                    'source_id': data['source_id'],
-                    'split': data['split'],
-                    'split_type': data['split_type'],
-                    'label': str(data.get('label', '')),
-                }
--- a/evalscope/benchmarks/humaneval/humaneval.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import datasets
-import json
-
-# flake8: noqa
-
-# NOTE: AUTOGENERATED, DO NOT CHANGE.
-
-_DESCRIPTION = """\
-The HumanEval dataset released by OpenAI contains 164 handcrafted programming challenges together with unittests to very the viability of a proposed solution.
-"""
-
-# _URL = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz"
-_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/humaneval/HumanEval.jsonl.gz'
-
-_CITATION = """\
-@misc{chen2021evaluating,
-    title={Evaluating Large Language Models Trained on Code},
-    author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
-    year={2021},
-    eprint={2107.03374},
-    archivePrefix={arXiv},
-    primaryClass={cs.LG}
-}"""
-
-_HOMEPAGE = 'https://github.com/openai/human-eval'
-
-_LICENSE = 'MIT'
-
-
-class OpenaiHumaneval(datasets.GeneratorBasedBuilder):
-    """HumanEval: A benchmark for code generation."""
-
-    VERSION = datasets.Version('1.0.0')
-
-    BUILDER_CONFIGS = [
-        datasets.BuilderConfig(
-            name='openai_humaneval',
-            version=datasets.Version('1.0.0'),
-            description=_DESCRIPTION,
-        )
-    ]
-
-    def _info(self):
-        features = datasets.Features({
-            'task_id': datasets.Value('string'),
-            'prompt': datasets.Value('string'),
-            'canonical_solution': datasets.Value('string'),
-            'test': datasets.Value('string'),
-            'entry_point': datasets.Value('string'),
-        })
-
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            supervised_keys=None,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        """Returns SplitGenerators."""
-        data_dir = dl_manager.download_and_extract(_URL)
-        return [datasets.SplitGenerator(
-            name=datasets.Split.TEST,
-            gen_kwargs={
-                'filepath': data_dir,
-            },
-        )]
-
-    def _generate_examples(self, filepath):
-        """Yields examples."""
-        with open(filepath, encoding='utf-8') as file:
-            data = [json.loads(line) for line in file]
-            id_ = 0
-            for sample in data:
-                yield id_, sample
-                id_ += 1
--- a/evalscope/benchmarks/mmlu/mmlu.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# isort: skip_file
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# flake8: noqa
-
-import datasets
-import os
-import pandas as pd
-"""The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""
-
-_CITATION = """\
-@article{hendryckstest2021,
-    title={Measuring Massive Multitask Language Understanding},
-    author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
-    journal={Proceedings of the International Conference on Learning Representations (ICLR)},
-    year={2021}
-}
-"""
-
-_DESCRIPTION = """\
-Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas
-Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
-"""
-
-_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/mmlu/summary'
-
-_LICENSE = 'MIT'
-
-# _URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
-_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/mmlu/repo?Revision=master&FilePath=data.tar'
-
-task_list = [
-    'high_school_european_history',
-    'business_ethics',
-    'clinical_knowledge',
-    'medical_genetics',
-    'high_school_us_history',
-    'high_school_physics',
-    'high_school_world_history',
-    'virology',
-    'high_school_microeconomics',
-    'econometrics',
-    'college_computer_science',
-    'high_school_biology',
-    'abstract_algebra',
-    'professional_accounting',
-    'philosophy',
-    'professional_medicine',
-    'nutrition',
-    'global_facts',
-    'machine_learning',
-    'security_studies',
-    'public_relations',
-    'professional_psychology',
-    'prehistory',
-    'anatomy',
-    'human_sexuality',
-    'college_medicine',
-    'high_school_government_and_politics',
-    'college_chemistry',
-    'logical_fallacies',
-    'high_school_geography',
-    'elementary_mathematics',
-    'human_aging',
-    'college_mathematics',
-    'high_school_psychology',
-    'formal_logic',
-    'high_school_statistics',
-    'international_law',
-    'high_school_mathematics',
-    'high_school_computer_science',
-    'conceptual_physics',
-    'miscellaneous',
-    'high_school_chemistry',
-    'marketing',
-    'professional_law',
-    'management',
-    'college_physics',
-    'jurisprudence',
-    'world_religions',
-    'sociology',
-    'us_foreign_policy',
-    'high_school_macroeconomics',
-    'computer_security',
-    'moral_scenarios',
-    'moral_disputes',
-    'electrical_engineering',
-    'astronomy',
-    'college_biology',
-]
-
-
-class MMLUConfig(datasets.BuilderConfig):
-
-    def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
-class MMLU(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [MMLUConfig(name=task_name, ) for task_name in task_list]
-
-    def _info(self):
-        features = datasets.Features({
-            'input': datasets.Value('string'),
-            'A': datasets.Value('string'),
-            'B': datasets.Value('string'),
-            'C': datasets.Value('string'),
-            'D': datasets.Value('string'),
-            'target': datasets.Value('string'),
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(_URL)
-        task_name = self.config.name
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'data', 'test', f'{task_name}_test.csv'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'data', 'val', f'{task_name}_val.csv'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, 'data', 'dev', f'{task_name}_dev.csv'),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        df = pd.read_csv(filepath)
-        df.columns = ['input', 'A', 'B', 'C', 'D', 'target']
-
-        for i, instance in enumerate(df.to_dict(orient='records')):
-            yield i, instance
--- a/evalscope/benchmarks/mmlu/samples.jsonl
+++ /dev/null
@@ -1,5 +0,0 @@
-{'input': 'A "dished face" profile is often associated with', 'A': 'a protruding mandible due to reactivation of the condylar cartilage by acromegaly.', 'B': 'a recessive maxilla due to failure of elongation of the cranial base.', 'C': 'an enlarged frontal bone due to hydrocephaly.', 'D': 'defective development of the maxillary air sinus.', 'target': 'B'}
-{'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.', 'A': 'Service quality.', 'B': 'Service action.', 'C': 'Service recovery.', 'D': 'Service satisfaction.', 'target': 'A'}
-{'input': ' Information collected for the first time specifically for a marketing research study is called:', 'A': 'Secondary research.', 'B': 'Primary research.', 'C': 'Soft research.', 'D': 'Experimental research.', 'target': 'B'}
-{'input': "This includes advertisements that contain 'call-to-response' mechanisms such as telephone numbers, website addresses, email and postal addresses:", 'A': 'Direct response advertising.', 'B': 'Sales promotions.', 'C': 'Mass media advertising.', 'D': 'Public relations.', 'target': 'A'}
-{'input': 'Which of the following is not part of the external marketing environment?', 'A': 'Political.', 'B': 'Legal.', 'C': 'Product.', 'D': 'Socio-cultural.', 'target': 'C'}
--- a/evalscope/benchmarks/process_bench/critique_template.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
-
-[Math Problem]
-
-{problem}
-
-[Solution]
-
-{tagged_response}
-
-Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").
-
-Please put your final answer (i.e., the index) in \boxed{{}}.
--- a/evalscope/benchmarks/race/race.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import datasets
-import os
-import pandas as pd
-
-_CITATION = """\
-@inproceedings{lai-etal-2017-race,
-    title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
-    author = "Lai, Guokun and
-      Xie, Qizhe and
-      Liu, Hanxiao and
-      Yang, Yiming and
-      Hovy, Eduard",
-    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
-    month = sep,
-    year = "2017",
-    address = "Copenhagen, Denmark",
-    publisher = "Association for Computational Linguistics",
-    url = "https://aclanthology.org/D17-1082",
-    doi = "10.18653/v1/D17-1082",
-    pages = "785--794",
-}
-"""
-
-_DESCRIPTION = """\
-RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
-"""
-
-_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'
-
-_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'
-
-task_list = [
-    'high',
-    'middle',
-]
-
-
-class RACEConfig(datasets.BuilderConfig):
-
-    def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
-class RACE(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [RACEConfig(name=task_name, ) for task_name in task_list]
-
-    def _info(self):
-        features = datasets.Features({
-            'example_id': datasets.Value('string'),
-            'article': datasets.Value('string'),
-            'answer': datasets.Value('string'),
-            'question': datasets.Value('string'),
-            'options': [datasets.Value('string')],
-        })
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            citation=_CITATION,
-        )
-
-    def _split_generators(self, dl_manager):
-        data_dir = dl_manager.download_and_extract(_URL)
-        task_name = self.config.name
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.VALIDATION,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
-                },
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
-                },
-            ),
-        ]
-
-    def _generate_examples(self, filepath):
-        df = pd.read_parquet(filepath)
-        df.columns = ['example_id', 'article', 'answer', 'question', 'options']
-
-        for i, instance in enumerate(df.to_dict(orient='records')):
-            yield i, instance
--- a/evalscope/benchmarks/race/samples.jsonl
+++ /dev/null
@@ -1,5 +0,0 @@
-{'example_id': 'middle4227.txt', 'article': 'There are many kinds...ealthy.\n,.', 'answer': 'D', 'question': 'We may read this pas... in _ .', 'options': ['a letter', 'a story', 'a newspaper', 'a health magazine']}
-{'example_id': 'middle3329.txt', 'article': 'Do you know why diff...ng at all.', 'answer': 'B', 'question': 'Those pests with dif...of danger.', 'options': ['change their colours', 'hide in the day time...r at night', 'move quietly', 'hide at night and ap...e day time']}
-{'example_id': 'middle3614.txt', 'article': 'The seahorse is a ve...o the sea.', 'answer': 'B', 'question': 'A seahorse eats _ .', 'options': ['sea weed', 'small fish', 'water', 'nothing']}
-{'example_id': 'middle6632.txt', 'article': 'Kids have unbelievab...h at her."', 'answer': 'D', 'question': 'Which is NOT mention...e passage?', 'options': ['Robots keep secrets.', 'Robots give suggestions.', 'Robots do chores.', 'Robots make movies.']}
-{'example_id': 'middle3503.txt', 'article': 'Have you ever heard ...eir lives.', 'answer': 'B', 'question': 'Which of the followi...lue moon"?', 'options': ['Simon often tells jo...blue moon.', 'Tom rarely remembers...blue moon.', 'Mary likes to go sho...blue moon.', 'Cindy hates to stay ...blue moon.']}