evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0

evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py (deleted)
@@ -1,58 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='genai_bench',
-     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['GenAI-Bench-1600'],
-     metric_list=['VQAScore'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class GenAIBenchAdapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load(self, **kwargs) -> dict:
-         if os.path.isfile(self.dataset_id):
-             data_list = jsonl_to_list(self.dataset_id)
-             data_dict = {self.subset_list[0]: {'test': data_list}}
-             return data_dict
-         else:
-             return super().load(**kwargs)
-
-     def get_gold_answer(self, input_d: dict) -> dict:
-         # return prompt and elements dict
-         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-     def match(self, gold: dict, pred: str) -> dict:
-         # dummy match for general t2i
-         # pred is the image path, gold is the prompt
-         res = {}
-         for metric_name, metric_func in self.metrics.items():
-             score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-             res[metric_name] = score.cpu().item()
-
-             # fine-granular metrics
-             if gold['tags'].get('advanced'):
-                 res[f'{metric_name}_advanced'] = score.cpu().item()
-             else:
-                 res[f'{metric_name}_basic'] = score.cpu().item()
-
-         return res

evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py (deleted)
@@ -1,58 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='general_t2i',
-     dataset_id='general_t2i',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['default'],
-     metric_list=['PickScore'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class GeneralT2IAdapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-
-         super().__init__(**kwargs)
-
-     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
-         dataset_name_or_path = dataset_name_or_path or self.dataset_id
-         subset_list = subset_list or self.subset_list
-
-         data_file_dict = defaultdict(str)
-         data_item_dict = defaultdict(list)
-
-         # get data file path and subset name
-         if os.path.isdir(dataset_name_or_path):
-             for subset_name in subset_list:
-                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
-         elif os.path.isfile(dataset_name_or_path):
-             cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
-             data_file_dict[cur_subset_name] = dataset_name_or_path
-         else:
-             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
-
-         # load data from local disk
-         try:
-             for subset_name, file_path in data_file_dict.items():
-                 data_item_dict[subset_name] = jsonl_to_list(file_path)
-         except Exception as e:
-             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
-
-         data_dict = {subset_name: {'test': data_item_dict[subset_name]} for subset_name in data_file_dict.keys()}
-
-         return data_dict

evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py (deleted)
@@ -1,57 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='hpdv2',
-     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['HPDv2'],
-     metric_list=['HPSv2.1Score'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class HPDv2Adapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load(self, **kwargs) -> dict:
-         if os.path.isfile(self.dataset_id):
-             data_list = jsonl_to_list(self.dataset_id)
-             data_dict = {self.subset_list[0]: {'test': data_list}}
-             return data_dict
-         else:
-             return super().load(**kwargs)
-
-     def get_gold_answer(self, input_d: dict) -> dict:
-         # return prompt and elements dict
-         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-     def match(self, gold: dict, pred: str) -> dict:
-         # dummy match for general t2i
-         # pred is the image path, gold is the prompt
-         res = {}
-         for metric_name, metric_func in self.metrics.items():
-             score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-
-             res[metric_name] = score.cpu().item()
-
-             # fine-granular metrics
-             category = gold['tags'].get('category')
-             if category:
-                 res[f'{metric_name}_{category}'] = score.cpu().item()
-
-         return res

evalscope/benchmarks/aigc/t2i/tifa_adapter.py (deleted)
@@ -1,37 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
-     name='tifa160',
-     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-     model_adapter=OutputType.IMAGE_GENERATION,
-     output_types=[OutputType.IMAGE_GENERATION],
-     subset_list=['TIFA-160'],
-     metric_list=['PickScore'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
- )
- class TIFA_Adapter(T2IBaseAdapter):
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-
-     def load(self, **kwargs) -> dict:
-         if os.path.isfile(self.dataset_id):
-             data_list = jsonl_to_list(self.dataset_id)
-             data_dict = {self.subset_list[0]: {'test': data_list}}
-             return data_dict
-         else:
-             return super().load(**kwargs)

evalscope/benchmarks/arc/ai2_arc.py (deleted)
@@ -1,151 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright (c) Allen Institute, and its affiliates.
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- """AI2 ARC (Abstraction and Reasoning Corpus) for General Artificial Intelligence Benchmark."""
- """AUTO GENERATED, DO NOT EDIT"""
-
- import datasets
- import json
- import os
-
- # flake8: noqa
-
- _CITATION = """\
- @article{allenai:arc,
- author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and
- Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
- title = {Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
- journal = {arXiv:1803.05457v1},
- year = {2018},
- }
- """
-
- _DESCRIPTION = """\
- A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in
- advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains
- only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also
- including a corpus of over 14 million science sentences relevant to the task,
- and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.
-
- ARC-Easy:
- train: 2251
- test: 2376
- validation: 570
-
- ARC-Challenge:
- train: 1119
- test: 1172
- validation: 299
- """
-
- _URL = 'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/arc/ARC-V1-Feb2018.zip'
-
- # tasks: ['ARC-Easy', 'ARC-Challenge']
-
-
- class Ai2ArcConfig(datasets.BuilderConfig):
-     """BuilderConfig for Ai2ARC."""
-
-     def __init__(self, **kwargs):
-         """BuilderConfig for Ai2Arc.
-
-         Args:
-             **kwargs: keyword arguments forwarded to super.
-         """
-         super(Ai2ArcConfig, self).__init__(version=datasets.Version('1.0.0', ''), **kwargs)
-
-
- class Ai2Arc(datasets.GeneratorBasedBuilder):
-     """
-     The AI2 Reasoning Challenge (ARC) dataset.
-     Subset: ARC-Easy, ARC-Challenge.
-     """
-
-     VERSION = datasets.Version('1.0.0')
-     BUILDER_CONFIGS = [
-         Ai2ArcConfig(
-             name='ARC-Challenge',
-             description="""\
- Challenge Set of 2590 “hard” questions (those that both a retrieval and a co-occurrence method fail to answer correctly)
- """,
-         ),
-         Ai2ArcConfig(
-             name='ARC-Easy',
-             description="""\
- Easy Set of 5197 questions
- """,
-         ),
-     ]
-
-     def _info(self):
-         return datasets.DatasetInfo(
-             # This is the description that will appear on the datasets page.
-             description=_DESCRIPTION,
-             # datasets.features.FeatureConnectors
-             features=datasets.Features({
-                 'id':
-                 datasets.Value('string'),
-                 'question':
-                 datasets.Value('string'),
-                 'choices':
-                 datasets.features.Sequence({
-                     'text': datasets.Value('string'),
-                     'label': datasets.Value('string')
-                 }),
-                 'answerKey':
-                 datasets.Value('string')
-                 # These are the features of your dataset like images, labels ...
-             }),
-             # If there's a common (input, target) tuple from the features,
-             # specify them here. They'll be used if as_supervised=True in
-             # builder.as_dataset.
-             supervised_keys=None,
-             # Homepage of the dataset for documentation
-             homepage='https://allenai.org/data/arc',
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         """Returns SplitGenerators."""
-         # dl_manager is a datasets.download.DownloadManager that can be used to
-         # download and extract URLs
-         dl_dir = dl_manager.download_and_extract(_URL)
-         data_dir = os.path.join(dl_dir, 'ARC-V1-Feb2018-2')
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TRAIN,
-                 # These kwargs will be passed to _generate_examples
-                 gen_kwargs={'filepath': os.path.join(data_dir, self.config.name, self.config.name + '-Train.jsonl')},
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split.TEST,
-                 # These kwargs will be passed to _generate_examples
-                 gen_kwargs={'filepath': os.path.join(data_dir, self.config.name, self.config.name + '-Test.jsonl')},
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split.VALIDATION,
-                 # These kwargs will be passed to _generate_examples
-                 gen_kwargs={'filepath': os.path.join(data_dir, self.config.name, self.config.name + '-Dev.jsonl')},
-             ),
-         ]
-
-     def _generate_examples(self, filepath):
-         """Yields examples."""
-         with open(filepath, encoding='utf-8') as f:
-             for row in f:
-                 data = json.loads(row)
-                 answerkey = data['answerKey']
-                 id_ = data['id']
-                 question = data['question']['stem']
-                 choices = data['question']['choices']
-                 text_choices = [choice['text'] for choice in choices]
-                 label_choices = [choice['label'] for choice in choices]
-                 yield id_, {
-                     'id': id_,
-                     'answerKey': answerkey,
-                     'question': question,
-                     'choices': {
-                         'text': text_choices,
-                         'label': label_choices
-                     },
-                 }
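
The deleted ai2_arc.py above is a standard Hugging Face datasets GeneratorBasedBuilder: it downloads the ARC archive and yields one record per question. A minimal sketch of how such a script-based builder is typically loaded (the path is illustrative, and depending on the installed datasets version script loading may require trust_remote_code=True or may no longer be supported):

import datasets

# Materialize the ARC-Challenge config defined by the deleted ai2_arc.py builder script.
arc = datasets.load_dataset(
    'evalscope/benchmarks/arc/ai2_arc.py',  # illustrative path to a local copy of the script
    'ARC-Challenge',
    split='test',
    trust_remote_code=True,
)
print(arc[0]['question'], arc[0]['choices']['label'])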

evalscope/benchmarks/benchmark.py (deleted)
@@ -1,81 +0,0 @@
- import copy
- from collections import OrderedDict
- from dataclasses import dataclass, field, fields
- from typing import TYPE_CHECKING, Dict, List, Optional
-
- from evalscope.constants import OutputType
-
- if TYPE_CHECKING:
-     from evalscope.benchmarks import DataAdapter
-
- BENCHMARK_MAPPINGS = {}
-
-
- @dataclass
- class BenchmarkMeta:
-     name: str
-     dataset_id: str
-     data_adapter: 'DataAdapter'
-     model_adapter: Optional[str] = OutputType.GENERATION
-     output_types: Optional[List[str]] = field(default_factory=lambda: [OutputType.GENERATION])
-     subset_list: List[str] = field(default_factory=lambda: ['default'])
-     metric_list: List[str] = field(default_factory=list)
-     few_shot_num: int = 0
-     few_shot_random: bool = False
-     train_split: Optional[str] = None
-     eval_split: Optional[str] = None
-     prompt_template: Optional[str] = None
-     system_prompt: Optional[str] = None
-     query_template: Optional[str] = None
-     pretty_name: Optional[str] = None
-     description: Optional[str] = None
-     tags: Optional[List[str]] = field(default_factory=list)
-     filters: Optional[OrderedDict] = None
-     extra_params: Optional[Dict] = field(default_factory=dict)
-
-     def _update(self, args: dict):
-         if args.get('local_path'):
-             self.dataset_id = args['local_path']
-             del args['local_path']
-         self.__dict__.update(args)
-
-     def to_dict(self) -> dict:
-         return self.__dict__
-
-     def to_string_dict(self) -> dict:
-         cur_dict = copy.deepcopy(self.to_dict())
-         # cur_dict['data_adapter'] = self.data_adapter.__name__
-         del cur_dict['data_adapter']
-         return cur_dict
-
-     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
-         if config:
-             self._update(config)
-
-         data_adapter = self.data_adapter(**self.to_dict())
-         return data_adapter
-
-
- class Benchmark:
-
-     def __init__(self):
-         pass
-
-     @classmethod
-     def get(cls, name: str) -> 'BenchmarkMeta':
-         if name not in BENCHMARK_MAPPINGS:
-             raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
-         benchmark = BENCHMARK_MAPPINGS[name]
-         return benchmark
-
-     @classmethod
-     def register(cls, name: str, dataset_id: str, **kwargs):
-
-         def register_wrapper(data_adapter):
-             if name in BENCHMARK_MAPPINGS:
-                 raise Exception(f'Benchmark {name} already registered')
-             BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
-                 name=name, data_adapter=data_adapter, dataset_id=dataset_id, **kwargs)
-             return data_adapter
-
-         return register_wrapper
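
The removed benchmark.py above implemented the old registration flow: a data adapter class decorated with @Benchmark.register(...) (as in the T2I adapters earlier in this diff) was stored in BENCHMARK_MAPPINGS as a BenchmarkMeta, and callers retrieved it by name to build the adapter. A minimal sketch of how that old API was consumed (the benchmark name 'gsm8k' is illustrative):

from evalscope.benchmarks import Benchmark

# Look up the registered BenchmarkMeta; raises if the name was never registered.
meta = Benchmark.get('gsm8k')

# Instantiate the stored adapter class with the metadata fields as keyword arguments.
adapter = meta.get_data_adapter(config={})
# The adapter then exposes the old DataAdapter interface (load, get_gold_answer, match, ...).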

evalscope/benchmarks/ceval/ceval_exam.py (deleted)
@@ -1,146 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License
- import datasets
- import os
- import pandas as pd
-
- # flake8: noqa
- """DO NOT EDIT unless you are contributing a new dataset."""
-
- _CITATION = """\
- @article{huang2023ceval,
- title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
- author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
- journal={arXiv preprint arXiv:2305.08322},
- year={2023}
- }
- """
-
- _DESCRIPTION = """\
- C-Eval is a comprehensive Chinese evaluation suite for foundation models. It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.
- """
-
- _HOMEPAGE = 'https://cevalbenchmark.com'
-
- _LICENSE = 'Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License'
-
- _URL = r'https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/c-eval/ceval-exam.zip'
-
- task_list = [
-     'computer_network',
-     'operating_system',
-     'computer_architecture',
-     'college_programming',
-     'college_physics',
-     'college_chemistry',
-     'advanced_mathematics',
-     'probability_and_statistics',
-     'discrete_mathematics',
-     'electrical_engineer',
-     'metrology_engineer',
-     'high_school_mathematics',
-     'high_school_physics',
-     'high_school_chemistry',
-     'high_school_biology',
-     'middle_school_mathematics',
-     'middle_school_biology',
-     'middle_school_physics',
-     'middle_school_chemistry',
-     'veterinary_medicine',
-     'college_economics',
-     'business_administration',
-     'marxism',
-     'mao_zedong_thought',
-     'education_science',
-     'teacher_qualification',
-     'high_school_politics',
-     'high_school_geography',
-     'middle_school_politics',
-     'middle_school_geography',
-     'modern_chinese_history',
-     'ideological_and_moral_cultivation',
-     'logic',
-     'law',
-     'chinese_language_and_literature',
-     'art_studies',
-     'professional_tour_guide',
-     'legal_professional',
-     'high_school_chinese',
-     'high_school_history',
-     'middle_school_history',
-     'civil_servant',
-     'sports_science',
-     'plant_protection',
-     'basic_medicine',
-     'clinical_medicine',
-     'urban_and_rural_planner',
-     'accountant',
-     'fire_engineer',
-     'environmental_impact_assessment_engineer',
-     'tax_accountant',
-     'physician',
- ]
-
-
- class CevalExamConfig(datasets.BuilderConfig):
-
-     def __init__(self, **kwargs):
-         super().__init__(version=datasets.Version('1.0.0'), **kwargs)
-
-
- class CevalExam(datasets.GeneratorBasedBuilder):
-     BUILDER_CONFIGS = [CevalExamConfig(name=task_name, ) for task_name in task_list]
-
-     def _info(self):
-         features = datasets.Features({
-             'id': datasets.Value('int32'),
-             'question': datasets.Value('string'),
-             'A': datasets.Value('string'),
-             'B': datasets.Value('string'),
-             'C': datasets.Value('string'),
-             'D': datasets.Value('string'),
-             'answer': datasets.Value('string'),
-             'explanation': datasets.Value('string'),
-         })
-         return datasets.DatasetInfo(
-             description=_DESCRIPTION,
-             features=features,
-             homepage=_HOMEPAGE,
-             license=_LICENSE,
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         data_dir = dl_manager.download_and_extract(_URL)
-         task_name = self.config.name
-         return [
-             datasets.SplitGenerator(
-                 name=datasets.Split.TEST,
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'test', f'{task_name}_test.csv'),
-                 },
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split('val'),
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'val', f'{task_name}_val.csv'),
-                 },
-             ),
-             datasets.SplitGenerator(
-                 name=datasets.Split('dev'),
-                 gen_kwargs={
-                     'filepath': os.path.join(data_dir, 'dev', f'{task_name}_dev.csv'),
-                 },
-             ),
-         ]
-
-     def _generate_examples(self, filepath):
-         df = pd.read_csv(filepath, encoding='utf-8')
-         for i, instance in enumerate(df.to_dict(orient='records')):
-             if 'answer' not in instance.keys():
-                 instance['answer'] = ''
-             if 'explanation' not in instance.keys():
-                 instance['explanation'] = ''
-             yield i, instance