evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff compares the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import OutputType
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -11,12 +11,12 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
-    dataset_id='
-    subset_list=['
+    dataset_id='opencompass/AIME2025',
+    subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
-    eval_split='
+    eval_split='test', # Only train set is available
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class AIME25Adapter(DataAdapter):
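For context, the registration above is how the benchmark is selected at run time. A minimal usage sketch, assuming evalscope's documented TaskConfig/run_task entry points; the model id and limit are placeholders, not part of this diff:

from evalscope import TaskConfig, run_task

# Placeholder model id; 'aime25' is the benchmark name registered above.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['aime25'],
    limit=5,  # evaluate only a few samples while smoke-testing
)
run_task(task_cfg=task_cfg)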

evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
@@ -3,8 +3,7 @@ from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa

evalscope/benchmarks/arc/arc_adapter.py
@@ -18,7 +18,7 @@ logger = get_logger()
     name='arc',
     pretty_name='ARC',
     dataset_id='modelscope/ai2_arc',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['ARC-Easy', 'ARC-Challenge'],
     metric_list=['AverageAccuracy'],
@@ -134,7 +134,7 @@ class ARCAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(text=result)
+            return ResponseParser.parse_first_option(text=result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
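The change above (mirrored in the HellaSwag adapter further down) passes the benchmark's answer choices into the option parser, so free-form generations are matched against the actual label set instead of any letter-like token. A rough standalone illustration of what constraining the parse to known options means; this is a stand-in for clarity, not ResponseParser's real implementation:

import re
from typing import List, Optional

def parse_first_option(text: str, options: List[str]) -> Optional[str]:
    # Match the first standalone occurrence of an allowed label (e.g. 'A'-'D'),
    # rather than the first capital letter found anywhere in the response.
    pattern = r'\b(' + '|'.join(re.escape(opt) for opt in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else None

print(parse_first_option('I think the correct answer is (B).', options=['A', 'B', 'C', 'D']))  # -> B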

evalscope/benchmarks/arena_hard/arena_hard_adapter.py
@@ -3,9 +3,7 @@ from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa

evalscope/benchmarks/ceval/ceval_adapter.py
@@ -4,7 +4,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -127,7 +127,7 @@ SUBJECT_MAPPING = {
     name='ceval',
     pretty_name='C-Eval',
     dataset_id='modelscope/ceval-exam',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
@@ -1,10 +1,8 @@
 import re
-from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa

evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -104,7 +104,7 @@ SUBJECT_MAPPING = {
     name='cmmlu',
     pretty_name='C-MMLU',
     dataset_id='modelscope/cmmlu',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -6,8 +6,7 @@ import os
 from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger

 # flake8: noqa
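The math adapters (AIME, competition math) now import their helpers from the flat evalscope.metrics namespace rather than evalscope.metrics.math_parser. A hedged usage sketch; the call signatures are assumed (extract_answer pulling the \boxed{} content, strip_answer_string normalizing, math_equal comparing) and are not spelled out in this diff:

from evalscope.metrics import extract_answer, math_equal, strip_answer_string

# Assumed signatures: extract_answer(str) -> str, strip_answer_string(str) -> str,
# math_equal(pred, gold) -> bool.
completion = 'After simplifying, the final answer is \\boxed{\\frac{1}{2}}.'
pred = strip_answer_string(extract_answer(completion))
gold = strip_answer_string('1/2')
print(math_equal(pred, gold))  # expected True if the normalizer treats 1/2 and \frac{1}{2} as equal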

evalscope/benchmarks/data_adapter.py
@@ -3,12 +3,11 @@ import os.path
 import random
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics
-from evalscope.metrics.named_metrics import metric_registry
+from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger

@@ -24,6 +23,7 @@ class DataAdapter(ABC):
                  subset_list: list,
                  metric_list: List[str],
                  llm_as_a_judge: bool = False,
+                 output_types: Optional[List[str]] = None,
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
@@ -63,6 +63,7 @@ class DataAdapter(ABC):
         self.query_template = query_template
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
+        self.output_types = output_types or [model_adapter]
         self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
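This is the mechanism behind the registration changes earlier in the diff: adapters such as ARC, C-Eval, CMMLU and General MCQ now declare model_adapter=OutputType.GENERATION while still listing MULTIPLE_CHOICE in output_types, and when output_types is not given it defaults to the single model_adapter. A minimal sketch of that defaulting rule in isolation (the string values stand in for the OutputType constants); the remaining hunks below continue in data_adapter.py:

# Illustration only: 'generation' / 'multiple_choice' are placeholders for OutputType members.
def resolve_output_types(model_adapter: str, output_types=None):
    # Fall back to the registered model_adapter when no explicit list is provided.
    return output_types or [model_adapter]

print(resolve_output_types('generation'))                                      # ['generation']
print(resolve_output_types('generation', ['multiple_choice', 'generation']))   # explicit list kept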
@@ -190,7 +191,7 @@ class DataAdapter(ABC):
         if self.few_shot_num and self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')

-        logger.info(f'Use
+        logger.info(f'Use settings: '
                     f'> few_shot_num: {self.few_shot_num}, '
                     f'> few_shot_split: {self.train_split}, '
                     f'> target_eval_split: {self.eval_split}')
@@ -245,7 +246,8 @@ class DataAdapter(ABC):
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list

-    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+                            **kwargs) -> Dict[str, List[float]]:
         """
         compute weighted mean of the bleu score of all samples

@@ -253,7 +255,7 @@
             review_res_list: [score1, score2, ...]

         Returns:
-            avg_res: List[
+            avg_res: Dict[str, List[float]]

         """
         if isinstance(review_res_list[0], list):
@@ -314,11 +316,20 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)

-    def gen_prompt_data(self,
-
-
+    def gen_prompt_data(self,
+                        prompt: str,
+                        system_prompt: Optional[str] = None,
+                        choices: Optional[List[str]] = None,
+                        index: Optional[Union[int, str]] = None,
+                        id: Optional[Union[int, str]] = None,
+                        **kwargs) -> dict:
+        data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
-            data=
+            data=data,
+            multi_choices=choices or self.choices,
+            system_prompt=system_prompt or self.system_prompt,
+            index=index or 0,
+            id=id)
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
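The reworked gen_prompt_data above takes the prompt plus optional system_prompt, choices, index and id keywords and wraps them in a PromptData record, falling back to the adapter's own system_prompt and choices. A sketch of how a subclass's gen_prompt might call it after this change; the subclass and the input field names ('question', 'options', 'id') are hypothetical, only the keyword arguments come from the diff:

from typing import Any

from evalscope.benchmarks import DataAdapter

class MyQAAdapter(DataAdapter):  # hypothetical subclass for illustration
    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        prompt = self.prompt_template.format(query=input_d['question'])
        return self.gen_prompt_data(
            prompt,
            choices=input_d.get('options'),  # defaults to self.choices when None
            id=input_d.get('id'),
        )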

evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -48,14 +48,16 @@ class DataCollectionAdapter(DataAdapter):
             if len(dataset) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-            from modelscope
+            from modelscope import dataset_snapshot_download

             # Load dataset from remote
             logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')

-
-
-
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
+            # find the jsonl file
+            dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
+            dataset = jsonl_to_list(dataset_files[0])

         return dataset

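The collection adapter now downloads only the .jsonl files of a remote dataset via modelscope's dataset_snapshot_download and loads the first one it finds. The same download pattern in isolation; the dataset id and cache directory are placeholders:

import os

from modelscope import dataset_snapshot_download  # call and keyword arguments as used in the hunk above

dataset_path = dataset_snapshot_download(
    'your-namespace/your-collection',  # placeholder ModelScope dataset id
    cache_dir='./outputs',
    allow_file_pattern='*.jsonl',
)
jsonl_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
print(jsonl_files)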

evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -4,7 +4,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -17,7 +17,7 @@ logger = get_logger()
     name='general_mcq',
     pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
             for subset_name in subset_list:
                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
         elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.basename(dataset_name_or_path)
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
             data_file_dict[cur_subset_name] = dataset_name_or_path
         else:
             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
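The one-line change above strips the file extension when a single file is passed, so the subset is named after the file stem rather than the full filename. For example:

import os

path = '/data/my_eval_set.jsonl'
print(os.path.basename(path))                       # my_eval_set.jsonl (old subset name)
print(os.path.splitext(os.path.basename(path))[0])  # my_eval_set (new subset name)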

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))

evalscope/benchmarks/live_code_bench/evaluate_utils.py
@@ -2,7 +2,6 @@ import json
 import multiprocessing
 import numpy as np
 from collections import defaultdict
-from concurrent.futures import ProcessPoolExecutor, as_completed

 from evalscope.utils.logger import get_logger
 from .pass_k_utils import compute_metrics_from_results
@@ -31,7 +30,10 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         args=(sample, generation, debug, result, metadata_list, timeout),
     )
     p.start()
-
+    global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
+    if debug:
+        logger.info(f'global timeout = {global_timeout}')
+    p.join(timeout=global_timeout)
     if p.is_alive():
         p.kill()
     if not result:
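The join timeout is now derived from the number of test inputs in the sample rather than being left open-ended: each test gets the per-test timeout plus one second of slack. A tiny worked example of the formula (the sample payload here is made up):

import json

timeout = 6  # default per-test timeout, matching the adapter's extra_params below
sample = {'input_output': json.dumps({'inputs': ['1 2', '3 4', '5 6'], 'outputs': ['3', '7', '11']})}
global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
print(global_timeout)  # (6 + 1) * 3 == 21 seconds before the worker process is killed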
@@ -39,7 +41,7 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         # consider that all tests failed
         result = [[-1 for i in range(len(in_outs['inputs']))]]
         if debug:
-            logger.info('global timeout')
+            logger.info('global timeout occured: alarm went off')
     return result[0], metadata_list[0]
@@ -99,7 +101,7 @@ def evaluate_generations(
     samples_list: list,
     generations_list: list[list[str]],
     debug: bool = False,
-    num_process_evaluate: int = 16,
+    num_process_evaluate: int = 16, # This parameter will be unused
     timeout=6,
 ):
     """We take the list of code generations and try to compile them and the run
@@ -117,26 +119,19 @@
     [-2] = compile error, [-1] = runtime error [False] = failed test
     case [True] = passed test case
     """
+    results = {}
+    metadata = {}

-
-
-
-        for index in range(len(generations_list))]
-
-    with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
-        futures = {
-            executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
-            for (problem_generations, sample, debug, timeout), index in inputs
-        }
+    for index in range(len(generations_list)):
+        problem_generations = generations_list[index]
+        sample = samples_list[index]

-
-
-
-            index = futures[future]
-            results[index], metadata[index] = future.result()
+        result, meta = evaluate_generations_by_problem(problem_generations, sample, debug, timeout)
+        results[index] = result
+        metadata[index] = meta

-    assert len(results) == len(
-
+    assert len(results) == len(
+        generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'

     return results, metadata

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -18,8 +18,8 @@ logger = get_logger()
     extra_params={
         'start_date': None,
         'end_date': None,
-        '
-        '
+        'timeout': 6,
+        'debug': False
     },
     system_prompt=
     'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.', # noqa: E501
@@ -33,8 +33,8 @@ class LiveCodeBenchAdapter(DataAdapter):

         extra_params = kwargs.get('extra_params', {})

-        self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
         self.timeout = extra_params.get('timeout', 6)
+        self.debug = extra_params.get('debug', False)
         self.start_date = extra_params.get('start_date')
         self.end_date = extra_params.get('end_date')

@@ -84,7 +84,8 @@ class LiveCodeBenchAdapter(DataAdapter):
             references,
             predictions,
             k_list=[1],
-            num_process_evaluate=
+            num_process_evaluate=1,
             timeout=self.timeout,
+            debug=self.debug,
         )
         return metrics['pass@1'] / 100 # convert to point scale
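Taken together, the LiveCodeBench changes drop the num_process_evaluate knob (evaluation now runs sequentially in evaluate_utils.py) and expose timeout and debug through extra_params; the final score is pass@1 divided by 100, so a pass@1 of 62.5 is reported as 0.625. A hedged sketch of passing these options from a task config; the dataset_args nesting is assumed from evalscope's usual per-dataset configuration pattern and the model id is a placeholder:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Coder-7B-Instruct',  # placeholder model id
    datasets=['live_code_bench'],
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'start_date': None,   # keys match the registration defaults above
                'end_date': None,
                'timeout': 6,
                'debug': False,
            },
        },
    },
)
run_task(task_cfg=task_cfg)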