evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/utils/import_utils.py
ADDED
@@ -0,0 +1,66 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2023-present the HuggingFace Inc. team.
+
+import importlib
+import os
+from itertools import chain
+from types import ModuleType
+from typing import Any
+
+from .logger import get_logger
+
+logger = get_logger()  # pylint: disable=invalid-name
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f'module {self.__name__} has no attribute {name}')
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        return importlib.import_module('.' + module_name, self.__name__)
+
+    def __reduce__(self):
+        return self.__class__, (self._name, self.__file__, self._import_structure)
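The `_LazyModule` above is the standard HuggingFace lazy-import shim: a package's `__init__.py` swaps itself for a proxy object, so importing the package stays cheap and each submodule is loaded only on first attribute access. A minimal sketch of how such a shim is typically wired up — the submodule names and symbols below are illustrative, not evalscope's actual import structure:

```python
# Hypothetical __init__.py built on the _LazyModule above.
# The entries in _import_structure are made up for illustration.
import sys

from evalscope.utils.import_utils import _LazyModule

_import_structure = {
    'llm_judge': ['LLMJudge'],       # `from package import LLMJudge` imports .llm_judge lazily
    'named_metrics': ['Metric'],     # `from package import Metric` imports .named_metrics lazily
}

# Swap this module for the lazy proxy: `import package` no longer pulls in
# the submodules; `package.LLMJudge` triggers the real import on first access.
sys.modules[__name__] = _LazyModule(__name__, __file__, _import_structure, module_spec=__spec__)
```

The `__reduce__` override at the end of the class is what keeps such a proxy picklable, which matters if the module object ever crosses a multiprocessing boundary.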
evalscope/utils/utils.py
CHANGED
@@ -76,21 +76,21 @@ def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
 class ResponseParser:
 
     @staticmethod
-    def parse_first_capital(text: str) -> str:
+    def parse_first_capital(text: str, options: list[str]) -> str:
         for t in text:
-            if t.isupper():
+            if t.isupper() and (t in options):
                 return t
         return ''
 
     @staticmethod
-    def parse_last_capital(text: str) -> str:
+    def parse_last_capital(text: str, options: list[str]) -> str:
         for t in text[::-1]:
-            if t.isupper():
+            if t.isupper() and (t in options):
                 return t
         return ''
 
     @staticmethod
-    def parse_first_option_with_choices(text: str, options: list) -> str:
+    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
@@ -98,7 +98,7 @@ class ResponseParser:
         text: The text to parse.
         options: The options to find. e.g. ['A', 'B', 'C', 'D']
         """
-        options_concat =
+        options_concat = ResponseParser.process_options(options)
 
         patterns = [
             rf'答案是?\s?([{options_concat}])',
@@ -155,48 +155,61 @@ class ResponseParser:
         for i in options:
             if i in outputs:
                 return i
-
+        # If no match found, try to find the last capital letter in the text
+        last_capital = ResponseParser.parse_last_capital(text, options)
+        if last_capital:
+            return last_capital
+        return 'No valid option found'
 
     @staticmethod
-    def parse_first_option(text: str) -> str:
+    def parse_first_option(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
         Args:
             text: The text to parse.
         """
+        options_pattern = ResponseParser.process_options(options)
+
         patterns = [
-
-
-
-
-
-
-
+            rf'[Aa]nswer:\s*({options_pattern})',
+            rf'ANSWER:\s*({options_pattern})',
+            rf'answer is \(?({options_pattern})\)?',
+            rf'[Tt]he correct answer is:\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer is \s*({options_pattern})',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
         for regex in regexes:
-
-            if
-            return
-
+            matches = regex.search(text)
+            if matches:
+                return matches.group(1)
+        # If no match found, try to find the last capital letter in the text
+        last_capital = ResponseParser.parse_last_capital(text, options)
+        if last_capital:
+            return last_capital
+        return 'No valid option found'
 
     @staticmethod
-    def
-
+    def parse_bracketed_answer(text: str, options: list[str]) -> str:
+        options = ResponseParser.process_options(options)
+        # Match the first occurrence of the options in angle brackets
+        match = re.search(rf'<({options})>', text)
         if match:
             return match.group(1)
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def
-
-
-
-
-
-
+    def process_options(options: list[str]) -> str:
+        # Escape each option to ensure special characters in options are treated literally
+        escaped_options = [re.escape(option) for option in options]
+        # Join options into a regex pattern separated by '|', to match any of the options
+        options_pattern = '|'.join(escaped_options)
+        return options_pattern
 
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
@@ -299,3 +312,9 @@ def seed_everything(seed: int):
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
+
+if __name__ == '__main__':
+    options = ['A', 'B', 'C', 'D']
+    answers = ['Context .... ANSWER: A', 'answer: A']
+    for answer in answers:
+        print(ResponseParser.parse_first_option(answer, options))
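The net effect of these changes is that every parser is now constrained to the caller-supplied option set, and the new `process_options` regex-escapes each option before joining them into an alternation. To make that concrete, here is a small standalone sketch that mirrors `process_options` (redefined locally rather than imported, so the snippet runs on its own) and exercises one of the new `parse_first_option` patterns:

```python
import re

def process_options(options: list[str]) -> str:
    # Mirrors ResponseParser.process_options: escape each option, then join
    # with '|' so the alternation matches any one of them literally.
    return '|'.join(re.escape(option) for option in options)

print(process_options(['A', 'B', 'C', 'D']))  # -> A|B|C|D
# Metacharacters are escaped instead of corrupting the pattern:
print(process_options(['A)', 'B(']))          # -> A\)|B\(

# Used the way parse_first_option uses it:
pattern = re.compile(rf"answer is \(?({process_options(['A', 'B', 'C', 'D'])})\)?")
print(pattern.search('The answer is (B)').group(1))  # -> B
```

Escaping matters because callers can pass arbitrary option strings; with the plain string concatenation of the old code, an option containing `(` or `|` would have produced an invalid or over-matching regex.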
{evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.2
+Version: 0.15.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -26,8 +26,10 @@ Requires-Dist: latex2sympy2
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
+Requires-Dist: omegaconf
 Requires-Dist: openai
 Requires-Dist: pandas
+Requires-Dist: pillow
 Requires-Dist: pyarrow
 Requires-Dist: pyyaml
 Requires-Dist: requests
@@ -39,9 +41,15 @@ Requires-Dist: seaborn
 Requires-Dist: sympy
 Requires-Dist: tabulate
 Requires-Dist: torch
+Requires-Dist: torchvision
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
 Requires-Dist: word2number
+Provides-Extra: aigc
+Requires-Dist: diffusers; extra == "aigc"
+Requires-Dist: iopath; extra == "aigc"
+Requires-Dist: open-clip-torch; extra == "aigc"
+Requires-Dist: opencv-python; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
@@ -53,8 +61,10 @@ Requires-Dist: latex2sympy2; extra == "all"
 Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
+Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
+Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
 Requires-Dist: requests; extra == "all"
@@ -66,17 +76,18 @@ Requires-Dist: seaborn; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
 Requires-Dist: torch; extra == "all"
+Requires-Dist: torchvision; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
-Requires-Dist: langchain<0.3.0; extra == "all"
-Requires-Dist: langchain-community<0.3.0; extra == "all"
-Requires-Dist: langchain-core<0.3.0; extra == "all"
-Requires-Dist: langchain-openai<0.3.0; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -86,6 +97,10 @@ Requires-Dist: transformers; extra == "all"
 Requires-Dist: unicorn; extra == "all"
 Requires-Dist: gradio==5.4.0; extra == "all"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
+Requires-Dist: diffusers; extra == "all"
+Requires-Dist: iopath; extra == "all"
+Requires-Dist: open-clip-torch; extra == "all"
+Requires-Dist: opencv-python; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -99,12 +114,12 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
-Requires-Dist: langchain<0.3.0; extra == "rag"
-Requires-Dist: langchain-community<0.3.0; extra == "rag"
-Requires-Dist: langchain-core<0.3.0; extra == "rag"
-Requires-Dist: langchain-openai<0.3.0; extra == "rag"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -121,7 +136,7 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 </p>
 
 <p align="center">
-<img src="https://img.shields.io/badge/python-%E2%89%A53.
+<img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -199,6 +214,10 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
+- 🔥 **[2025.04.27]** Support for text-to-image evaluation: supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse and GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
+- 🔥 **[2025.04.10]** The model service stress-testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -212,15 +231,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets, refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -503,6 +521,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 [image: multi-model performance comparison]
 
+**Supports swanlab for recording results**
+
+[image: swanlab results dashboard]
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
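A practical note on the dependency changes above: the text-to-image stack (diffusers, open-clip-torch, opencv-python, iopath) is packaged behind the new `aigc` extra rather than the base install, so with standard pip extras syntax it would be pulled in via `pip install 'evalscope[aigc]'`; the `all` extra now includes these packages as well, alongside the bumped langchain (>=0.3.0,<0.4.0) and ragas (0.2.14) pins.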