evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/run.py
CHANGED
@@ -13,9 +13,6 @@ from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.model_utils import seed_everything
 
-if TYPE_CHECKING:
-    from evalscope.models import LocalModel
-
 logger = get_logger()
 
 
@@ -109,27 +106,43 @@ def get_backend_manager_class(eval_backend: EvalBackend):
     raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
 
 
-def evaluate_model(
+def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
-    from evalscope.
+    from evalscope.api.evaluator import Evaluator
+    from evalscope.api.model import get_model_with_task_config
+    from evalscope.api.registry import get_benchmark
+    from evalscope.evaluator import DefaultEvaluator
     from evalscope.report import gen_table
 
     # Initialize evaluator
     eval_results = {}
-
-
-    for
-
+    # Initialize model
+    model = get_model_with_task_config(task_config=task_config)
+    # Initialize evaluators for each dataset
+    evaluators: List[Evaluator] = []
+    for dataset_name in task_config.datasets:
+        # Create evaluator for each dataset
+        benchmark = get_benchmark(dataset_name, task_config)
+        evaluator = DefaultEvaluator(
+            task_config=task_config,
+            model=model,
+            benchmark=benchmark,
+            outputs=outputs,
+        )
         evaluators.append(evaluator)
 
+        # Update task_config.dataset_args with benchmark metadata, except for DataCollection
+        if dataset_name != DataCollection.NAME:
+            task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
     # dump task_cfg to outputs.configs_dir after creating evaluators
-
-    logger.info(
+    task_config.dump_yaml(outputs.configs_dir)
+    logger.info(task_config)
 
     # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
-        eval_results[evaluator.
+        eval_results[evaluator.benchmark.name] = res_dict
 
     # Make overall report
     try:
@@ -137,48 +150,21 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.info(f'Overall report table: \n{report_table} \n')
     except Exception:
         logger.error('Failed to generate report table.')
-
     # Clean up
-    if
+    if model is not None:
         import gc
-        import torch
 
-        del
+        del model
         del evaluators
-        torch.cuda.empty_cache()
         gc.collect()
 
-
-
+        from evalscope.utils.import_utils import check_import
+        if check_import('torch'):
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
-
-    """Create an evaluator object for the specified dataset."""
-    from evalscope.benchmarks import Benchmark, BenchmarkMeta
-    from evalscope.evaluator import Evaluator
-    from evalscope.models import initialize_model_adapter
-
-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    if dataset_name == DataCollection.NAME:
-        # EvaluatorCollection is a collection of evaluators
-        from evalscope.collections import EvaluatorCollection
-        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
-
-    # Initialize data adapter first to update config
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-    # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
-
-    # update task_cfg.dataset_args
-    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-    return Evaluator(
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        outputs=outputs,
-        task_cfg=task_cfg,
-    )
+    return eval_results
 
 
 def main():
evalscope/summarizer.py
CHANGED
@@ -80,7 +80,7 @@ class Summarizer:
 
             summary_file_path = summary_files[0]
             # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}  # noqa: E501
-            summary_res: List[dict] = csv_to_list(
+            summary_res: List[dict] = csv_to_list(summary_file_path)
             final_res_list.extend(summary_res)
         elif eval_backend == EvalBackend.VLM_EVAL_KIT:
             eval_config = Summarizer.parse_eval_config(candidate_task)
evalscope/utils/__init__.py
CHANGED
@@ -7,9 +7,22 @@ from .import_utils import _LazyModule
 if TYPE_CHECKING:
     from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
     from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
     from .import_utils import get_module_path, is_module_installed
-    from .io_utils import (
-
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
     from .logger import configure_logging, get_logger
     from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
 
@@ -31,6 +44,10 @@ else:
             'is_module_installed',
             'get_module_path',
         ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
         'io_utils': [
            'OutputsStructure',
            'csv_to_list',
@@ -44,6 +61,8 @@ else:
            'jsonl_to_list',
            'gen_hash',
            'get_valid_list',
+           'safe_filename',
+           'thread_safe',
        ],
        'deprecation_utils': [
            'deprecated',
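Based on the updated lazy-import table above, the new helpers should resolve directly from the `evalscope.utils` namespace. A minimal sketch (the decorated function is made up, and the `safe_filename` output assumes the rules shown later in this diff):

# Sketch: names are taken from the lazy-import mapping in the diff above.
from evalscope.utils import safe_filename, thread_safe

@thread_safe
def bump(counter):
    counter['n'] += 1                          # guarded by a re-entrant lock across threads

print(safe_filename('résumé draft.txt'))       # -> 'resume_draft.txt' after ASCII folding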
evalscope/utils/chat_service.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import time
-import torch
 from contextlib import contextmanager
 from functools import partial
 from pydantic import BaseModel, Field
@@ -95,6 +94,7 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        import torch
         from modelscope import AutoModelForCausalLM, AutoTokenizer
         from transformers import TextIteratorStreamer
 
@@ -204,7 +204,8 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
evalscope/utils/deprecation_utils.py
CHANGED
@@ -1,5 +1,6 @@
 import functools
 import inspect
+import os
 from typing import Callable, Optional
 
 from .logger import get_logger
@@ -22,7 +23,7 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             # Get the file name where the function is defined
-            file_name = inspect.getfile(func)
+            file_name = os.path.basename(inspect.getfile(func))
 
             # Construct the warning message
             warning_parts = [
@@ -40,3 +41,13 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         return wrapper
 
     return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
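A brief usage sketch for the two deprecation helpers; the decorator arguments and the decorated function below are illustrative only, not taken from the package:

# Sketch: argument values and old_helper are made up for illustration.
from evalscope.utils.deprecation_utils import deprecated, deprecated_warning
from evalscope.utils.logger import get_logger

logger = get_logger()

@deprecated(since='1.0.0', remove_in='1.2.0', alternative='new_helper')
def old_helper():
    return 42

old_helper()                                            # emits a deprecation warning when called
deprecated_warning(logger, 'use new_helper() instead')  # logs "Deprecated: use new_helper() instead"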
evalscope/utils/function_utils.py
ADDED
@@ -0,0 +1,29 @@
+import threading
+from functools import wraps
+
+
+def run_once(func):
+    """Decorator to ensure a function is only run once."""
+    has_run = False
+    result = None
+
+    def wrapper(*args, **kwargs):
+        nonlocal has_run, result
+        if not has_run:
+            result = func(*args, **kwargs)
+            has_run = True
+        return result
+
+    return wrapper
+
+
+def thread_safe(func):
+    """Thread-safe decorator for functions that need to be executed in a thread-safe manner."""
+    lock = threading.RLock()
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        with lock:
+            return func(*args, **kwargs)
+
+    return wrapper
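A quick sketch of how the two new decorators behave; the decorated functions below are throwaway examples:

# Sketch: init_cache and add_item are made-up functions for demonstration.
from evalscope.utils.function_utils import run_once, thread_safe

@run_once
def init_cache():
    print('building cache')    # printed only on the first call
    return {}

@thread_safe
def add_item(store, item):
    store.append(item)          # serialized by a re-entrant lock

cache = init_cache()   # prints 'building cache'
cache = init_cache()   # skipped; returns the cached result from the first call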
evalscope/utils/import_utils.py
CHANGED
@@ -5,13 +5,35 @@ import importlib
 import os
 from itertools import chain
 from types import ModuleType
-from typing import Any
+from typing import Any, Optional, Union
 
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
+def check_import(module_name: str, package: Optional[str] = None, raise_error: bool = False) -> bool:
+    """Check if a module can be imported.
+
+    Args:
+        module_name (str): The name of the module to check.
+        package (str, optional): The package to install if the module is not found. Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if the module is not found. Defaults to False.
+    """
+    try:
+        importlib.import_module(module_name)
+        return True
+    except ImportError:
+        error_msg = f'`{module_name}` not found.'
+        if package:
+            error_msg += f' Please run `pip install {package}` to use this feature.'
+        logger.warning(error_msg)
+
+        if raise_error:
+            raise ImportError(error_msg)
+        return False
+
+
 class _LazyModule(ModuleType):
     """
     Module class that surfaces all objects but only performs associated imports when the objects are requested.
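The new `check_import` helper gates optional dependencies; a minimal sketch (the module and package names here are just examples):

# Sketch: 'torch' and 'plotly' are only examples of optional dependencies.
from evalscope.utils.import_utils import check_import

if check_import('torch'):
    import torch                      # imported only when actually installed

# With raise_error=False the helper just logs a warning that includes a
# `pip install` hint when the module is missing, and returns False.
have_plotly = check_import('plotly', package='plotly', raise_error=False)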
evalscope/utils/io_utils.py
CHANGED
@@ -1,10 +1,13 @@
 import base64
 import csv
 import hashlib
+import io
 import json
 import jsonlines as jsonl
 import os
 import re
+import string
+import unicodedata
 import yaml
 from io import BytesIO
 from PIL import Image
@@ -33,7 +36,7 @@ class OutputsStructure:
         'configs_dir': None
     }
 
-    def _get_dir(self, attr_name, dir_name):
+    def _get_dir(self, attr_name, dir_name) -> str:
         if self._dirs[attr_name] is None:
             dir_path = os.path.join(self.outputs_dir, dir_name)
             if self.is_make:
@@ -72,10 +75,20 @@ def jsonl_to_list(jsonl_file):
     Returns:
         list: list of lines. Each line is a dict.
     """
-
-
-
-
+    try:
+        res_list = []
+        with jsonl.open(jsonl_file, mode='r') as reader:
+            for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+                res_list.append(line)
+    except Exception:
+        # Fallback to reading line by line
+        res_list = []
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():  # Skip empty lines
+                    res_list.append(json.loads(line.strip()))
+    if not res_list:
+        logger.warning(f'No data found in {jsonl_file}.')
     return res_list
 
 
@@ -271,8 +284,131 @@ def get_valid_list(input_list, candidate_list):
     [i for i in input_list if i not in candidate_list]
 
 
-def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
+    """
+    Convert a PIL Image to a base64 encoded string.
+
+    Args:
+        image (Image.Image): The PIL Image to convert.
+        format (str): The format to save the image in. Default is 'JPEG'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
+    Returns:
+        str: Base64 encoded string of the image.
+    """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format.lower()};base64,{img_str}'
+    return img_str
+
+
+def bytes_to_base64(bytes_data: bytes, format: str = 'png', add_header: bool = False) -> str:
+    """Convert image bytes to a base64 encoded string.
+
+    Args:
+        bytes_data (bytes): The bytes to convert.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
+    Returns:
+        str: Base64 encoded string of the bytes.
+    """
+    img_str = base64.b64encode(bytes_data).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format};base64,{img_str}'
     return img_str
+
+
+def base64_to_PIL(base64_str):
+    """Convert a base64 encoded string to a PIL Image.
+
+    Args:
+        base64_str (str): The base64 encoded string.
+
+    Returns:
+        Image.Image: The decoded PIL Image.
+    """
+    # remove header
+    if ',' in base64_str:
+        base64_str = base64_str.split(',', 1)[1]
+
+    # decode
+    img_data = base64.b64decode(base64_str)
+    img_file = io.BytesIO(img_data)
+    img = Image.open(img_file)
+    return img
+
+
+def safe_filename(s: str, max_length: int = 255) -> str:
+    """
+    Convert a string into a safe filename by removing or replacing unsafe characters.
+
+    Args:
+        s (str): The input string to convert
+        max_length (int): Maximum length of the resulting filename (default 255)
+
+    Returns:
+        str: A safe filename string
+
+    Examples:
+        >>> safe_filename("Hello/World?.txt")
+        'Hello_World.txt'
+    """
+    # normalize unicode characters
+    s = unicodedata.normalize('NFKD', s)
+    s = s.encode('ASCII', 'ignore').decode('ASCII')
+
+    # remove or replace unsafe characters
+    # Keep only alphanumeric characters, dots, dashes, and underscores
+    safe_chars = string.ascii_letters + string.digits + '.-_'
+    s = ''.join(c if c in safe_chars else '_' for c in s)
+
+    # remove consecutive underscores
+    s = re.sub(r'_+', '_', s)
+
+    # remove leading/trailing periods and underscores
+    s = s.strip('._')
+
+    # handle empty string case
+    if not s:
+        s = 'untitled'
+
+    # handle starting with a period (hidden files)
+    if s.startswith('.'):
+        s = '_' + s
+
+    # enforce length limit
+    if len(s) > max_length:
+        # If we need to truncate, preserve the file extension if present
+        name, ext = os.path.splitext(s)
+        ext_len = len(ext)
+        if ext_len > 0:
+            max_name_length = max_length - ext_len
+            s = name[:max_name_length] + ext
+        else:
+            s = s[:max_length]
+
+    return s
+
+
+def convert_numpy_types(obj):
+    """Recursively convert numpy types to native Python types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, np.bool_):
+        return bool(obj)
+    elif isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_numpy_types(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_numpy_types(item) for item in obj)
+    else:
+        return obj