evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic by the registry; see the registry listing for details.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/cli/start_app.py
CHANGED
@@ -28,6 +28,12 @@ class StartAppCMD(CLICommand):
         parser.set_defaults(func=subparser_func)

     def execute(self):
-        from evalscope.app import create_app
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )

         create_app(self.args)
evalscope/cli/start_perf.py
CHANGED
@@ -28,6 +28,12 @@ class PerfBenchCMD(CLICommand):
         parser.set_defaults(func=subparser_func)

     def execute(self):
-        from evalscope.perf.main import run_perf_benchmark
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )

         run_perf_benchmark(self.args)
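Both CLI commands above now defer their feature imports into execute() so that a missing optional extra fails with an actionable message instead of breaking the top-level `evalscope` import. A minimal sketch of the same guard pattern (the module name and the extra below are hypothetical placeholders, not evalscope APIs):

# Sketch of the lazy optional-dependency guard used by the two CLI commands above.
# `heavy_feature` and `mypackage[extra]` are placeholders for illustration only.
def execute(args):
    try:
        from heavy_feature import run  # imported only when the command actually runs
    except ImportError as e:
        raise ImportError(
            f'Failed to import run from heavy_feature, due to {e}. '
            "Please run `pip install 'mypackage[extra]'`."
        )
    run(args)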
evalscope/cli/start_server.py
CHANGED
@@ -25,14 +25,16 @@ def add_perf_args(parser):
         '--logdir',
         required=True,
         type=str,
-        help='The monitor log save dir, tensorboard start at this path for display!')
+        help='The monitor log save dir, tensorboard start at this path for display!'
+    )
     parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
     parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')


 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8'
+    )
     return sub_process


@@ -61,7 +63,8 @@ def start_server(args):
         bufsize=1,
         shell=True,
         universal_newlines=True,
-        encoding='utf8')
+        encoding='utf8'
+    )

     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
evalscope/collections/__init__.py
CHANGED
@@ -4,20 +4,12 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule

 if TYPE_CHECKING:
-    from .evaluator import EvaluatorCollection
-    from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+    from .sampler import DatasetEntry, StratifiedSampler, UniformSampler, WeightedSampler
     from .schema import CollectionSchema, DatasetInfo

 else:
     _import_structure = {
-        'evaluator': [
-            'EvaluatorCollection',
-        ],
-        'sampler': [
-            'StratifiedSampler',
-            'UniformSampler',
-            'WeightedSampler',
-        ],
+        'sampler': ['StratifiedSampler', 'UniformSampler', 'WeightedSampler', 'DatasetEntry'],
         'schema': [
             'CollectionSchema',
             'DatasetInfo',
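With EvaluatorCollection dropped from the lazy import table, the public surface of evalscope.collections is the sampler classes (now including DatasetEntry) plus the schema types, for example:

# Names resolved lazily through the _import_structure mapping shown above.
from evalscope.collections import CollectionSchema, DatasetEntry, DatasetInfo, WeightedSampler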
evalscope/collections/sampler.py
CHANGED
@@ -1,18 +1,17 @@
 import random
 from abc import ABC, abstractmethod
-from dataclasses import asdict, dataclass, field
+from pydantic import BaseModel, Field
 from tqdm import tqdm
 from typing import List, Optional

 from evalscope.collections.schema import CollectionSchema, DatasetInfo


-@dataclass
-class DatasetEntry:
+class DatasetEntry(BaseModel):
     index: int = 0
-    prompt: dict = field(default_factory=dict)
-    tags: List[str] = field(default_factory=list)
-    categories: List[str] = field(default_factory=list)
+    prompt: dict = Field(default_factory=dict)
+    tags: List[str] = Field(default_factory=list)
+    categories: List[str] = Field(default_factory=list)
     task_type: str = ''
     weight: float = 0.0
     dataset_name: str = ''
@@ -33,17 +32,18 @@ class Sampler(ABC):
         all_data = []
         data_dict = dataset.get_data()
         for subset_name, subset_data in data_dict.items():
-            for prompt in subset_data:
+            for sample in subset_data:
                 all_data.append(
                     DatasetEntry(
-                        prompt=prompt,
+                        prompt=sample.model_dump(exclude_none=True),
                         tags=dataset.tags,
                         categories=dataset.hierarchy,
                         task_type=dataset.task_type,
                         weight=dataset.weight,
                         dataset_name=dataset.name,
                         subset_name=subset_name,
-                    ))
+                    )
+                )
         count = min(count, len(all_data))  # avoid sampling more than the dataset size
         sampled_data = random.sample(all_data, k=count)
         return sampled_data
@@ -52,7 +52,7 @@ class Sampler(ABC):
         result = []
         for i, entry in enumerate(all_data):
             entry.index = i
-            result.append(asdict(entry))
+            result.append(entry.model_dump())
         return result

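DatasetEntry changes from a dataclass to a pydantic BaseModel, so mutable defaults come from Field(default_factory=...) and serialization goes through model_dump() rather than dataclasses.asdict(). A self-contained sketch of that pattern (field set trimmed for brevity, not the full evalscope class):

from typing import List

from pydantic import BaseModel, Field


class Entry(BaseModel):
    index: int = 0
    prompt: dict = Field(default_factory=dict)    # mutable defaults need a factory
    tags: List[str] = Field(default_factory=list)
    weight: float = 0.0


entry = Entry(prompt={'question': '1 + 1 = ?'}, tags=['math'])
print(entry.model_dump())  # plain dict, the pydantic counterpart of dataclasses.asdict(entry)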
evalscope/collections/schema.py
CHANGED
@@ -3,6 +3,10 @@ import json
 from dataclasses import asdict, dataclass, field
 from typing import List, Union

+from evalscope.api.dataset import DatasetDict
+from evalscope.api.registry import get_benchmark
+from evalscope.config import TaskConfig
+

 @dataclass
 class DatasetInfo:
@@ -13,15 +17,11 @@ class DatasetInfo:
     args: dict = field(default_factory=dict)
     hierarchy: List[str] = field(default_factory=list)

-    def get_data(self) ->
-
-
-
-
-        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load()
-        prompts = data_adapter.gen_prompts(data_dict)
-        return prompts
+    def get_data(self) -> DatasetDict:
+        dataset_args = {self.name: self.args}
+        benchmark_meta = get_benchmark(self.name, config=TaskConfig(dataset_args=dataset_args))
+        data_dict = benchmark_meta.load_dataset()
+        return data_dict


 def flatten_weight(collection: 'CollectionSchema', base_weight=1):
@@ -111,8 +111,10 @@ if __name__ == '__main__':
             ]),
             CollectionSchema(
                 name='chinese',
-                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})]
-
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})]
+            )
+        ]
+    )
     print(schema)
     print(schema.flatten())
     schema.dump_json('outputs/schema.json')
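The __main__ block in this file already demonstrates the intended usage; for reference, a hedged sketch of defining and sampling a collection using only the names visible in these diffs (the sample() call is an assumption about the Sampler subclasses and may differ):

from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler

schema = CollectionSchema(
    name='chinese',
    datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})],
)
print(schema.flatten())                  # mirrors the file's own __main__ example
schema.dump_json('outputs/schema.json')

# Sampling goes through the Sampler subclasses from sampler.py; the exact
# signature is assumed here:
# mixed_data = WeightedSampler(schema).sample(100)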
evalscope/config.py
CHANGED
@@ -1,16 +1,24 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+# flake8: noqa: E501
 import copy
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union

-from evalscope.
-
-
+from evalscope.api.model import GenerateConfig, Model, ModelAPI
+from evalscope.constants import (
+    DEFAULT_DATASET_CACHE_DIR,
+    DEFAULT_WORK_DIR,
+    EvalBackend,
+    EvalType,
+    HubType,
+    JudgeStrategy,
+    ModelTask,
+)
 from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
-from evalscope.utils.
+from evalscope.utils.deprecation_utils import deprecated_warning
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -19,104 +27,191 @@ logger = get_logger()
 @dataclass
 class TaskConfig(BaseArgument):
     # Model-related arguments
-    model: Union[str,
+    model: Optional[Union[str, Model, ModelAPI]] = None
+    """The model to be evaluated. Can be a string path, Model object, or ModelAPI object."""
+
     model_id: Optional[str] = None
+    """Unique identifier for the model. Auto-generated from model name if not provided."""
+
     model_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to the model during initialization."""
+
     model_task: str = ModelTask.TEXT_GENERATION
+    """The type of task the model performs (e.g., text generation, image generation)."""

     # Template-related arguments
-    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
     chat_template: Optional[str] = None
+    """Chat template to use for formatting conversations with the model."""

     # Dataset-related arguments
     datasets: List[str] = field(default_factory=list)
+    """List of dataset names to evaluate the model on."""
+
     dataset_args: Dict = field(default_factory=dict)
+    """Additional arguments to pass to datasets during loading."""
+
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    """Directory where datasets are cached locally."""
+
     dataset_hub: str = HubType.MODELSCOPE
+    """Hub platform to download datasets from (e.g., ModelScope, HuggingFace)."""
+
+    repeats: int = 1
+    """Number of times to repeat the dataset items for k-metrics evaluation."""

     # Generation configuration arguments
-    generation_config: Dict = field(default_factory=dict)
+    generation_config: Union[Dict, GenerateConfig] = field(default_factory=dict)
+    """Configuration parameters for text/image generation."""

     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
+    """Type of evaluation: checkpoint, service, or mock."""
+
     eval_backend: str = EvalBackend.NATIVE
+    """Backend framework to use for evaluation."""
+
     eval_config: Union[str, Dict, None] = None
-
+    """Additional evaluation configuration parameters."""
+
     limit: Optional[Union[int, float]] = None
-
+    """Maximum number of samples to evaluate. Can be int (count) or float (fraction)."""
+
+    eval_batch_size: int = 1
+    """Batch size for evaluation processing."""

     # Cache and working directory arguments
-    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
     use_cache: Optional[str] = None
+    """Whether to use cached results and which cache strategy to apply."""
+
+    rerun_review: bool = False
+    """Whether to rerun the review process even if results exist."""
+
     work_dir: str = DEFAULT_WORK_DIR
-
+    """Working directory for storing evaluation results and temporary files."""

     # Debug and runtime mode arguments
     ignore_errors: bool = False
+    """Whether to continue evaluation when encountering errors."""
+
     debug: bool = False
-
+    """Enable debug mode for detailed logging and error reporting."""
+
     seed: Optional[int] = 42
-
-
-
-
+    """Random seed for reproducible results."""
+
+    api_url: Optional[str] = None
+    """API endpoint URL for server-based model evaluation."""
+
+    api_key: Optional[str] = 'EMPTY'
+    """API key for authenticating with server-based models."""
+
+    timeout: Optional[float] = None
+    """Request timeout in seconds for server-based models."""
+
+    stream: Optional[bool] = None
+    """Whether to use streaming responses for server-based models."""

     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
+    """Strategy for LLM-based judgment (auto, single, pairwise)."""
+
     judge_worker_num: int = 1
+    """Number of worker processes for parallel LLM judging."""
+
     judge_model_args: Optional[Dict] = field(default_factory=dict)
+    """Additional arguments for the judge model configuration."""
+
     analysis_report: bool = False
+    """Whether to generate detailed analysis reports after evaluation."""

     def __post_init__(self):
+        self.__init_model_and_id()
+
+        self.__init_eval_data_config()
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_model_and_id(self):
+        # Set model to DummyCustomModel if not provided
         if self.model is None:
-            self.model =
-            self.eval_type = EvalType.
+            self.model = self.model_task
+            self.eval_type = EvalType.MOCK_LLM

-
-
-
+        # Set model_id if not provided
+        if not self.model_id:
+            if isinstance(self.model, str):
+                self.model_id = safe_filename(os.path.basename(self.model))
+            elif isinstance(self.model, Model):
+                self.model_id = safe_filename(self.model.name)
+            elif isinstance(self.model, ModelAPI):
+                self.model_id = safe_filename(self.model.model_name)
             else:
-                self.model_id =
-            # fix path error, see http://github.com/modelscope/evalscope/issues/377
-            self.model_id = self.model_id.replace(':', '-')
-
-        # Set default eval_batch_size based on eval_type
-        if self.eval_batch_size is None:
-            self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
+                self.model_id = 'dummy_model'

+    def __init_eval_data_config(self):
         # Post process limit
         if self.limit is not None:
             self.limit = parse_int_or_float(self.limit)

-        # Set default generation_config and model_args
-        self.__init_default_generation_config()
-        self.__init_default_model_args()
-
     def __init_default_generation_config(self):
-        if self.generation_config:
-
-        if self.model_task == ModelTask.IMAGE_GENERATION:
-            self.generation_config = {
-                'height': 1024,
-                'width': 1024,
-                'num_inference_steps': 50,
-                'guidance_scale': 9.0,
-            }
-        elif self.model_task == ModelTask.TEXT_GENERATION:
-            if self.eval_type == EvalType.CHECKPOINT:
+        if not self.generation_config:
+            if self.model_task == ModelTask.IMAGE_GENERATION:
                 self.generation_config = {
-                    '
-                    '
-                    '
-                    '
-                    'top_p': 1.0,
-                    'temperature': 1.0,
-                }
-            elif self.eval_type == EvalType.SERVICE:
-                self.generation_config = {
-                    'max_tokens': 2048,
-                    'temperature': 0.0,
+                    'height': 1024,
+                    'width': 1024,
+                    'num_inference_steps': 50,
+                    'guidance_scale': 9.0,
                 }
+                if self.eval_batch_size != 1:
+                    logger.warning(
+                        'For image generation task, we only support eval_batch_size=1 for now, changed to 1.'
+                    )
+                    self.eval_batch_size = 1
+            elif self.model_task == ModelTask.TEXT_GENERATION:
+                if self.eval_type == EvalType.CHECKPOINT:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'do_sample': False,
+                        'top_k': 50,
+                        'top_p': 1.0,
+                        'temperature': 1.0,
+                        'n': 1,
+                    }
+                elif self.eval_type == EvalType.SERVICE:
+                    self.generation_config = {
+                        'max_tokens': 2048,
+                        'temperature': 0.0,
+                    }
+        if isinstance(self.generation_config, dict):
+            self.generation_config = GenerateConfig.model_validate(self.generation_config)
+
+        # Set eval_batch_size to generation_config.batch_size
+        self.generation_config.batch_size = self.eval_batch_size
+
+        # Set default values for generation_config
+        if self.timeout is not None:
+            deprecated_warning(
+                logger,
+                'The `timeout` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.timeout` instead.'
+            )
+            self.generation_config.timeout = self.timeout
+
+        if self.stream is not None:
+            deprecated_warning(
+                logger,
+                'The `stream` parameter is deprecated and will be removed in v1.1.0. Use `generation_config.stream` instead.'
+            )
+            self.generation_config.stream = self.stream
+
+        if self.generation_config.n is not None and self.generation_config.n > 1:
+            self.repeats = self.generation_config.n
+            self.generation_config.n = 1
+            deprecated_warning(
+                logger,
+                'The `n` parameter in generation_config is deprecated and will be removed in v1.1.0. Use `TaskConfig.repeats` instead.'
+            )

     def __init_default_model_args(self):
         if self.model_args:
@@ -143,9 +238,14 @@ class TaskConfig(BaseArgument):
             logger.warning(f'Failed to dump overall task config: {e}')

     def to_dict(self):
-        result = self.__dict__
-
+        result = copy.deepcopy(self.__dict__)
+        del result['api_key']  # Do not expose api_key in the config
+
+        if isinstance(self.model, (Model, ModelAPI)):
             result['model'] = self.model.__class__.__name__
+
+        if isinstance(self.generation_config, GenerateConfig):
+            result['generation_config'] = self.generation_config.model_dump(exclude_unset=True)
         return result

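Taken together, the new field docstrings above describe the 1.0 TaskConfig surface. A minimal service-style configuration built only from fields visible in this diff might look like the sketch below (the model name and endpoint are placeholders, and run_task is assumed to remain the top-level entry point):

from evalscope import TaskConfig, run_task   # run_task assumed unchanged in 1.0
from evalscope.constants import EvalType

task = TaskConfig(
    model='Qwen2.5-7B-Instruct',                # placeholder model name
    eval_type=EvalType.SERVICE,                 # now the literal 'openai_api' (see constants.py below)
    api_url='http://127.0.0.1:8000/v1',         # placeholder endpoint
    api_key='EMPTY',
    datasets=['gsm8k'],
    generation_config={'max_tokens': 2048, 'temperature': 0.0},  # coerced to GenerateConfig in __post_init__
    eval_batch_size=8,
    limit=50,
)
run_task(task)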
evalscope/constants.py
CHANGED
@@ -9,9 +9,12 @@ from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_

 DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
-DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
-DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub/models
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/hub/datasets
 DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
+DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
+    os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
+)  # ~/.cache/evalscope


 class HubType:
@@ -44,22 +47,12 @@ class MetricsConstant:
 class ArenaWinner:

     MODEL_A = 'model_a'
-
     MODEL_B = 'model_b'
-
     TIE = 'tie'
-
     TIE_BOTH_BAD = 'tie_both_bad'
-
     UNKNOWN = 'unknown'


-class ArenaMode:
-    SINGLE = 'single'
-    PAIRWISE = 'pairwise'
-    PAIRWISE_BASELINE = 'pairwise_baseline'
-
-
 class AnswerKeys:
     INDEX = 'index'
     ANSWER_ID = 'answer_id'
@@ -70,58 +63,14 @@
     CHOICES = 'choices'


-class ReviewKeys:
-    REVIEW_ID = 'review_id'
-    REVIEWED = 'reviewed'
-    REVIEWER_SPEC = 'reviewer_spec'
-    REVIEW_TIME = 'review_time'
-    MESSAGE = 'message'
-    CONTENT = 'content'
-    GOLD = 'gold'
-    PRED = 'pred'
-    RESULT = 'result'
-    REVIEW = 'review'
-
-
-class EvalConfigKeys:
-    CLASS_REF = 'ref'
-    CLASS_ARGS = 'args'
-    ENABLE = 'enable'
-    POSITION_BIAS_MITIGATION = 'position_bias_mitigation'
-    RANDOM_SEED = 'random_seed'
-    FN_COMPLETION_PARSER = 'fn_completion_parser'
-    COMPLETION_PARSER_KWARGS = 'completion_parser_kwargs'
-    OUTPUT_FILE = 'output_file'
-    MODEL_ID_OR_PATH = 'model_id_or_path'
-    MODEL_REVISION = 'revision'
-    GENERATION_CONFIG = 'generation_config'
-    PRECISION = 'precision'
-    TEMPLATE_TYPE = 'template_type'
-
-
-class FnCompletionParser:
-    LMSYS_PARSER: str = 'lmsys_parser'
-    RANKING_PARSER: str = 'ranking_parser'
-
-
-class PositionBiasMitigation:
-    NONE = 'none'
-    RANDOMIZE_ORDER = 'randomize_order'
-    SWAP_POSITION = 'swap_position'
-
-
-class EvalStage:
-    # Enums: `all`, `infer`, `review`
-    ALL = 'all'
-    INFER = 'infer'
-    REVIEW = 'review'
-
-
 class EvalType:

     CUSTOM = 'custom'
-
-
+    MOCK_LLM = 'mock_llm'
+    CHECKPOINT = 'llm_ckpt'  # native model checkpoint
+    SERVICE = 'openai_api'  # model service
+    TEXT2IMAGE = 'text2image'  # image generation service
+    IMAGE_EDITING = 'image_editing'  # image editing service


 class OutputType:
@@ -142,6 +91,7 @@ class EvalBackend:

 class DataCollection:
     NAME = 'data_collection'
+    INFO = 'collection_info'


 class JudgeStrategy:
@@ -159,3 +109,29 @@ class JudgeScoreType:
 class ModelTask:
     TEXT_GENERATION = 'text_generation'
     IMAGE_GENERATION = 'image_generation'
+
+
+class Tags:
+    KNOWLEDGE = 'Knowledge'
+    MULTIPLE_CHOICE = 'MCQ'
+    MATH = 'Math'
+    REASONING = 'Reasoning'
+    CODING = 'Coding'
+    CHINESE = 'Chinese'
+    COMMONSENSE = 'Commonsense'
+    QA = 'QA'
+    READING_COMPREHENSION = 'ReadingComprehension'
+    CUSTOM = 'Custom'
+    INSTRUCTION_FOLLOWING = 'InstructionFollowing'
+    ARENA = 'Arena'
+    LONG_CONTEXT = 'LongContext'
+    RETRIEVAL = 'Retrieval'
+    FUNCTION_CALLING = 'FunctionCalling'
+    TEXT_TO_IMAGE = 'TextToImage'
+    IMAGE_EDITING = 'ImageEditing'
+    MULTI_MODAL = 'MultiModal'
+
+
+class FileConstants:
+    IMAGE_PATH = 'image_path'
+    ID = 'id'
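Several of these enum-like values are rewritten between 0.17 and 1.0 (EvalType.SERVICE, for instance, now maps to the literal 'openai_api'), so downstream code is safer comparing against the constants rather than hard-coded strings. A small sketch assuming only the classes shown above:

from evalscope.constants import EvalType, Tags

def is_api_eval(eval_type: str) -> bool:
    # EvalType.SERVICE now carries the literal 'openai_api'; compare against the
    # constant instead of a 0.x-era hard-coded string.
    return eval_type == EvalType.SERVICE

print(is_api_eval(EvalType.SERVICE))  # True
print(Tags.MULTIPLE_CHOICE)           # 'MCQ'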
evalscope/evaluator/__init__.py
CHANGED