evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/mixin/llm_judge_mixin.py (new file)
@@ -0,0 +1,168 @@
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.constants import JudgeStrategy
+from evalscope.metrics import LLMJudge
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class LLMJudgeMixin:
+    """
+    Mixin class for LLM Judge functionality.
+    """
+
+    def __init__(self, task_config: 'TaskConfig'):
+        self._task_config = task_config
+        self._use_llm_judge = False
+        """Whether to use LLM as a judge"""
+
+        self._llm_judge: Optional[LLMJudge] = None
+
+    @property
+    def llm_judge(self) -> Optional[LLMJudge]:
+        """Get LLM judge instance with lazy initialization."""
+        if self._llm_judge is None and self.use_llm_judge:
+            self._llm_judge = self.init_llm_judge()
+        return self._llm_judge
+
+    @llm_judge.setter
+    def llm_judge(self, value: Optional[LLMJudge]):
+        """Set LLM judge instance."""
+        self._llm_judge = value
+
+    @property
+    def judge_strategy(self) -> str:
+        """Get the judge strategy from the task configuration."""
+        return self._task_config.judge_strategy
+
+    @property
+    def use_llm_judge(self) -> bool:
+        """Check if LLM judge is enabled."""
+        if self.judge_strategy == JudgeStrategy.RULE:
+            return False
+        elif self.judge_strategy == JudgeStrategy.LLM:
+            return True
+        elif self.judge_strategy == JudgeStrategy.LLM_RECALL:
+            return True
+        elif self.judge_strategy == JudgeStrategy.AUTO:
+            return self._use_llm_judge
+        else:
+            logger.warning(f'Unknown judge strategy: {self.judge_strategy}. Defaulting to False.')
+            return False
+
+    def init_llm_judge(self) -> Optional[LLMJudge]:
+        """
+        Initialize the LLM judge for the benchmark.
+
+        Returns:
+            Optional[LLMJudge]: The initialized LLM judge instance or None
+        """
+
+        if self.judge_strategy == JudgeStrategy.RULE:
+            return None
+        else:
+            return LLMJudge(**self._task_config.judge_model_args)
+
+    def maybe_llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+        rule_based_score: Optional[Score] = None,
+    ) -> Score:
+        """
+        Compute the match score between the original and filtered predictions against the reference.
+
+        Args:
+            original_prediction: The original prediction output from the model.
+            filtered_prediction: The filtered prediction output from the model.
+            reference: The ground truth reference output.
+            task_state: The current task state.
+            rule_based_score: Optional rule-based score to be used for comparison.
+
+        Returns:
+            Score: The computed match score.
+        """
+        # If LLM judge is not used, return the rule-based score directly
+        if not self.use_llm_judge:
+            return rule_based_score
+
+        # For LLM_RECALL, if rule-based score is already perfect, skip LLM judge
+        if float(rule_based_score.main_value) > 0.99:
+            return rule_based_score
+
+        # Compute LLM judge score
+        llm_score = self.llm_match_score(
+            original_prediction=original_prediction,
+            filtered_prediction=filtered_prediction,
+            reference=reference,
+            task_state=task_state,
+        )
+
+        # For LLM_RECALL, merge the scores
+        return self._merge_scores(rule_based_score, llm_score)
+
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        """Compute the LLM match score.
+
+        Args:
+            original_prediction (str): The original prediction output from the model.
+            filtered_prediction (str): The filtered prediction output from the model.
+            reference (str): The ground truth reference output.
+            task_state (TaskState): The current task state.
+
+        Returns:
+            Score: The computed match score.
+        """
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        question = task_state.input_text
+
+        # Request judge and obtain score
+        prompt = self.llm_judge.build_prompt(pred=original_prediction, gold=reference, question=question)
+        judge_response = self.llm_judge.judge(prompt)
+        judge_score = self.llm_judge.get_score(judge_response)
+
+        score.value = {'acc': judge_score}
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id
+        }
+
+        return score
+
+    def _merge_scores(self, rule_based_score: Score, llm_score: Score) -> Score:
+        """
+        Merge rule-based score with LLM judge score for LLM_RECALL strategy.
+
+        Args:
+            rule_based_score: The original rule-based score
+            llm_score: The LLM judge score
+
+        Returns:
+            Score: The merged score
+        """
+        # Update the main value with LLM judge result
+        rule_based_score.main_value = llm_score.main_value
+        rule_based_score.explanation = llm_score.explanation
+        rule_based_score.metadata = llm_score.metadata
+
+        return rule_based_score
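For context, a benchmark adapter might wire this mixin in roughly as follows. This is a minimal sketch, not code from the package: SimpleQAAdapter, its exact-match rule scoring, and the assumption that Score.main_value is derived from the 'acc' value are illustrative only; the import paths and the maybe_llm_match_score call pattern follow the file above.

# Hypothetical adapter showing the intended call pattern of LLMJudgeMixin (sketch, not package code).
from evalscope.api.evaluator import TaskState
from evalscope.api.metric import Score
from evalscope.api.mixin.llm_judge_mixin import LLMJudgeMixin


class SimpleQAAdapter(LLMJudgeMixin):
    """Illustrative adapter; a real adapter would also implement dataset loading, prompting, etc."""

    def match_score(self, original: str, filtered: str, reference: str, task_state: TaskState) -> Score:
        # Cheap rule-based exact match first; 'acc' mirrors the key used by llm_match_score above.
        rule_score = Score(extracted_prediction=filtered, prediction=original)
        rule_score.value = {'acc': float(filtered.strip() == reference.strip())}

        # Depending on task_config.judge_strategy this returns the rule score unchanged (RULE),
        # replaces it with an LLM judgement (LLM), or re-judges only imperfect answers (LLM_RECALL).
        return self.maybe_llm_match_score(
            original_prediction=original,
            filtered_prediction=filtered,
            reference=reference,
            task_state=task_state,
            rule_based_score=rule_score,
        )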
evalscope/api/model/__init__.py (new file)
@@ -0,0 +1,12 @@
+from .generate_config import GenerateConfig
+from .model import Model, ModelAPI, get_model, get_model_with_task_config
+from .model_output import (
+    ChatCompletionChoice,
+    Logprob,
+    Logprobs,
+    ModelOutput,
+    ModelUsage,
+    StopReason,
+    TopLogprob,
+    as_stop_reason,
+)
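These re-exports define the public surface of evalscope.api.model, so callers can import from the package instead of its submodules; a trivial usage sketch:

# Illustrative import using the re-exports above (names taken verbatim from the __init__).
from evalscope.api.model import GenerateConfig, ModelOutput, get_model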
evalscope/api/model/generate_config.py (new file)
@@ -0,0 +1,155 @@
+# flake8: noqa: E501
+from copy import deepcopy
+from pydantic import BaseModel, Field, model_validator
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from evalscope.utils.json_schema import JSONSchema
+
+
+class ResponseSchema(BaseModel):
+    """Schema for model response when using Structured Output."""
+
+    name: str
+    """The name of the response schema. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64."""
+
+    json_schema: JSONSchema
+    """The schema for the response format, described as a JSON Schema object."""
+
+    description: Optional[str] = Field(default=None)
+    """A description of what the response format is for, used by the model to determine how to respond in the format."""
+
+    strict: Optional[bool] = Field(default=None)
+    """Whether to enable strict schema adherence when generating the output. If set to true, the model will always follow the exact schema defined in the schema field.
+    OpenAI and Mistral only."""
+
+
+class GenerateConfig(BaseModel):
+    """Model generation options."""
+    model_config = {'extra': 'allow'}
+
+    timeout: Optional[int] = Field(default=None)
+    """Request timeout (in seconds)."""
+
+    batch_size: Optional[int] = Field(default=None)
+    """Maximum number of concurrent connections to Model API (default is model specific) or batch size for generation."""
+
+    stream: Optional[bool] = Field(default=None)
+    """Whether to stream the response (default is model specific)."""
+
+    system_message: Optional[str] = Field(default=None)
+    """Override the default system message."""
+
+    max_tokens: Optional[int] = Field(default=None)
+    """The maximum number of tokens that can be generated in the completion (default is model specific)."""
+
+    top_p: Optional[float] = Field(default=None)
+    """An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass."""
+
+    temperature: Optional[float] = Field(default=None)
+    """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."""
+
+    stop_seqs: Optional[List[str]] = Field(default=None)
+    """Sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."""
+
+    best_of: Optional[int] = Field(default=None)
+    """Generates best_of completions server-side and returns the 'best' (the one with the highest log probability per token). vLLM only."""
+
+    frequency_penalty: Optional[float] = Field(default=None)
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
+
+    presence_penalty: Optional[float] = Field(default=None)
+    """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
+
+    logit_bias: Optional[Dict[int, float]] = Field(default=None)
+    """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, and vLLM only."""
+
+    seed: Optional[int] = Field(default=None)
+    """Random seed. OpenAI, Google, Mistral, Groq, HuggingFace, and vLLM only."""
+
+    do_sample: Optional[bool] = Field(default=None)
+    """Whether to use sampling; use greedy decoding otherwise. Only transformers models support this parameter."""
+
+    top_k: Optional[int] = Field(default=None)
+    """Randomly sample the next word from the top_k most likely next words. Anthropic, Google, HuggingFace, vLLM, and SGLang only."""
+
+    n: Optional[int] = Field(default=None)
+    """How many chat completion choices to generate for each input message. OpenAI, Grok, Google, TogetherAI, vLLM, and SGLang only."""
+
+    logprobs: Optional[bool] = Field(default=None)
+    """Return log probabilities of the output tokens. OpenAI, Grok, TogetherAI, Huggingface, llama-cpp-python, vLLM, and SGLang only."""
+
+    top_logprobs: Optional[int] = Field(default=None)
+    """Number of most likely tokens (0-20) to return at each token position, each with an associated log probability. OpenAI, Grok, Huggingface, vLLM, and SGLang only."""
+
+    parallel_tool_calls: Optional[bool] = Field(default=None)
+    """Whether to enable parallel function calling during tool use (defaults to True). OpenAI and Groq only."""
+
+    internal_tools: Optional[bool] = Field(default=None)
+    """Whether to automatically map tools to model internal implementations (e.g. 'computer' for anthropic)."""
+
+    max_tool_output: Optional[int] = Field(default=None)
+    """Maximum tool output (in bytes). Defaults to 16 * 1024."""
+
+    cache_prompt: Union[Literal['auto'], bool, None] = Field(default=None)
+    """Whether to cache the prompt prefix. Defaults to "auto", which will enable caching for requests with tools. Anthropic only."""
+
+    reasoning_effort: Optional[Literal['low', 'medium', 'high']] = Field(default=None)
+    """Constrains effort on reasoning for reasoning models (defaults to `medium`). OpenAI o1 models only."""
+
+    reasoning_tokens: Optional[int] = Field(default=None)
+    """Maximum number of tokens to use for reasoning. Anthropic Claude models only."""
+
+    reasoning_summary: Optional[Literal['concise', 'detailed', 'auto']] = Field(default=None)
+    """Provide summary of reasoning steps (defaults to no summary). Use 'auto' to access the most detailed summarizer available for the current model. OpenAI reasoning models only."""
+
+    reasoning_history: Optional[Literal['none', 'all', 'last', 'auto']] = Field(default=None)
+    """Include reasoning in chat message history sent to generate."""
+
+    response_schema: Optional[ResponseSchema] = Field(default=None)
+    """Request a response format as JSONSchema (output should still be validated). OpenAI, Google, and Mistral only."""
+
+    extra_body: Optional[Dict[str, Any]] = Field(default=None)
+    """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+    height: Optional[int] = Field(default=None)
+    """Image height for image generation model only"""
+
+    width: Optional[int] = Field(default=None)
+    """Image width for image generation model only"""
+
+    num_inference_steps: Optional[int] = Field(default=None)
+    """Number of inference steps for image generation model only"""
+
+    guidance_scale: Optional[float] = Field(default=None)
+    """Guidance scale for image generation model only"""
+
+    # migrate reasoning_history as a bool
+    @model_validator(mode='before')
+    @classmethod
+    def migrate_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            reasoning_history = data.get('reasoning_history', None)
+            if reasoning_history is True:
+                data['reasoning_history'] = 'all'
+            elif reasoning_history is False:
+                data['reasoning_history'] = 'none'
+
+        return data
+
+    def merge(self, other: 'GenerateConfig') -> 'GenerateConfig':
+        """Merge another model configuration into this one.
+
+        Args:
+            other (GenerateConfig):
+                Configuration to merge.
+
+        Returns:
+            Merged configuration.
+        """
+        config_keys = [field for field in self.__class__.model_fields.keys()]
+        config = deepcopy(self)
+        for key in config_keys:
+            value = getattr(other, key, None)
+            if value is not None:
+                setattr(config, key, value)
+        return config
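A small usage sketch of GenerateConfig, following only the code above (the concrete field values are illustrative): merge() lets fields set on the override win while unset fields fall back to the base config, and the before-validator migrates legacy boolean reasoning_history values.

# Usage sketch for GenerateConfig; values are illustrative, behavior follows the file above.
from evalscope.api.model.generate_config import GenerateConfig

base = GenerateConfig(temperature=0.0, max_tokens=1024)
override = GenerateConfig(temperature=0.7, top_p=0.9)

merged = base.merge(override)
# Fields set on `override` win; fields left as None fall back to `base`.
assert merged.temperature == 0.7
assert merged.top_p == 0.9
assert merged.max_tokens == 1024

# The mode='before' validator maps legacy booleans onto the Literal values.
assert GenerateConfig(reasoning_history=True).reasoning_history == 'all'
assert GenerateConfig(reasoning_history=False).reasoning_history == 'none'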