evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/filters/extraction.py
ADDED

```diff
@@ -0,0 +1,126 @@
+import re
+from typing import List
+
+from evalscope.api.filter import Filter
+from evalscope.api.registry import register_filter
+
+
+@register_filter('regex')
+class RegexFilter(Filter):
+    """A filter that extracts values from text using regex pattern matching.
+
+    This filter applies a regex pattern to each model response and extracts matched values.
+    If no match is found, returns a fallback value. Useful for extracting structured data
+    (like numbers) from unstructured model outputs.
+    """
+
+    def __init__(
+        self,
+        regex_pattern: str = r'#### (\-?[0-9\.\,]+)',
+        group_select: int = 0,
+        fallback: str = '[invalid]',
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Apply regex pattern to each string in the instance list."""
+        filtered = []
+        for resp in instance:
+            match = self.regex.findall(resp)
+            if match:
+                match = match[self.group_select]
+                if isinstance(match, tuple):
+                    match = [m for m in match if m]
+                    if match:
+                        match = match[0]
+                    else:
+                        match = self.fallback
+                match = match.strip()
+            else:
+                match = self.fallback
+            filtered.append(match)
+        return filtered
+
+
+@register_filter('regex_pos')
+class POSFilter(Filter):
+    """ """
+
+    def __init__(
+        self,
+        regex_pattern: str = r"\['(.*?)'\]",
+        group_select=0,
+        fallback=None,
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        if fallback is None:
+            fallback = ['invalid']
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Extract POS tags from each string in the instance list."""
+
+        def extract_tagged_tokens(text):
+            # Extract tagged tokens list from text input using regex
+            tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+            return [(token, pos) for token, pos in tokens]
+
+        def extract_pos_tags(result):
+            pos_tags = []
+            if isinstance(result, str):
+                result = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result)
+            return pos_tags if pos_tags else self.fallback
+
+        filtered = []
+        for resp in instance:
+            match = extract_pos_tags(resp)
+            filtered.append(str(match))
+        return filtered
+
+
+@register_filter('remove_whitespace')
+class WhitespaceFilter(Filter):
+    """Filters out leading whitespace from responses."""
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Remove leading whitespace from each string in the instance list."""
+        filtered_resp = []
+        for resp in instance:
+            resp = resp.lstrip()
+            filtered_resp.append(resp)
+        return filtered_resp
+
+
+@register_filter('remove_until')
+class RemoveUntilFilter(Filter):
+    """Filters out all text until a specified delimiter is found."""
+
+    def __init__(self, delimiter: str) -> None:
+        self.delimiter = delimiter
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Remove all text until the delimiter from each string in the instance list."""
+        filtered_resp = []
+        for resp in instance:
+            resp = resp.split(self.delimiter, 1)[-1]
+            filtered_resp.append(resp)
+        return filtered_resp
+
+
+@register_filter('extract')
+class ExtractFilter(RegexFilter):
+    ...
```
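For orientation, here is a minimal usage sketch of the new `RegexFilter` (not part of the diff; it assumes evalscope 1.0.0 is installed and uses only the APIs visible above):

```python
# Minimal sketch: apply the default GSM8K-style pattern to a batch of responses.
# Assumes evalscope 1.0.0 is installed; only code shown in the diff above is used.
from evalscope.filters.extraction import RegexFilter

responses = [
    'Step by step... so the total is #### 42',  # pattern matches -> '42'
    'I am not sure about the final answer.',    # no match -> fallback '[invalid]'
]

regex_filter = RegexFilter()          # default pattern r'#### (\-?[0-9\.\,]+)'
print(regex_filter.apply(responses))  # ['42', '[invalid]']
```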
evalscope/filters/selection.py
ADDED

```diff
@@ -0,0 +1,57 @@
+from collections import Counter
+from typing import List
+
+from evalscope.api.filter import Filter
+from evalscope.api.registry import register_filter
+
+
+@register_filter('take_first')
+class TakeFirstFilter(Filter):
+
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Take only the first response from the instance list.
+        """
+        return [instance[0]] if instance else []
+
+
+@register_filter('take_first_k')
+class TakeKFilter(Filter):
+
+    def __init__(self, **kwargs) -> None:
+        self.k = kwargs.pop('k')
+        super().__init__(**kwargs)
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Take the first k responses from the instance list.
+        """
+        assert len(instance) >= self.k, (
+            f'Need at least {self.k} responses to take first {self.k}, but got {len(instance)} only!'
+        )
+        return instance[:self.k]
+
+
+@register_filter('majority_vote')
+class MajorityVoteFilter(Filter):
+
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Select the response that occurs most frequently in the instance list.
+        """
+        if not instance:
+            return []
+
+        counts = Counter(instance)
+        vote = counts.most_common(1)[0][0]
+        return [vote]
```
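A companion sketch for the selection filters: a best-of-n style pipeline that keeps the first k samples and majority-votes over them. The chaining is illustrative only and assumes the `Filter` base class needs no constructor arguments; the diff itself does not prescribe this usage.

```python
# Illustrative sketch: keep the first k sampled answers, then majority-vote.
# Assumes evalscope 1.0.0; the two-step chaining here is hypothetical.
from evalscope.filters.selection import MajorityVoteFilter, TakeKFilter

samples = ['42', '41', '42', '42', '36']

top3 = TakeKFilter(k=3).apply(samples)   # ['42', '41', '42']
print(MajorityVoteFilter().apply(top3))  # ['42']
```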
evalscope/metrics/__init__.py
CHANGED
```diff
@@ -4,12 +4,18 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule

 if TYPE_CHECKING:
-    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
     from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
-    from .
-
-
+    from .metric import PassAtK
+    from .metrics import (
+        bleu_ngram_one_sample,
+        exact_match,
+        macro_mean,
+        mean,
+        micro_mean,
+        simple_f1_score,
+        weighted_mean,
+    )
     from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

 else:
@@ -23,9 +29,8 @@ else:
         'simple_f1_score',
         'weighted_mean',
     ],
-    '
-    '
-    'metric_registry',
+    'metric': [
+        'PassAtK',
     ],
     'rouge_metric': [
         'compute_rouge_score_one_sample_zh',
@@ -41,12 +46,7 @@ else:
         'extract_answer',
         'math_equal',
         'strip_answer_string',
-    ]
-    'completion_parsers': [
-        'ResponseParser',
-        'lmsys_parser',
-        'ranking_parser',
-    ],
+    ]
 }

 import sys
```
evalscope/metrics/llm_judge.py
CHANGED
```diff
@@ -48,17 +48,18 @@ class LLMJudge:
     """

     def __init__(
-
-
-
-
-
-
-
-
-
-
-
+        self,
+        api_key: Optional[str] = None,
+        api_url: Optional[str] = None,
+        model_id: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        prompt_template: Optional[str] = None,
+        generation_config: Optional[Dict[str, Any]] = None,
+        score_pattern: Optional[str] = None,
+        score_mapping: Optional[Dict[str, float]] = None,
+        score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
+        **kwargs
+    ):
         """
         Initialize LLMJudge metric.

@@ -79,14 +80,15 @@ class LLMJudge:
         self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
         self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-        self.generation_config = generation_config or {}
+        self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 1024}

         # Default score mapping for A/B pattern
         self.score_type = score_type
         if self.score_type == JudgeScoreType.NUMERIC:
             self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
-            self.prompt_template = prompt_template or os.environ.get(
-
+            self.prompt_template = prompt_template or os.environ.get(
+                'JUDGE_PROMPT_TEMPLATE', DEFAULT_NUMERIC_SCORE_TEMPLATE
+            )
         elif self.score_type == JudgeScoreType.PATTERN:
             self.score_pattern = score_pattern or r'(A|B)'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
@@ -97,12 +99,17 @@ class LLMJudge:
         self._init_server_adapter()

     def _init_server_adapter(self):
-        from evalscope.
+        from evalscope.api.model import GenerateConfig, get_model

-
-
+        self.model = get_model(
+            model=self.model_id,
+            eval_type='openai_api',
+            base_url=self.api_url,
+            api_key=self.api_key,
+            config=GenerateConfig(**self.generation_config),
+        )

-    def
+    def judge(self, prompt: str, system_prompt: Optional[str] = None) -> str:
         """
         Args:
             prompt (str): The prompt to evaluate
@@ -110,23 +117,18 @@ class LLMJudge:
         Returns:
             str: The response from the LLM
         """
-
-
-        # Inference configuration
-        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
-        if self.generation_config:
-            infer_cfg.update(self.generation_config)
-
-        if self.model_id == DEFAULT_JUDGE_MODEL:
-            # Disable thinking for the default judge model
-            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+        from evalscope.api.messages import ChatMessageSystem, ChatMessageUser

+        system_content = system_prompt or self.system_prompt
+        input_messages = [ChatMessageUser(content=prompt)]
+        if system_content:
+            input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
-            response = self.
+            response = self.model.generate(input_messages)

             # Extract content from response
-            llm_response = response.
+            llm_response = response.completion
             return llm_response
         except Exception as e:
             logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
```
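A hedged sketch of driving the refactored judge after this change: construction now routes through `get_model`/`GenerateConfig` internally, and `judge()` returns the raw completion text. The endpoint, key, and model id below are placeholders, not values from the diff:

```python
# Minimal sketch of the new LLMJudge call path (placeholder endpoint/key/model).
from evalscope.metrics import LLMJudge

judge = LLMJudge(
    api_url='https://example.com/v1',  # placeholder OpenAI-compatible endpoint
    api_key='YOUR_API_KEY',            # placeholder
    model_id='judge-model',            # placeholder judge model id
    generation_config={'temperature': 0.0, 'max_tokens': 1024},
)

verdict = judge.judge('Compare answer A and answer B ... Which is better, A or B?')
print(verdict)  # raw judge completion; pattern/numeric score parsing happens downstream
```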
evalscope/metrics/math_parser.py
CHANGED
```diff
@@ -153,9 +153,11 @@ def strip_answer_string(string):

     # cdot
     # string = string.replace("\\cdot", "")
-    if (
-
-
+    if (
+        string.startswith('{') and string.endswith('}') and string.isalnum()
+        or string.startswith('(') and string.endswith(')') and string.isalnum()
+        or string.startswith('[') and string.endswith(']') and string.isalnum()
+    ):
         string = string[1:-1]

     # inf
@@ -387,9 +389,8 @@ def math_equal(

     ## deal with [], (), {}
     pred_str, ref_str = prediction, reference
-    if (prediction.startswith('[') and prediction.endswith(']')
-
-            and not reference.startswith('[')):
+    if (prediction.startswith('[') and prediction.endswith(']') and not reference.startswith('(')
+        ) or (prediction.startswith('(') and prediction.endswith(')') and not reference.startswith('[')):
         pred_str = pred_str.strip('[]()')
         ref_str = ref_str.strip('[]()')
         for s in ['{', '}', '(', ')']:
@@ -399,25 +400,29 @@ def math_equal(
             return True

     ## [a, b] vs. [c, d], return a==c and b==d
-    if (
-
+    if (
+        regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
+        and regex.match(r'(\(|\[).+(\)|\])', reference) is not None
+    ):
         pred_parts = prediction[1:-1].split(',')
         ref_parts = reference[1:-1].split(',')
         if len(pred_parts) == len(ref_parts):
-            if all(
-
-
+            if all([
+                math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))
+            ]):
                 return True
     if ((prediction.startswith('\\begin{pmatrix}') or prediction.startswith('\\begin{bmatrix}'))
-
-
-
+            and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
+            and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
+            and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
         pred_lines = [
-            line.strip()
+            line.strip()
+            for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
             if line.strip()
         ]
         ref_lines = [
-            line.strip()
+            line.strip()
+            for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
             if line.strip()
         ]
         matched = True
@@ -427,12 +432,12 @@ def math_equal(
                 ref_parts = ref_line.split('&')
                 if len(pred_parts) == len(ref_parts):
                     if not all([
-
-
-
-
-
-
+                        math_equal(
+                            pred_parts[i],
+                            ref_parts[i],
+                            include_percentage,
+                            is_close,
+                        ) for i in range(len(pred_parts))
                     ]):
                         matched = False
                         break
```