evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/filters/__init__.py
@@ -0,0 +1,2 @@
+from .extraction import *
+from .selection import *
evalscope/filters/extraction.py
@@ -0,0 +1,126 @@
+import re
+from typing import List
+
+from evalscope.api.filter import Filter
+from evalscope.api.registry import register_filter
+
+
+@register_filter('regex')
+class RegexFilter(Filter):
+    """A filter that extracts values from text using regex pattern matching.
+
+    This filter applies a regex pattern to each model response and extracts matched values.
+    If no match is found, returns a fallback value. Useful for extracting structured data
+    (like numbers) from unstructured model outputs.
+    """
+
+    def __init__(
+        self,
+        regex_pattern: str = r'#### (\-?[0-9\.\,]+)',
+        group_select: int = 0,
+        fallback: str = '[invalid]',
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Apply regex pattern to each string in the instance list."""
+        filtered = []
+        for resp in instance:
+            match = self.regex.findall(resp)
+            if match:
+                match = match[self.group_select]
+                if isinstance(match, tuple):
+                    match = [m for m in match if m]
+                    if match:
+                        match = match[0]
+                    else:
+                        match = self.fallback
+                match = match.strip()
+            else:
+                match = self.fallback
+            filtered.append(match)
+        return filtered
+
+
+@register_filter('regex_pos')
+class POSFilter(Filter):
+    """ """
+
+    def __init__(
+        self,
+        regex_pattern: str = r"\['(.*?)'\]",
+        group_select=0,
+        fallback=None,
+    ) -> None:
+        """
+        pass a string `regex` to run `re.compile(r"regex")` on.
+        `fallback` defines the output returned if no matches for the regex are located.
+        """
+        if fallback is None:
+            fallback = ['invalid']
+        self.regex_pattern = regex_pattern
+        self.regex = re.compile(regex_pattern)
+        self.group_select = group_select
+        self.fallback = fallback
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Extract POS tags from each string in the instance list."""
+
+        def extract_tagged_tokens(text):
+            # Extract tagged tokens list from text input using regex
+            tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+            return [(token, pos) for token, pos in tokens]
+
+        def extract_pos_tags(result):
+            pos_tags = []
+            if isinstance(result, str):
+                result = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result)
+            return pos_tags if pos_tags else self.fallback
+
+        filtered = []
+        for resp in instance:
+            match = extract_pos_tags(resp)
+            filtered.append(str(match))
+        return filtered
+
+
+@register_filter('remove_whitespace')
+class WhitespaceFilter(Filter):
+    """Filters out leading whitespace from responses."""
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Remove leading whitespace from each string in the instance list."""
+        filtered_resp = []
+        for resp in instance:
+            resp = resp.lstrip()
+            filtered_resp.append(resp)
+        return filtered_resp
+
+
+@register_filter('remove_until')
+class RemoveUntilFilter(Filter):
+    """Filters out all text until a specified delimiter is found."""
+
+    def __init__(self, delimiter: str) -> None:
+        self.delimiter = delimiter
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """Remove all text until the delimiter from each string in the instance list."""
+        filtered_resp = []
+        for resp in instance:
+            resp = resp.split(self.delimiter, 1)[-1]
+            filtered_resp.append(resp)
+        return filtered_resp
+
+
+@register_filter('extract')
+class ExtractFilter(RegexFilter):
+    ...
evalscope/filters/selection.py
@@ -0,0 +1,57 @@
+from collections import Counter
+from typing import List
+
+from evalscope.api.filter import Filter
+from evalscope.api.registry import register_filter
+
+
+@register_filter('take_first')
+class TakeFirstFilter(Filter):
+
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Take only the first response from the instance list.
+        """
+        return [instance[0]] if instance else []
+
+
+@register_filter('take_first_k')
+class TakeKFilter(Filter):
+
+    def __init__(self, **kwargs) -> None:
+        self.k = kwargs.pop('k')
+        super().__init__(**kwargs)
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Take the first k responses from the instance list.
+        """
+        assert len(instance) >= self.k, (
+            f'Need at least {self.k} responses to take first {self.k}, but got {len(instance)} only!'
+        )
+        return instance[:self.k]
+
+
+@register_filter('majority_vote')
+class MajorityVoteFilter(Filter):
+
+    def __init__(self) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    def apply(self, instance: List[str]) -> List[str]:
+        """
+        Select the response that occurs most frequently in the instance list.
+        """
+        if not instance:
+            return []
+
+        counts = Counter(instance)
+        vote = counts.most_common(1)[0][0]
+        return [vote]
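
A minimal usage sketch (not part of the diff) of the new filter classes above, assuming the filters import as defined; the sample responses and the commented outputs are illustrative only:

from evalscope.filters.extraction import RegexFilter
from evalscope.filters.selection import MajorityVoteFilter

# Raw model responses; the default pattern extracts '#### <answer>' lines.
responses = [
    'Step 1: 2 + 2 = 4\n#### 4',
    'So the final answer is\n#### 4',
    'I am not sure about this one.',
]

extractor = RegexFilter()             # falls back to '[invalid]' when nothing matches
answers = extractor.apply(responses)  # expected: ['4', '4', '[invalid]']

voter = MajorityVoteFilter()
print(voter.apply(answers))           # expected: ['4']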
evalscope/metrics/__init__.py
@@ -4,12 +4,18 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule

 if TYPE_CHECKING:
-    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
     from .llm_judge import DEFAULT_NUMERIC_SCORE_TEMPLATE, DEFAULT_PROMPT_TEMPLATE, LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
-    from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
-                          weighted_mean)
-    from .named_metrics import Metric, metric_registry
+    from .metric import PassAtK
+    from .metrics import (
+        bleu_ngram_one_sample,
+        exact_match,
+        macro_mean,
+        mean,
+        micro_mean,
+        simple_f1_score,
+        weighted_mean,
+    )
     from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

 else:
@@ -23,9 +29,8 @@ else:
             'simple_f1_score',
             'weighted_mean',
         ],
-        'named_metrics': [
-            'Metric',
-            'metric_registry',
+        'metric': [
+            'PassAtK',
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
@@ -41,12 +46,7 @@
             'extract_answer',
             'math_equal',
             'strip_answer_string',
-        ],
-        'completion_parsers': [
-            'ResponseParser',
-            'lmsys_parser',
-            'ranking_parser',
-        ],
+        ]
     }

     import sys
evalscope/metrics/llm_judge.py
@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional

+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger

@@ -48,17 +49,18 @@ class LLMJudge:
     """

     def __init__(
-            self,
-            api_key: Optional[str] = None,
-            api_url: Optional[str] = None,
-            model_id: Optional[str] = None,
-            system_prompt: Optional[str] = None,
-            prompt_template: Optional[str] = None,
-            generation_config: Optional[Dict[str, Any]] = None,
-            score_pattern: Optional[str] = None,
-            score_mapping: Optional[Dict[str, float]] = None,
-            score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
-            **kwargs):
+        self,
+        api_key: Optional[str] = None,
+        api_url: Optional[str] = None,
+        model_id: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        prompt_template: Optional[str] = None,
+        generation_config: Optional[Dict[str, Any]] = None,
+        score_pattern: Optional[str] = None,
+        score_mapping: Optional[Dict[str, float]] = None,
+        score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
+        **kwargs
+    ):
         """
         Initialize LLMJudge metric.

@@ -79,14 +81,15 @@
         self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
         self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-        self.generation_config = generation_config or {}
+        self.generation_config = generation_config or {'temperature': 0.0, 'max_tokens': 1024}

         # Default score mapping for A/B pattern
         self.score_type = score_type
         if self.score_type == JudgeScoreType.NUMERIC:
             self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
-            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
-                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
+            self.prompt_template = prompt_template or os.environ.get(
+                'JUDGE_PROMPT_TEMPLATE', DEFAULT_NUMERIC_SCORE_TEMPLATE
+            )
         elif self.score_type == JudgeScoreType.PATTERN:
             self.score_pattern = score_pattern or r'(A|B)'
             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
@@ -97,36 +100,47 @@
         self._init_server_adapter()

     def _init_server_adapter(self):
-        from evalscope.models import ServerModelAdapter
-
-        # Initialize ServerModelAdapter
-        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
-
-    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
+        from evalscope.api.model import GenerateConfig, get_model
+
+        self.model = get_model(
+            model=self.model_id,
+            eval_type='openai_api',
+            base_url=self.api_url,
+            api_key=self.api_key,
+            config=GenerateConfig(**self.generation_config),
+        )
+
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
         Returns:
             str: The response from the LLM
         """
-        input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
-
-        # Inference configuration
-        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
-        if self.generation_config:
-            infer_cfg.update(self.generation_config)
-
-        if self.model_id == DEFAULT_JUDGE_MODEL:
-            # Disable thinking for the default judge model
-            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
-
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
-            response = self.server_adapter.process_single_input(input_data, infer_cfg)
+            response = self.model.generate(input_messages)

             # Extract content from response
-            llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
+            llm_response = response.completion
             return llm_response
         except Exception as e:
             logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
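
A hedged sketch of the reworked judge API above: LLMJudge now builds an OpenAI-compatible model via get_model() and exposes judge() in place of the old __call__(). The endpoint, API key, and model id below are placeholders, not package defaults:

from evalscope.metrics.llm_judge import LLMJudge

judge = LLMJudge(
    api_url='https://example.com/v1',  # placeholder judge endpoint
    api_key='EMPTY',                   # placeholder key
    model_id='my-judge-model',         # placeholder judge model id
    generation_config={'temperature': 0.0, 'max_tokens': 1024},
)

# Plain prompt plus optional system prompt; a ChatMessage list can be passed via messages= instead.
verdict = judge.judge(
    prompt='Question: 1 + 1 = ?\nAnswer A: 2\nAnswer B: 3\nWhich answer is correct?',
    system_prompt='You are a strict grader. Reply with A or B only.',
)
print(verdict)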
evalscope/metrics/math_parser.py
@@ -153,9 +153,11 @@ def strip_answer_string(string):

     # cdot
     # string = string.replace("\\cdot", "")
-    if (string.startswith('{') and string.endswith('}') and string.isalnum()
-            or string.startswith('(') and string.endswith(')') and string.isalnum()
-            or string.startswith('[') and string.endswith(']') and string.isalnum()):
+    if (
+        string.startswith('{') and string.endswith('}') and string.isalnum()
+        or string.startswith('(') and string.endswith(')') and string.isalnum()
+        or string.startswith('[') and string.endswith(']') and string.isalnum()
+    ):
         string = string[1:-1]

     # inf
@@ -387,9 +389,8 @@ def math_equal(

     ## deal with [], (), {}
     pred_str, ref_str = prediction, reference
-    if (prediction.startswith('[') and prediction.endswith(']')
-            and not reference.startswith('(')) or (prediction.startswith('(') and prediction.endswith(')')
-                                                   and not reference.startswith('[')):
+    if (prediction.startswith('[') and prediction.endswith(']') and not reference.startswith('(')
+        ) or (prediction.startswith('(') and prediction.endswith(')') and not reference.startswith('[')):
         pred_str = pred_str.strip('[]()')
         ref_str = ref_str.strip('[]()')
         for s in ['{', '}', '(', ')']:
@@ -399,25 +400,29 @@
             return True

     ## [a, b] vs. [c, d], return a==c and b==d
-    if (regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
-            and regex.match(r'(\(|\[).+(\)|\])', reference) is not None):
+    if (
+        regex.match(r'(\(|\[).+(\)|\])', prediction) is not None
+        and regex.match(r'(\(|\[).+(\)|\])', reference) is not None
+    ):
         pred_parts = prediction[1:-1].split(',')
         ref_parts = reference[1:-1].split(',')
         if len(pred_parts) == len(ref_parts):
-            if all(
-                [math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close)
-                 for i in range(len(pred_parts))]):
+            if all([
+                math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))
+            ]):
                 return True
     if ((prediction.startswith('\\begin{pmatrix}') or prediction.startswith('\\begin{bmatrix}'))
-            and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
-            and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
-            and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
+        and (prediction.endswith('\\end{pmatrix}') or prediction.endswith('\\end{bmatrix}'))
+        and (reference.startswith('\\begin{pmatrix}') or reference.startswith('\\begin{bmatrix}'))
+        and (reference.endswith('\\end{pmatrix}') or reference.endswith('\\end{bmatrix}'))):
         pred_lines = [
-            line.strip() for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
+            line.strip()
+            for line in prediction[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
             if line.strip()
         ]
         ref_lines = [
-            line.strip() for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
+            line.strip()
+            for line in reference[len('\\begin{pmatrix}'):-len('\\end{pmatrix}')].split('\\\\')
             if line.strip()
         ]
         matched = True
@@ -427,12 +432,12 @@
             ref_parts = ref_line.split('&')
             if len(pred_parts) == len(ref_parts):
                 if not all([
-                        math_equal(
-                            pred_parts[i],
-                            ref_parts[i],
-                            include_percentage,
-                            is_close,
-                        ) for i in range(len(pred_parts))
+                    math_equal(
+                        pred_parts[i],
+                        ref_parts[i],
+                        include_percentage,
+                        is_close,
+                    ) for i in range(len(pred_parts))
                 ]):
                     matched = False
                     break
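
A small sketch of the list-comparison path reformatted above: math_equal splits bracketed answers on ',' and compares the parts pairwise. Keyword arguments are passed explicitly because only part of the signature is visible in this diff; the commented results are the expected outcomes under that logic:

from evalscope.metrics.math_parser import math_equal

print(math_equal('[1, 2]', '[1, 2]', include_percentage=True, is_close=True))  # expected: True
print(math_equal('[1, 2]', '[1, 3]', include_percentage=True, is_close=True))  # expected: False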