evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0

evalscope/api/filter/__init__.py
@@ -0,0 +1 @@
+from .filter import Filter, FilterEnsemble, build_filter_ensemble

evalscope/api/filter/filter.py
@@ -0,0 +1,72 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Iterable, List, Union
+
+from evalscope.api.registry import get_filter
+
+
+class Filter(ABC):
+    """
+    Filter classes operate on a sample level.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
+        """
+
+    @abstractmethod
+    def apply(self, instance: List[str]) -> List[str]:
+
+        return instance
+
+    def __call__(self, instance: str) -> str:
+        """
+        Allows the filter to be called like a function.
+        """
+        return self.apply([instance])[0]
+
+
+@dataclass
+class FilterEnsemble:
+    """
+    FilterEnsemble creates a pipeline applying multiple filters.
+    Its intended usage is to stack multiple post-processing steps in order.
+    """
+
+    name: str
+    filters: List[Filter]
+
+    def apply(self, instance: List[str]) -> List[str]:
+
+        for f in self.filters:
+            # apply filters in sequence
+            instance = f.apply(instance)
+
+        return instance
+
+    def __call__(self, instance: str) -> str:
+        """
+        Allows the filter ensemble to be called like a function.
+        """
+        return self.apply([instance])[0]
+
+
+def build_filter_ensemble(name: str = 'default', filters: Dict[str, Any] = {}) -> FilterEnsemble:
+    """
+    Create a filtering pipeline.
+    """
+    filter_funcs = []
+    for filter_name, filter_args in filters.items():
+        filter_cls = get_filter(filter_name)
+        if isinstance(filter_args, list):
+            filter_function = filter_cls(*filter_args)
+        elif isinstance(filter_args, dict):
+            filter_function = filter_cls(**filter_args)
+        else:
+            # Assume single value for simple filters
+            filter_function = filter_cls(filter_args)
+        # add the filter as a pipeline step
+        filter_funcs.append(filter_function)
+
+    return FilterEnsemble(name=name, filters=filter_funcs)
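
A minimal usage sketch of the filter pipeline above. The `StripWhitespace` filter is hypothetical, written here for illustration; the release ships its own filters under evalscope/filters/ (see extraction.py and selection.py in the file list).

from evalscope.api.filter import Filter, FilterEnsemble

class StripWhitespace(Filter):
    """Hypothetical filter: trims surrounding whitespace from each sample."""

    def apply(self, instance):
        return [s.strip() for s in instance]

ensemble = FilterEnsemble(name='demo', filters=[StripWhitespace()])
print(ensemble('  The answer is 42.  '))  # -> 'The answer is 42.'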

evalscope/api/messages/__init__.py
@@ -0,0 +1,12 @@
+from .chat_message import (
+    ChatMessage,
+    ChatMessageAssistant,
+    ChatMessageSystem,
+    ChatMessageTool,
+    ChatMessageUser,
+    dict_to_chat_message,
+    messages_pretty_str,
+    messages_to_markdown,
+)
+from .content import Content, ContentAudio, ContentData, ContentImage, ContentReasoning, ContentText, ContentVideo
+from .utils import parse_content_with_reasoning

evalscope/api/messages/chat_message.py
@@ -0,0 +1,243 @@
+import uuid
+from pydantic import BaseModel, Field, JsonValue, model_validator
+from typing import Any, Dict, List, Literal, Optional, Type, Union
+
+from evalscope.api.tool import ToolCall, ToolCallError
+from .content import Content, ContentImage, ContentReasoning, ContentText
+from .utils import parse_content_with_reasoning
+
+
+class ChatMessageBase(BaseModel):
+    """Base class for chat messages."""
+
+    id: Optional[str] = Field(default=None)
+    """Unique identifier for message."""
+
+    content: Union[str, List[Content]]
+    """Content (simple string or list of content objects)"""
+
+    source: Optional[Literal['input', 'generate']] = Field(default=None)
+    """Source of message."""
+
+    metadata: Optional[Dict[str, Any]] = Field(default=None)
+    """Additional message metadata."""
+
+    internal: Optional[JsonValue] = Field(default=None)
+    """Model provider specific payload - typically used to aid transformation back to model types."""
+
+    def model_post_init(self, __context: Any) -> None:
+        # Generate ID
+        if self.id is None:
+            self.id = uuid.uuid4().hex[:8]  # Shorten to 8 characters for simplicity
+
+    @property
+    def text(self) -> str:
+        """Get the text content of this message.
+
+        ChatMessage content is very general and can contain either
+        a simple text value or a list of content parts (each of which
+        can either be text or an image). Solvers (e.g. for prompt
+        engineering) often need to interact with chat messages with
+        the assumption that they are a simple string. The text
+        property returns either the plain str content, or if the
+        content is a list of text and images, the text items
+        concatenated together (separated by newline)
+        """
+        if isinstance(self.content, str):
+            return self.content
+        else:
+            all_text = [content.text for content in self.content if content.type == 'text']
+            return '\n'.join(all_text)
+
+    @text.setter
+    def text(self, text: str) -> None:
+        """Set the primary text content for this message.
+
+        ChatMessage content is very general and can contain either
+        a simple text value or a list of content parts (each of which
+        can either be text or an image). Solvers (e.g. for prompt
+        engineering) often need to interact with chat messages with
+        the assumption that they are a simple string. The text property
+        sets text either to content directly (if it is a `str`) or to
+        the first text content item in the message (inserting one at
+        the beginning if necessary). If there are multiple text content
+        items in the message then after the set there will be only
+        one remaining (image content will remain).
+        """
+        if isinstance(self.content, str):
+            self.content = text
+        else:
+            all_other = [content for content in self.content if content.type != 'text']
+            self.content = all_other + [ContentText(text=text)]
+
+
+class ChatMessageSystem(ChatMessageBase):
+    """System chat message."""
+
+    role: Literal['system'] = Field(default='system')
+    """Conversation role."""
+
+
+class ChatMessageUser(ChatMessageBase):
+    """User chat message."""
+
+    role: Literal['user'] = Field(default='user')
+    """Conversation role."""
+
+    tool_call_id: Optional[List[str]] = Field(default=None)
+    """ID(s) of tool call(s) this message has the content payload for."""
+
+
+class ChatMessageAssistant(ChatMessageBase):
+    """Assistant chat message."""
+
+    role: Literal['assistant'] = Field(default='assistant')
+    """Conversation role."""
+
+    tool_calls: Optional[List[ToolCall]] = Field(default=None)
+    """Tool calls made by the model."""
+
+    model: Optional[str] = Field(default=None)
+    """Model used to generate assistant message."""
+
+    # Some OpenAI compatible REST endpoints include reasoning as a field alongside
+    # content, however since this field doesn't exist in the OpenAI interface,
+    # hosting providers (so far we've seen this with Together and Groq) may
+    # include the reasoning in a <think></think> tag before the main response.
+    # We expect this pattern to be repeated elsewhere, so include this hook to
+    # automatically extract the reasoning content when the response is prefaced
+    # with a <think> block. If this ends up being an overreach we can fall back
+    # to each provider manually parsing out <think> using a helper function.
+    # The implementation isn't important here, the critical thing to establish
+    # is that EvalScope makes reasoning content available separately.
+    @model_validator(mode='before')
+    @classmethod
+    def extract_reasoning(cls, data: Any) -> Any:
+        if isinstance(data, dict):
+            # cleave apart <think> blocks
+            content = data.get('content', None)
+            if isinstance(content, str):
+                content_text, content_reasoning = parse_content_with_reasoning(content)
+                if content_reasoning:
+                    data['content'] = [
+                        content_reasoning,
+                        ContentText(text=content_text),
+                    ]
+            # migrate messages that have an explicit 'reasoning' field
+            # (which was our original representation of reasoning)
+            reasoning = data.get('reasoning', None)
+            if isinstance(reasoning, str):
+                # ensure that content is a list
+                content = data.get('content', None)
+                if content is None:
+                    data['content'] = []
+                elif isinstance(content, str):
+                    data['content'] = [ContentText(text=content)]
+                elif not isinstance(content, list):
+                    data['content'] = []
+                data['content'].insert(0, ContentReasoning(reasoning=reasoning))
+
+                del data['reasoning']
+        return data
+
+
+class ChatMessageTool(ChatMessageBase):
+    """Tool chat message."""
+
+    role: Literal['tool'] = Field(default='tool')
+    """Conversation role."""
+
+    tool_call_id: Optional[str] = Field(default=None)
+    """ID of tool call."""
+
+    function: Optional[str] = Field(default=None)
+    """Name of function called."""
+
+    error: Optional[ToolCallError] = Field(default=None)
+    """Error which occurred during tool call."""
+
+
+ChatMessage = Union[ChatMessageSystem, ChatMessageUser, ChatMessageAssistant, ChatMessageTool]
+"""Message in a chat conversation"""
+
+
+def dict_to_chat_message(data: Dict[str, Any]) -> ChatMessage:
+    """Convert a dictionary to a ChatMessage."""
+
+    if isinstance(data, ChatMessage):
+        return data
+
+    if 'role' not in data:
+        raise ValueError('ChatMessage must have a "role" field')
+
+    role = data['role']
+    if role == 'system':
+        return ChatMessageSystem.model_validate(data)
+    elif role == 'user':
+        return ChatMessageUser.model_validate(data)
+    elif role == 'assistant':
+        return ChatMessageAssistant.model_validate(data)
+    elif role == 'tool':
+        return ChatMessageTool.model_validate(data)
+    else:
+        raise ValueError(f'Unknown chat message role: {role}')
+
+
+def messages_pretty_str(messages: List[ChatMessage]) -> str:
+    """Pretty print a list of chat messages. Without images or other multi-modal contents."""
+    output = []
+    for message in messages:
+        role = message.role.capitalize()
+        content = message.text
+        if isinstance(message, ChatMessageTool):
+            if message.error:
+                content += f'\nError: {message.error.message}'
+            if message.function:
+                content += f'\nFunction: {message.function}'
+        output.append(f'**{role}**: {content}')
+    return '\n\n'.join(output)
+
+
+def messages_to_markdown(messages: List[ChatMessage], max_length: Optional[int] = None) -> str:
+    """Convert a list of chat messages to markdown format.
+
+    Args:
+        messages (List[ChatMessage]): The list of chat messages to convert.
+        max_length (Optional[int]): If provided, truncates the base64 string of images to this length.
+    """
+    output = []
+    for message in messages:
+        role = message.role.capitalize()
+
+        # Start with role header
+        content_parts = [f'**{role}**: ']
+
+        # Handle content based on type
+        if isinstance(message.content, str):
+            content_parts.append(message.content)
+        else:
+            for content_item in message.content:
+                if isinstance(content_item, ContentText):
+                    content_parts.append(content_item.text)
+                elif isinstance(content_item, ContentImage):
+                    # Use markdown image syntax
+                    image_base64 = content_item.image
+                    if max_length and len(image_base64) > max_length:
+                        image_base64 = image_base64[:max_length]
+                    content_parts.append(f'![image]({image_base64})')
+                elif isinstance(content_item, ContentReasoning):
+                    content_parts.append(f'**Reasoning:** {content_item.reasoning}')
+
+        # Add tool-specific information
+        if isinstance(message, ChatMessageTool):
+            if message.error:
+                content_parts.append(f'**Error:** {message.error.message}')
+            if message.function:
+                content_parts.append(f'**Function:** {message.function}')
+        elif isinstance(message, ChatMessageAssistant) and message.tool_calls:
+            for tool_call in message.tool_calls:
+                content_parts.append(f'**Tool Call:** {tool_call.function}')
+
+        output.append('\n'.join(content_parts))
+
+    return '\n\n'.join(output)
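
A short sketch of how these message types compose, using only names defined above (values are illustrative). The `before` validator on ChatMessageAssistant splits the <think> block into reasoning and text parts:

from evalscope.api.messages import ChatMessageUser, dict_to_chat_message, messages_pretty_str

msg = dict_to_chat_message({
    'role': 'assistant',
    'content': '<think>6 times 7 is 42.</think>The answer is 42.',
})
print(msg.text)  # -> 'The answer is 42.' (reasoning is kept separately as ContentReasoning)

dialog = [ChatMessageUser(content='What is 6 x 7?'), msg]
print(messages_pretty_str(dialog))  # '**User**: ...' / '**Assistant**: ...'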

evalscope/api/messages/content.py
@@ -0,0 +1,102 @@
+from pydantic import BaseModel, Field, JsonValue
+from typing import Dict, Literal, Optional, Sequence, Union
+
+
+class ContentBase(BaseModel):
+    internal: Optional[JsonValue] = Field(default=None)
+    """Model provider specific payload - typically used to aid transformation back to model types."""
+
+
+class ContentText(ContentBase):
+    """Text content."""
+
+    type: Literal['text'] = Field(default='text')
+    """Type."""
+
+    text: str
+    """Text content."""
+
+    refusal: Optional[bool] = Field(default=None)
+    """Was this a refusal message?"""
+
+
+class ContentReasoning(ContentBase):
+    """Reasoning content.
+
+    See the specification for [thinking blocks](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#understanding-thinking-blocks) for Claude models.
+    """  # noqa: E501
+
+    type: Literal['reasoning'] = Field(default='reasoning')
+    """Type."""
+
+    reasoning: str
+    """Reasoning content."""
+
+    signature: Optional[str] = Field(default=None)
+    """Signature for reasoning content (used by some models to ensure that reasoning content is not modified for replay)"""  # noqa: E501
+
+    redacted: bool = Field(default=False)
+    """Indicates that the explicit content of this reasoning block has been redacted."""
+
+
+class ContentImage(ContentBase):
+    """Image content."""
+
+    type: Literal['image'] = Field(default='image')
+    """Type."""
+
+    image: str
+    """Either a URL of the image or the base64 encoded image data."""
+
+    detail: Literal['auto', 'low', 'high'] = Field(default='auto')
+    """Specifies the detail level of the image.
+
+    Currently only supported for OpenAI. Learn more in the [Vision guide](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
+    """  # noqa: E501
+
+
+class ContentAudio(ContentBase):
+    """Audio content."""
+
+    type: Literal['audio'] = Field(default='audio')
+    """Type."""
+
+    audio: str
+    """Audio file path or base64 encoded data URL."""
+
+    format: Literal['wav', 'mp3']
+    """Format of audio data ('mp3' or 'wav')"""
+
+
+class ContentVideo(ContentBase):
+    """Video content."""
+
+    type: Literal['video'] = Field(default='video')
+    """Type."""
+
+    video: str
+    """Video file path or base64 encoded data URL."""
+
+    format: Literal['mp4', 'mpeg', 'mov']
+    """Format of video data ('mp4', 'mpeg', or 'mov')"""
+
+
+class ContentData(ContentBase):
+    """Model internal."""
+
+    type: Literal['data'] = Field(default='data')
+    """Type."""
+
+    data: Dict[str, JsonValue]
+    """Model provider specific payload - required for internal content."""
+
+
+Content = Union[
+    ContentText,
+    ContentReasoning,
+    ContentImage,
+    ContentAudio,
+    ContentVideo,
+    ContentData,
+]
+"""Content sent to or received from a model."""
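
As an illustration, a multimodal user message mixes these content parts (the URL is a placeholder):

from evalscope.api.messages import ChatMessageUser, ContentImage, ContentText

msg = ChatMessageUser(content=[
    ContentText(text='Describe this image.'),
    ContentImage(image='https://example.com/cat.png', detail='low'),
])
print(msg.text)  # -> 'Describe this image.' (non-text parts are skipped by .text)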

evalscope/api/messages/utils.py
@@ -0,0 +1,35 @@
+import re
+from typing import Optional
+
+from .content import ContentReasoning
+
+
+def parse_content_with_reasoning(content: str) -> tuple[str, Optional[ContentReasoning]]:
+    """
+    Looks for and extracts <think/> tags into reasoning text.
+
+    Returns a tuple:
+    - The first element is the input content with the <think> tag and its contents fully removed.
+    - The second element is a ContentReasoning object (or None if no <think> tag is found).
+    """
+    # Match <think> tag with optional attributes anywhere in the string
+    pattern = (r'<think(?:\s+signature="([^"]*)")?(?:\s+redacted="(true)")?\s*>(.*?)</think>')
+    match = re.search(pattern, content, re.DOTALL)
+
+    if match:
+        signature = match.group(1)  # This will be None if not present
+        redacted_value = match.group(2)  # This will be "true" or None
+        reasoning = match.group(3).strip()
+        # Remove the matched <think>...</think> from the input
+        start, end = match.span()
+
+        return (
+            (content[:start] + content[end:]).strip(),
+            ContentReasoning(
+                reasoning=reasoning,
+                signature=signature,
+                redacted=redacted_value == 'true',
+            ),
+        )
+    else:
+        return content, None
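
Behavior sketch for the parser above (values are illustrative):

from evalscope.api.messages import parse_content_with_reasoning

text, reasoning = parse_content_with_reasoning(
    '<think signature="sig-1">Check the units first.</think>Final answer: 7 m/s'
)
print(text)                 # -> 'Final answer: 7 m/s'
print(reasoning.reasoning)  # -> 'Check the units first.'
print(reasoning.signature)  # -> 'sig-1'
print(reasoning.redacted)   # -> False (no redacted="true" attribute)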

evalscope/api/metric/metric.py
@@ -0,0 +1,55 @@
+from abc import ABC, abstractmethod
+from typing import Callable, Iterable, List, Union
+
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+
+logger = get_logger()
+
+
+class Metric(ABC):
+    """
+    Metric classes operate on a sample level.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        """
+        Can define custom behavior here, if an individual instantiation of a Metric class should have state.
+        """
+
+    @abstractmethod
+    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
+        pass
+
+    def __call__(self, prediction: str, reference: str) -> float:
+        """
+        Allows the metric to be called like a function.
+        """
+        return self.apply([prediction], [reference])[0]
+
+
+class T2IMetric(Metric):
+    _instance = None
+
+    @thread_safe
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, *args, **kwargs):
+        cls = self.__class__
+        if hasattr(self, '_init_done'):
+            return
+        logger.info(f'Initializing {cls.__name__}...')
+        self._init_once(*args, **kwargs)
+        self._init_done = True
+
+    def _init_once(self, *args, **kwargs):
+        pass
+
+    def apply(self, images: List[str], texts: List[str], **kwargs) -> List[Union[float, dict]]:
+        pass
+
+    def __call__(self, image: str, text: str, **kwargs) -> Union[float, dict]:
+        return self.apply([image], [text], **kwargs)[0]
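
A hypothetical Metric subclass, to show the calling convention (the release's real metrics live under evalscope/metrics/):

from typing import List

from evalscope.api.metric.metric import Metric

class ExactMatch(Metric):
    """Hypothetical metric: 1.0 when prediction and reference match exactly."""

    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
        return [float(p.strip() == r.strip()) for p, r in zip(predictions, references)]

metric = ExactMatch()
print(metric('42', '42'))          # -> 1.0 (single-sample call)
print(metric.apply(['a'], ['b']))  # -> [0.0] (batch call)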

evalscope/api/metric/scorer.py
@@ -0,0 +1,113 @@
+from pydantic import BaseModel, Field
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+Value = Dict[str, Union[int, float, bool]]
+
+
+class Score(BaseModel):
+    """Score generated by a scorer."""
+
+    value: Value = Field(default_factory=dict)
+    """Score value as a dictionary. Key is the score name, value is the score value.
+    The first key is considered the main score by default."""
+
+    extracted_prediction: Optional[str] = Field(default=None)
+    """Answer extracted from model output (optional)"""
+
+    prediction: Optional[str] = Field(default=None)
+    """Original prediction text from the model (optional)"""
+
+    explanation: Optional[str] = Field(default=None)
+    """Explanation of score (optional)."""
+
+    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    """Additional metadata related to the score"""
+
+    main_score_name: Optional[str] = Field(default=None)
+    """Main score name, if applicable. This is used to indicate which score is the primary score in a multi-score scenario."""  # noqa: E501
+
+    @property
+    def main_value(self) -> Optional[Union[int, float, bool]]:
+        """Main score value."""
+        if self.main_score_name and self.main_score_name in self.value:
+            return self.value[self.main_score_name]
+        elif self.value:
+            # If main_score_name is not set or not found, use the first value and update main_score_name
+            first_key = next(iter(self.value))
+            self.main_score_name = first_key
+            return self.value[first_key]
+        return None
+
+    @main_value.setter
+    def main_value(self, value: Union[int, float, bool]):
+        """Set the main score value."""
+        if self.main_score_name:
+            # If main_score_name is already set, use it
+            self.value[self.main_score_name] = value
+        elif self.value:
+            # If no main_score_name but value dict exists, use the first key
+            first_key = next(iter(self.value))
+            self.main_score_name = first_key
+            self.value[first_key] = value
+        else:
+            # If neither main_score_name nor value dict exists, initialize both
+            self.main_score_name = 'default'
+            self.value[self.main_score_name] = value
+
+
+class SampleScore(BaseModel):
+    """Score for a Sample."""
+
+    score: Score
+    """A score"""
+
+    sample_id: Optional[Union[str, int]] = Field(default=None)
+    """A sample id"""
+
+    group_id: Optional[Union[str, int]] = Field(default=None)
+    """A group id for the sample, used for grouping k repeated samples."""
+
+    sample_metadata: Optional[Dict[str, Any]] = Field(default=None)
+    """Metadata from the sample"""
+
+
+class AggScore(BaseModel):
+    """Output of an aggregation operation."""
+
+    score: float = Field(default=0.0)
+    """Aggregated value as a float."""
+
+    metric_name: str = Field(default='')
+    """Name of the metric being aggregated."""
+
+    aggregation_name: str = Field(default='')
+    """Name of the aggregation method."""
+
+    num: int = Field(default=0)
+    """Number of samples used in the aggregation."""
+
+    ids: Optional[List[Union[str, int]]] = Field(default=None)
+    """List of sample IDs used in the aggregation, if applicable."""
+
+    metadata: Optional[Dict[str, Any]] = Field(default=None)
+    """Additional metadata related to the aggregation."""
+
+
+class Aggregator:
+
+    name = 'default'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        r"""Aggregate a metric on a list of scores.
+
+        Args:
+            scores: List of scores.
+
+        Returns:
+            List[AggScore]: List of aggregated outputs.
+        """
+        ...
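
How the main-score plumbing behaves (values are illustrative):

from evalscope.api.metric.scorer import SampleScore, Score

score = Score(value={'acc': 1.0, 'f1': 0.8}, extracted_prediction='B')
print(score.main_value)       # -> 1.0 (first key becomes the main score)
print(score.main_score_name)  # -> 'acc'

sample_score = SampleScore(score=score, sample_id='demo-001')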

evalscope/api/mixin/__init__.py
@@ -0,0 +1 @@
+from .llm_judge_mixin import LLMJudgeMixin