evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/api/dataset/utils.py (new file)
@@ -0,0 +1,143 @@
import json
from tqdm import tqdm
from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast

from .dataset import Dataset, FieldSpec, Sample


def record_to_sample_fn(sample_fields: Union[FieldSpec, Callable, None] = None, ) -> Callable:
    if sample_fields is None:
        sample_fields = FieldSpec()

    if isinstance(sample_fields, FieldSpec):

        def record_to_sample(record: dict) -> Sample:
            # collect metadata if specified
            metadata: Optional[Dict[str, Any]] = None
            if sample_fields.metadata:
                if isinstance(sample_fields.metadata, list):
                    metadata = {}
                    for name in sample_fields.metadata:
                        metadata[name] = record.get(name)
            elif 'metadata' in record:
                metadata_field = record.get('metadata')
                if isinstance(metadata_field, str):
                    metadata = json.loads(metadata_field)
                elif isinstance(metadata_field, dict):
                    metadata = metadata_field
                else:
                    raise ValueError(f"Unexpected type for 'metadata' field: {type(metadata_field)}")

            # return sample
            return Sample(
                input=read_input(record.get(sample_fields.input)),
                target=read_target(record.get(sample_fields.target)),
                choices=read_choices(record.get(sample_fields.choices)),
                id=record.get(sample_fields.id, None),
                metadata=metadata,
                sandbox=read_sandbox(record.get(sample_fields.sandbox)),
                files=read_files(record.get(sample_fields.files)),
                setup=read_setup(record.get(sample_fields.setup)),
            )

        return record_to_sample

    else:
        return sample_fields


def data_to_samples(data: Iterable[dict], data_to_sample: Callable) -> List[Sample]:
    samples: List[Sample] = []
    for record in tqdm(data, desc='Processing records'):
        record_samples = as_sample_list(data_to_sample(record=record))
        samples.extend(record_samples)
    return samples


def as_sample_list(samples: Union[Sample, List[Sample]]) -> List[Sample]:
    if isinstance(samples, list):
        return samples
    else:
        return [samples]


def read_input(input_val: Optional[Any]) -> str:
    if not input_val:
        raise ValueError('No input in dataset')
    return str(input_val)


def read_target(obj: Optional[Any]) -> Union[str, List[str]]:
    if obj is not None:
        return [str(item) for item in obj] if isinstance(obj, list) else str(obj)
    else:
        return ''


def read_choices(obj: Optional[Any]) -> Optional[List[str]]:
    if obj is not None:
        if isinstance(obj, list):
            return [str(choice) for choice in obj]
        elif isinstance(obj, str):
            choices = obj.split(',')
            if len(choices) == 1:
                choices = obj.split()
            return [choice.strip() for choice in choices]
        else:
            return [str(obj)]
    else:
        return None


def read_setup(setup: Optional[Any]) -> Optional[str]:
    if setup is not None:
        return str(setup)
    else:
        return None


def read_sandbox(sandbox: Optional[Any]) -> Optional[str]:
    if sandbox is not None:
        if isinstance(sandbox, str):
            return sandbox
        elif isinstance(sandbox, dict):
            return json.dumps(sandbox)
        else:
            raise ValueError(f"Unexpected type for 'sandbox' field: {type(sandbox)}")
    else:
        return None


def read_files(files: Optional[Any]) -> Optional[Dict[str, str]]:
    if files is not None:
        if isinstance(files, str):
            files = json.loads(files)
        if isinstance(files, dict):
            if all(isinstance(v, str) for v in files.values()):
                return cast(Dict[str, str], files)

        # didn't find the right type
        raise ValueError(f"Unexpected type for 'files' field: {type(files)}")
    else:
        return None


def shuffle_choices_if_requested(dataset: Dataset, shuffle_choices: Optional[Union[bool, int]]) -> None:
    """
    Shuffle the choices in the dataset if requested.

    The `shuffle_choices` parameter passed to `json_dataset`, `csv_dataset`,
    and `hf_dataset` can be a boolean, an integer, or `None` (default).
    If it is a boolean, it will shuffle the choices if the value is `True`,
    and do nothing if it is `False`.
    If it is an integer, it will shuffle the choices using the integer as the seed.
    """
    # Note that `isinstance(x, int)` returns True if x is True or False,
    # so we need to check for both explicitly
    if shuffle_choices is True:
        dataset.shuffle_choices()
    elif shuffle_choices is False:
        pass
    elif isinstance(shuffle_choices, int):
        dataset.shuffle_choices(seed=shuffle_choices)
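
Usage note (illustrative sketch, not part of the diff): the helpers above map raw record dicts onto Sample objects through a FieldSpec. The sketch below assumes FieldSpec accepts the field names referenced in record_to_sample (input, target, choices, id) as constructor keyword arguments, and that the module paths match the file layout shown in this diff.

# Sketch only: convert raw records into Sample objects via a FieldSpec mapping.
# Assumption: FieldSpec(input=..., target=..., choices=..., id=...) is a valid call.
from evalscope.api.dataset.dataset import FieldSpec
from evalscope.api.dataset.utils import data_to_samples, record_to_sample_fn

records = [
    {'question': 'What is 2 + 2?', 'answer': '4', 'options': '3, 4, 5', 'idx': 0},
]

spec = FieldSpec(input='question', target='answer', choices='options', id='idx')
to_sample = record_to_sample_fn(spec)          # builds the record -> Sample converter
samples = data_to_samples(records, to_sample)  # iterates records with a tqdm progress bar

# read_choices splits the comma-separated string into ['3', '4', '5']
print(samples[0].input, samples[0].target, samples[0].choices)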
evalscope/api/evaluator/cache.py (new file)
@@ -0,0 +1,355 @@
import copy
import os
from pydantic import BaseModel
from typing import Any, Dict, List, Optional, Tuple, Union

from evalscope.api.dataset import Dataset
from evalscope.api.messages import ChatMessage
from evalscope.api.metric import SampleScore
from evalscope.api.model import ModelOutput
from evalscope.constants import DumpMode
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
from evalscope.utils.logger import get_logger
from .state import TaskState

logger = get_logger()


class CacheManager:
    """
    Manage model results and review results for evaluation caching.

    This class handles the caching mechanism for evaluation results, allowing
    the system to resume evaluations from previously computed results and
    avoid redundant computations.
    """

    def __init__(self, outputs: OutputsStructure, model_name: str, benchmark_name: str):
        """
        Initialize the cache manager.

        Args:
            outputs: Output directory structure for storing cache files
            model_name: Name of the model being evaluated
            benchmark_name: Name of the benchmark being used
        """
        self.outputs = outputs
        self.model_name = model_name
        self.benchmark_name = benchmark_name

    def filter_prediction_cache(self, subset: str, dataset: Dataset) -> Tuple[List[TaskState], Dataset]:
        """
        Load cached prediction results and filter them from the dataset.

        This method checks for existing prediction cache files and loads any
        previously computed results. It then filters these samples from the
        input dataset to avoid recomputation.

        Args:
            subset: Name of the dataset subset
            dataset: The dataset to filter

        Returns:
            Tuple of (cached task states, filtered dataset with remaining samples)
        """
        cache_file = self.get_prediction_cache_path(subset)
        if not os.path.exists(cache_file):
            # No cache file exists, return empty cache and full dataset
            return [], dataset

        cached_task_states = []
        cached_sample_ids = set()
        cache_items = jsonl_to_list(cache_file)

        # Process each cached item
        for cache_item in cache_items:
            # Deserialize the cached model result
            cached_model_result = ModelResult.model_validate(cache_item)
            # Convert to task state for further processing
            cached_state = cached_model_result.to_task_state(dataset=dataset)

            cached_task_states.append(cached_state)
            cached_sample_ids.add(cached_state.sample_id)

        # Remove cached samples from the dataset to avoid reprocessing
        filtered_dataset = dataset.filter(lambda sample: sample.id not in cached_sample_ids)

        logger.info(
            f'Reusing predictions from {cache_file}, got {len(cached_task_states)} predictions, '
            f'remaining {len(filtered_dataset)} samples'
        )
        return cached_task_states, filtered_dataset

    def get_prediction_cache_path(self, subset: str) -> str:
        """
        Get the file path for prediction cache storage.

        Args:
            subset: Name of the dataset subset

        Returns:
            Path to the prediction cache file
        """
        file_path = os.path.join(self.outputs.predictions_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
        # Ensure the directory exists
        if self.outputs.is_make:
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
        return file_path

    def save_prediction_cache(self, subset: str, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
        """
        Save a prediction result to the cache.

        Args:
            subset: Name of the dataset subset
            task_state: The task state containing prediction results

        Returns:
            The saved model result object
        """
        cache_file = self.get_prediction_cache_path(subset)
        # Convert task state to serializable model result
        model_result = ModelResult.from_task_state(task_state, save_metadata)
        # Serialize to dictionary
        model_result_dict = model_result.model_dump()
        # Append to JSONL cache file
        dump_jsonl_data(data_list=model_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
        return model_result

    def filter_review_cache(self, subset: str,
                            task_states: List[TaskState]) -> Tuple[List[SampleScore], List[TaskState]]:
        """
        Load cached review results and filter corresponding task states.

        This method loads previously computed review scores and removes
        the corresponding task states from further review processing.

        Args:
            subset: Name of the dataset subset
            task_states: List of task states to potentially review

        Returns:
            Tuple of (cached sample scores, filtered task states for remaining reviews)
        """
        cache_file = self.get_review_cache_path(subset)
        if not os.path.exists(cache_file):
            # No review cache exists, return empty scores and all task states
            return [], task_states

        cached_sample_scores: List[SampleScore] = []
        cache_items = jsonl_to_list(cache_file)

        # Process each cached review result
        for cache_item in cache_items:
            # Deserialize the cached review result
            cached_review_result = ReviewResult.model_validate(cache_item)
            cached_sample_scores.append(cached_review_result.to_sample_score())

        # Filter out task states that already have review scores
        cached_sample_ids = {review.sample_id for review in cached_sample_scores}
        filtered_task_states = [state for state in task_states if state.sample_id not in cached_sample_ids]

        logger.info(f'Reusing reviews from {cache_file}, got {len(cached_sample_scores)} reviews')
        return cached_sample_scores, filtered_task_states

    def get_review_cache_path(self, subset: str) -> str:
        """
        Get the file path for review cache storage.

        Args:
            subset: Name of the dataset subset

        Returns:
            Path to the review cache file
        """
        file_path = os.path.join(self.outputs.reviews_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
        # Ensure the directory exists
        if self.outputs.is_make:
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
        return file_path

    def delete_review_cache(self, subset: str):
        """Delete the review cache for a specific subset. If the cache exists, it will be removed."""
        file_path = self.get_review_cache_path(subset)
        if os.path.exists(file_path):
            logger.info(f'Deleting review cache file: {file_path}')
            os.remove(file_path)

    def save_review_cache(
        self,
        subset: str,
        task_state: TaskState,
        sample_score: SampleScore,
        save_metadata: bool = True
    ) -> 'ReviewResult':
        """
        Save a review result to the cache.

        Args:
            subset: Name of the dataset subset
            task_state: The task state that was reviewed
            sample_score: The computed score for the sample

        Returns:
            The saved review result object
        """
        cache_file = self.get_review_cache_path(subset)
        # Convert score and state to serializable review result
        review_result = ReviewResult.from_score_state(sample_score, task_state, save_metadata)
        # Serialize to dictionary
        review_result_dict = review_result.model_dump()
        # Append to JSONL cache file
        dump_jsonl_data(data_list=review_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
        return review_result

    def get_report_path(self) -> str:
        """
        Get the directory path for report storage.

        Returns:
            Path to the reports directory for this model
        """
        report_path = os.path.join(self.outputs.reports_dir, self.model_name)
        # Ensure the directory exists
        if self.outputs.is_make:
            os.makedirs(report_path, exist_ok=True)
        return report_path

    def get_report_file(self) -> str:
        """
        Get the report file path for the benchmark.

        The report file is named as '{benchmark_name}.json' and contains
        the final evaluation results for the benchmark.

        Returns:
            Full path to the benchmark report file
        """
        return os.path.join(self.get_report_path(), f'{self.benchmark_name}.json')


class ModelResult(BaseModel):
    """
    Serializable container for model prediction results.

    This class represents a single model prediction that can be cached
    and restored later to avoid recomputation.
    """

    index: int
    """Index of the sample in the dataset that was processed."""

    model: str = ''
    """Name of the model that generated this prediction."""

    model_output: Optional[ModelOutput] = None
    """The actual prediction/output generated by the model."""

    messages: List[ChatMessage] = []
    """Chat messages exchanged during evaluation (for conversational models)."""

    metadata: Optional[Dict[str, Any]] = None
    """Additional metadata associated with the model result."""

    @classmethod
    def from_task_state(cls, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
        """
        Create a ModelResult from a TaskState for caching.

        Args:
            task_state: The completed task state to serialize

        Returns:
            ModelResult object ready for caching
        """
        return cls(
            model=task_state.model,
            index=task_state.sample_id,
            messages=task_state.messages,
            model_output=task_state.output,
            metadata=task_state.metadata if save_metadata else {},
        )

    def to_task_state(self, dataset: Dataset) -> TaskState:
        """
        Restore a TaskState from a cached ModelResult.

        Args:
            dataset: The dataset to retrieve the original sample from

        Returns:
            Reconstructed TaskState with cached results

        Raises:
            ValueError: If the sample index is not found in the dataset
        """
        sample = dataset[self.index]
        if not sample:
            raise ValueError(f'Sample with index {self.index} not found in dataset')

        # update metadata if exists
        if self.metadata:
            sample.metadata.update(self.metadata)

        return TaskState(
            model=self.model,
            sample=sample,
            messages=self.messages,
            output=ModelOutput.model_validate(self.model_output),
            completed=True,  # Mark as completed since it was cached
        )


class ReviewResult(BaseModel):
    """
    Serializable container for review/scoring results.

    This class represents the result of reviewing a model's prediction,
    including the computed score and relevant context.
    """

    index: int
    """Index of the sample that was reviewed."""

    input: str = ''
    """Original input from the sample (immutable reference)."""

    target: Optional[str] = None
    """Expected/target answer for the sample, if available."""

    sample_score: SampleScore
    """The computed evaluation score for this sample."""

    @classmethod
    def from_score_state(
        cls, sample_score: SampleScore, state: TaskState, save_metadata: bool = True
    ) -> 'ReviewResult':
        """
        Create a ReviewResult from a score and task state for caching.

        Args:
            sample_score: The computed score for the sample
            state: The task state containing sample information

        Returns:
            ReviewResult object ready for caching
        """
        if not save_metadata:
            sample_score = copy.deepcopy(sample_score)
            sample_score.sample_metadata = None

        return cls(
            index=state.sample_id,
            input=state.input_text,
            target=state.target,
            sample_score=sample_score,
        )

    def to_sample_score(self) -> SampleScore:
        """
        Extract the sample score from the cached review result.

        Returns:
            The sample score object
        """
        return self.sample_score
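
Usage note (hypothetical driver code, not part of the diff): CacheManager implies a resume flow of "reuse cached predictions, run the model only on the remaining samples, append each new result, then repeat the same pattern for reviews." The sketch below shows only that flow; predict_fn and review_fn are stand-in callables, the module paths follow the file layout in this diff, and iterating a Dataset directly is an assumption.

# Hypothetical resume loop around CacheManager; only the cache calls mirror
# the API added in this diff, everything else is a placeholder.
from typing import Callable, List

from evalscope.api.dataset import Dataset
from evalscope.api.evaluator.cache import CacheManager
from evalscope.api.evaluator.state import TaskState
from evalscope.api.metric import SampleScore
from evalscope.utils.io_utils import OutputsStructure


def resume_eval(outputs: OutputsStructure, dataset: Dataset, subset: str,
                predict_fn: Callable, review_fn: Callable,
                model_name: str = 'my-model', benchmark_name: str = 'my-benchmark'):
    cache = CacheManager(outputs=outputs, model_name=model_name, benchmark_name=benchmark_name)

    # 1) Reuse cached predictions; keep only samples that still need a prediction.
    cached_states, remaining = cache.filter_prediction_cache(subset, dataset)
    new_states: List[TaskState] = []
    for sample in remaining:                          # assumes Dataset is iterable
        state = predict_fn(sample)                    # placeholder predict step
        cache.save_prediction_cache(subset, state)    # appends to the JSONL cache
        new_states.append(state)
    all_states = cached_states + new_states

    # 2) Same pattern for reviews: reuse cached scores, score the rest, cache them.
    cached_scores, to_review = cache.filter_review_cache(subset, all_states)
    new_scores: List[SampleScore] = []
    for state in to_review:
        score = review_fn(state)                      # placeholder scoring step
        cache.save_review_cache(subset, state, score)
        new_scores.append(score)

    # 3) Reports land under '<reports_dir>/<model_name>/<benchmark_name>.json'.
    return all_states, cached_scores + new_scores, cache.get_report_file()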
evalscope/api/evaluator/evaluator.py (new file)
@@ -0,0 +1,56 @@
import abc
from typing import TYPE_CHECKING, List, Union

from evalscope.api.metric import SampleScore
from evalscope.report import Report
from .state import TaskState

if TYPE_CHECKING:
    from evalscope.api.benchmark import DataAdapter
    from evalscope.api.model import Model
    from evalscope.config import TaskConfig
    from evalscope.utils.io_utils import OutputsStructure


class Evaluator(abc.ABC):
    """
    Abstract base class for evaluators.

    Args:
        benchmark (DataAdapter): The data adapter for the benchmark.
        model (Model): The model to evaluate.
        outputs (OutputsStructure, optional): The output structure for results.
        task_config (TaskConfig, optional): The task configuration.
    """

    def __init__(
        self,
        benchmark: 'DataAdapter',
        model: 'Model',
        outputs: 'OutputsStructure' = None,
        task_config: 'TaskConfig' = None,
    ):
        self.benchmark = benchmark
        self.model = model
        self.outputs = outputs
        self.task_config = task_config

    @abc.abstractmethod
    def eval(self, *args, **kwargs) -> Report:
        """Run the evaluation process."""
        pass

    @abc.abstractmethod
    def get_answers(self, *args, **kwargs) -> List[TaskState]:
        """Get the evaluation answers."""
        pass

    @abc.abstractmethod
    def get_reviews(self, *args, **kwargs) -> List[SampleScore]:
        """Get the review results."""
        pass

    @abc.abstractmethod
    def get_report(self, *args, **kwargs) -> Report:
        """Get the evaluation report."""
        pass
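
Usage note (hypothetical subclass, not part of the diff): the four abstract methods suggest a predict → review → report pipeline. The sketch below only illustrates that contract with stub bodies; the package's real default evaluator presumably lives elsewhere in the 1.0.0 wheel, and the module paths are assumed from the file layout in this diff.

# Hypothetical minimal subclass; all method bodies are stubs that just show
# the call order implied by the abstract base class above.
from typing import List

from evalscope.api.evaluator.evaluator import Evaluator
from evalscope.api.evaluator.state import TaskState
from evalscope.api.metric import SampleScore
from evalscope.report import Report


class SketchEvaluator(Evaluator):
    """Illustrates how the abstract hooks fit together; does no real work."""

    def eval(self, *args, **kwargs) -> Report:
        # Typical flow: predict, review, then aggregate into a report.
        task_states = self.get_answers()
        sample_scores = self.get_reviews(task_states)
        return self.get_report(sample_scores)

    def get_answers(self, *args, **kwargs) -> List[TaskState]:
        # A real evaluator would run self.model over self.benchmark's dataset here.
        return []

    def get_reviews(self, task_states: List[TaskState] = (), *args, **kwargs) -> List[SampleScore]:
        # A real evaluator would score each TaskState against its target here.
        return []

    def get_report(self, sample_scores: List[SampleScore] = (), *args, **kwargs) -> Report:
        # A real evaluator would aggregate scores into a Report; stubbed out here.
        raise NotImplementedError('report aggregation is benchmark-specific')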