evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic by the registry diff tool.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/evaluator/cache.py (new file)
@@ -0,0 +1,378 @@
```python
import copy
import os
from pydantic import BaseModel
from typing import Any, Dict, List, Optional, Tuple, Union

from evalscope.api.dataset import Dataset
from evalscope.api.messages import ChatMessage
from evalscope.api.metric import SampleScore
from evalscope.api.model import ModelOutput
from evalscope.constants import DumpMode
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
from evalscope.utils.logger import get_logger
from .state import TaskState

logger = get_logger()


class CacheManager:
    """
    Manage model results and review results for evaluation caching.

    This class handles the caching mechanism for evaluation results, allowing
    the system to resume evaluations from previously computed results and
    avoid redundant computations.
    """

    def __init__(self, outputs: OutputsStructure, model_name: str, benchmark_name: str):
        """
        Initialize the cache manager.

        Args:
            outputs: Output directory structure for storing cache files
            model_name: Name of the model being evaluated
            benchmark_name: Name of the benchmark being used
        """
        self.outputs = outputs
        self.model_name = model_name
        self.benchmark_name = benchmark_name

    def filter_prediction_cache(self, subset: str, dataset: Dataset) -> Tuple[List[TaskState], Dataset]:
        """
        Load cached prediction results and filter them from the dataset.

        This method checks for existing prediction cache files and loads any
        previously computed results. It then filters these samples from the
        input dataset to avoid recomputation.

        Args:
            subset: Name of the dataset subset
            dataset: The dataset to filter

        Returns:
            Tuple of (cached task states, filtered dataset with remaining samples)
        """
        cache_file = self.get_prediction_cache_path(subset)
        if not os.path.exists(cache_file):
            # No cache file exists, return empty cache and full dataset
            return [], dataset

        cached_task_states = []
        cached_sample_ids = set()
        cache_items = jsonl_to_list(cache_file)

        # Process each cached item
        for cache_item in cache_items:
            # Deserialize the cached model result
            cached_model_result = ModelResult.model_validate(cache_item)
            # Convert to task state for further processing
            cached_state = cached_model_result.to_task_state(dataset=dataset)

            cached_task_states.append(cached_state)
            cached_sample_ids.add(cached_state.sample_id)

        # Remove cached samples from the dataset to avoid reprocessing
        filtered_dataset = dataset.filter(lambda sample: sample.id not in cached_sample_ids)

        logger.info(
            f'Reusing predictions from {cache_file}, got {len(cached_task_states)} predictions, '
            f'remaining {len(filtered_dataset)} samples'
        )
        return cached_task_states, filtered_dataset

    def get_prediction_cache_path(self, subset: str) -> str:
        """
        Get the file path for prediction cache storage.

        Args:
            subset: Name of the dataset subset

        Returns:
            Path to the prediction cache file
        """
        file_path = os.path.join(self.outputs.predictions_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
        # Ensure the directory exists
        if self.outputs.is_make:
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
        return file_path

    def save_prediction_cache(self, subset: str, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
        """
        Save a prediction result to the cache.

        Args:
            subset: Name of the dataset subset
            task_state: The task state containing prediction results

        Returns:
            The saved model result object
        """
        cache_file = self.get_prediction_cache_path(subset)
        # Convert task state to serializable model result
        model_result = ModelResult.from_task_state(task_state, save_metadata)
        # Serialize to dictionary
        model_result_dict = model_result.model_dump()
        # Append to JSONL cache file
        dump_jsonl_data(data_list=model_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
        return model_result

    def filter_review_cache(self, subset: str,
                            task_states: List[TaskState]) -> Tuple[List[SampleScore], List[TaskState]]:
        """
        Load cached review results and filter corresponding task states.

        This method loads previously computed review scores and removes
        the corresponding task states from further review processing.

        Args:
            subset: Name of the dataset subset
            task_states: List of task states to potentially review

        Returns:
            Tuple of (cached sample scores, filtered task states for remaining reviews)
        """
        cache_file = self.get_review_cache_path(subset)
        if not os.path.exists(cache_file):
            # No review cache exists, return empty scores and all task states
            return [], task_states

        cached_sample_scores: List[SampleScore] = []
        cache_items = jsonl_to_list(cache_file)

        # Process each cached review result
        for cache_item in cache_items:
            # Deserialize the cached review result
            cached_review_result = ReviewResult.model_validate(cache_item)
            cached_sample_scores.append(cached_review_result.to_sample_score())

        # Filter out task states that already have review scores
        cached_sample_ids = {review.sample_id for review in cached_sample_scores}
        filtered_task_states = [state for state in task_states if state.sample_id not in cached_sample_ids]

        logger.info(f'Reusing reviews from {cache_file}, got {len(cached_sample_scores)} reviews')
        return cached_sample_scores, filtered_task_states

    def get_review_cache_path(self, subset: str) -> str:
        """
        Get the file path for review cache storage.

        Args:
            subset: Name of the dataset subset

        Returns:
            Path to the review cache file
        """
        file_path = os.path.join(self.outputs.reviews_dir, self.model_name, f'{self.benchmark_name}_{subset}.jsonl')
        # Ensure the directory exists
        if self.outputs.is_make:
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
        return file_path

    def delete_review_cache(self, subset: str):
        """Delete the review cache for a specific subset. If the cache exists, it will be removed."""
        file_path = self.get_review_cache_path(subset)
        if os.path.exists(file_path):
            logger.info(f'Deleting review cache file: {file_path}')
            os.remove(file_path)

    def save_review_cache(
        self,
        subset: str,
        task_state: TaskState,
        sample_score: SampleScore,
        save_metadata: bool = True
    ) -> 'ReviewResult':
        """
        Save a review result to the cache.

        Args:
            subset: Name of the dataset subset
            task_state: The task state that was reviewed
            sample_score: The computed score for the sample

        Returns:
            The saved review result object
        """
        cache_file = self.get_review_cache_path(subset)
        # Convert score and state to serializable review result
        review_result = ReviewResult.from_score_state(sample_score, task_state, save_metadata)
        # Serialize to dictionary
        review_result_dict = review_result.model_dump()
        # Append to JSONL cache file
        dump_jsonl_data(data_list=review_result_dict, jsonl_file=cache_file, dump_mode=DumpMode.APPEND)
        return review_result

    def get_report_path(self) -> str:
        """
        Get the directory path for report storage.

        Returns:
            Path to the reports directory for this model
        """
        report_path = os.path.join(self.outputs.reports_dir, self.model_name)
        # Ensure the directory exists
        if self.outputs.is_make:
            os.makedirs(report_path, exist_ok=True)
        return report_path

    def get_report_file(self) -> str:
        """
        Get the report file path for the benchmark.

        The report file is named as '{benchmark_name}.json' and contains
        the final evaluation results for the benchmark.

        Returns:
            Full path to the benchmark report file
        """
        return os.path.join(self.get_report_path(), f'{self.benchmark_name}.json')


class ModelResult(BaseModel):
    """
    Serializable container for model prediction results.

    This class represents a single model prediction that can be cached
    and restored later to avoid recomputation.
    """

    index: int
    """Index of the sample in the dataset that was processed."""

    model: str = ''
    """Name of the model that generated this prediction."""

    model_output: Optional[ModelOutput] = None
    """The actual prediction/output generated by the model."""

    messages: List[ChatMessage] = []
    """Chat messages exchanged during evaluation (for conversational models)."""

    metadata: Optional[Dict[str, Any]] = None
    """Additional metadata associated with the model result."""

    @classmethod
    def from_task_state(cls, task_state: TaskState, save_metadata: bool = True) -> 'ModelResult':
        """
        Create a ModelResult from a TaskState for caching.

        Args:
            task_state: The completed task state to serialize

        Returns:
            ModelResult object ready for caching
        """
        return cls(
            model=task_state.model,
            index=task_state.sample_id,
            messages=task_state.messages,
            model_output=task_state.output,
            metadata=task_state.metadata if save_metadata else {},
        )

    def to_task_state(self, dataset: Dataset) -> TaskState:
        """
        Restore a TaskState from cached ModelResult.

        Args:
            dataset: The dataset to retrieve the original sample from

        Returns:
            Reconstructed TaskState with cached results

        Raises:
            ValueError: If the sample index is not found in the dataset
        """
        sample = dataset[self.index]
        if not sample:
            raise ValueError(f'Sample with index {self.index} not found in dataset')

        # update metadata if exists
        if self.metadata:
            sample.metadata.update(self.metadata)

        return TaskState(
            model=self.model,
            sample=sample,
            messages=self.messages,
            output=ModelOutput.model_validate(self.model_output),
            completed=True,  # Mark as completed since it was cached
        )

    def pretty_print(self) -> str:
        """
        Generate a pretty-printed string representation of the model result.

        Returns:
            A string representation of the model result
        """
        return self.model_dump_json(indent=2)


class ReviewResult(BaseModel):
    """
    Serializable container for review/scoring results.

    This class represents the result of reviewing a model's prediction,
    including the computed score and relevant context.
    """

    index: int
    """Index of the sample that was reviewed."""

    input: str = ''
    """Original input from the sample (immutable reference)."""

    target: Optional[str] = None
    """Expected/target answer for the sample, if available."""

    sample_score: SampleScore
    """The computed evaluation score for this sample."""

    @classmethod
    def from_score_state(
        cls, sample_score: SampleScore, state: TaskState, save_metadata: bool = True
    ) -> 'ReviewResult':
        """
        Create a ReviewResult from a score and task state for caching.

        Args:
            sample_score: The computed score for the sample
            state: The task state containing sample information

        Returns:
            ReviewResult object ready for caching
        """
        if not save_metadata:
            sample_score = copy.deepcopy(sample_score)
            sample_score.sample_metadata = None

        return cls(
            index=state.sample_id,
            input=state.input_markdown,
            target=state.target,
            sample_score=sample_score,
        )

    def to_sample_score(self) -> SampleScore:
        """
        Extract the sample score from the cached review result.

        Returns:
            The sample score object
        """
        return self.sample_score

    def pretty_print(self) -> str:
        """
        Generate a pretty-printed string representation of the review result.

        Returns:
            A string representation of the review result
        """
        output = [
            f'Review Result for Sample {self.index}:',
            f'Target: {self.target}',
            f'Score: {self.sample_score.model_dump_json(indent=2)}',
        ]
        return '\n'.join(output)
```
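The two cache layers above are meant to be composed: predictions are filtered and re-saved first, reviews second, so an interrupted run can resume without recomputation. Below is a minimal sketch of that resume loop, assuming the caller already has a constructed `CacheManager`, an iterable `Dataset`, and `predict`/`score` callables; the callables and the loop itself are illustrative and not part of this release, only the `CacheManager` methods come from the hunk above.

```python
from typing import Callable, List

from evalscope.api.dataset import Dataset
from evalscope.api.evaluator.cache import CacheManager
from evalscope.api.evaluator.state import TaskState
from evalscope.api.metric import SampleScore


def resume_subset(
    cache: CacheManager,
    subset: str,
    dataset: Dataset,
    predict: Callable[..., TaskState],          # illustrative: runs the model on one sample
    score: Callable[[TaskState], SampleScore],  # illustrative: reviews one task state
) -> List[SampleScore]:
    """Sketch of a cache-aware evaluation loop for a single subset."""
    # 1. Reuse predictions already on disk; only the remaining samples are re-run.
    states, remaining = cache.filter_prediction_cache(subset, dataset)
    for sample in remaining:                        # assumes Dataset is iterable
        state = predict(sample)
        cache.save_prediction_cache(subset, state)  # appended to the JSONL cache
        states.append(state)

    # 2. Reuse reviews already on disk; only unreviewed task states are scored.
    scores, pending = cache.filter_review_cache(subset, states)
    for state in pending:
        sample_score = score(state)
        cache.save_review_cache(subset, state, sample_score)
        scores.append(sample_score)

    return scores
```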
evalscope/api/evaluator/evaluator.py (new file)
@@ -0,0 +1,56 @@
```python
import abc
from typing import TYPE_CHECKING, List, Union

from evalscope.api.metric import SampleScore
from evalscope.report import Report
from .state import TaskState

if TYPE_CHECKING:
    from evalscope.api.benchmark import DataAdapter
    from evalscope.api.model import Model
    from evalscope.config import TaskConfig
    from evalscope.utils.io_utils import OutputsStructure


class Evaluator(abc.ABC):
    """
    Abstract base class for evaluators.

    Args:
        benchmark (DataAdapter): The data adapter for the benchmark.
        model (Model): The model to evaluate.
        outputs (OutputsStructure, optional): The output structure for results.
        task_config (TaskConfig, optional): The task configuration.
    """

    def __init__(
        self,
        benchmark: 'DataAdapter',
        model: 'Model',
        outputs: 'OutputsStructure' = None,
        task_config: 'TaskConfig' = None,
    ):
        self.benchmark = benchmark
        self.model = model
        self.outputs = outputs
        self.task_config = task_config

    @abc.abstractmethod
    def eval(self, *args, **kwargs) -> Report:
        """Run the evaluation process."""
        pass

    @abc.abstractmethod
    def get_answers(self, *args, **kwargs) -> List[TaskState]:
        """Get the evaluation answers."""
        pass

    @abc.abstractmethod
    def get_reviews(self, *args, **kwargs) -> List[SampleScore]:
        """Get the review results."""
        pass

    @abc.abstractmethod
    def get_report(self, *args, **kwargs) -> Report:
        """Get the evaluation report."""
        pass
```
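A concrete evaluator is expected to fill in these four hooks. A minimal, hypothetical subclass skeleton is sketched below; the way `eval` chains the other hooks is an assumption for illustration, only the abstract interface itself comes from the hunk above.

```python
from typing import List

from evalscope.api.evaluator.evaluator import Evaluator  # module path from the file listing above
from evalscope.api.evaluator.state import TaskState
from evalscope.api.metric import SampleScore
from evalscope.report import Report


class MyEvaluator(Evaluator):
    """Hypothetical subclass wiring the four abstract hooks together."""

    def eval(self, *args, **kwargs) -> Report:
        # One plausible flow: predict, then review, then aggregate.
        states = self.get_answers()
        scores = self.get_reviews(states)
        return self.get_report(scores)

    def get_answers(self, *args, **kwargs) -> List[TaskState]:
        # Run self.model over the samples provided by self.benchmark (omitted here).
        raise NotImplementedError

    def get_reviews(self, states: List[TaskState]) -> List[SampleScore]:
        # Score each TaskState against its target (omitted here).
        raise NotImplementedError

    def get_report(self, scores: List[SampleScore]) -> Report:
        # Aggregate sample scores into a Report (omitted here).
        raise NotImplementedError
```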
evalscope/api/evaluator/state.py (new file)
@@ -0,0 +1,275 @@
```python
from dataclasses import dataclass
from random import Random
from typing import Any, Dict, List, Optional, Sequence, Union, overload

from evalscope.api.dataset import Sample
from evalscope.api.messages import ChatMessage, ChatMessageUser, messages_pretty_str, messages_to_markdown
from evalscope.api.model import ModelOutput


class Target(Sequence[str]):
    """Target for scoring against the current TaskState.

    Target is a sequence of one or more strings. Use the
    `text` property to access the value as a single string.
    """

    def __init__(self, target: Union[str, List[str]]) -> None:
        self.target = target if isinstance(target, list) else [target]

    @overload
    def __getitem__(self, index: int) -> str:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[str]:
        ...

    def __getitem__(self, index: Union[int, slice]) -> Union[str, Sequence[str]]:
        return self.target[index]

    def __len__(self) -> int:
        return len(self.target)

    @property
    def text(self) -> str:
        return ''.join(self.target)


@dataclass
class Choice:
    """
    A `Choice` represents a single choice in a multiple choice question.

    It is only relevant for the `multiple_choice` solver and corresponding
    `choice` scorer.
    """

    value: str
    """The original value of the choice from the `Sample`."""

    correct: Optional[bool]
    """Did the model think this choice satisfies the question? `None`
    indicates this has not been set yet"""

    original_position: int
    """Choices may be re-ordered during processing, this represents the
    original position in the sample's list of choices"""


class Choices(Sequence[Choice]):
    """
    Wrapper class for a list of `Choice` objects.

    Primarily simply to abstract away implementations of choice-specific
    functionality from the already-big `TaskState` class.
    """

    def __init__(self, choices: Union[List[str], List[Choice]]) -> None:
        """
        Setter for choices, intended to only be used with the `multiple_choice` scorer.

        Choices come from a list of choices for the sample, specifically used by
        the `multiple_choice` scorer.

        For example, if the sample was a multiple choice question like "What is
        the capital of France? A) Paris B) London C) Berlin", we would store the
        possible answers here.
        """
        self._choices: List[Choice] = []

        for i, choice in enumerate(choices):
            if isinstance(choice, str):
                self._choices.append(Choice(value=choice, correct=None, original_position=i))
            elif isinstance(choice, Choice):
                self._choices.append(choice)

    @overload
    def __getitem__(self, index: int) -> Choice:
        ...

    @overload
    def __getitem__(self, index: slice) -> Sequence[Choice]:
        ...

    def __getitem__(self, index: Union[int, slice]) -> Union[Choice, Sequence[Choice]]:
        return self._choices[index]

    def __len__(self) -> int:
        return len(self._choices)

    def mark_choice(self, index: int, correct: bool) -> None:
        """Set the value of a specific choice"""
        self._choices[index].correct = correct

    def shuffle(self, rand: Random = Random()) -> None:
        """
        Shuffle the choice order, setting the `original_position` so they can be mapped back to their original order.

        Some evals will shuffle the choices from the original sample to try to
        avoid the model answering correctly due to fine-tuning (or similar) on
        specific datasets.
        """
        shuffled_positions = list(range(len(self._choices)))
        rand.shuffle(shuffled_positions)

        shuffled_choices = [Choice('notachoice', None, -1)] * len(self._choices)

        for i, shuffled_position in enumerate(shuffled_positions):
            shuffled_choices[i] = self._choices[shuffled_position]
            shuffled_choices[i].original_position = shuffled_position

        self._choices = shuffled_choices


class TaskState:
    """
    The `TaskState` represents the internal state of the `Task` being run for a single `Sample`.

    The `TaskState` is passed to and returned from each solver during a sample's
    evaluation. It allows us to maintain the manipulated message history, the tools
    available to the model, the final output of the model, and whether the task
    is completed or has hit a limit.
    """

    def __init__(
        self,
        model: str,
        sample: Sample,
        messages: List[ChatMessage] = [],
        output: Optional[ModelOutput] = None,
        completed: bool = False,
    ) -> None:
        self._model = model
        self._sample = sample
        self._sample_id = sample.id
        self._group_id = sample.group_id
        self._input = sample.input
        self._target = Target(sample.target)
        self._metadata = sample.metadata
        self._messages: List[ChatMessage] = messages
        self._output = output if output else ModelOutput(model=str(model))
        self._completed = completed
        if sample.choices:
            self._choices = Choices(sample.choices)
        else:
            self._choices = Choices([])

    @property
    def model(self) -> str:
        """Name of model being evaluated."""
        return self._model

    @property
    def sample_id(self) -> int:
        """Unique id for sample."""
        return self._sample_id

    @property
    def group_id(self) -> int:
        """Group id for sample."""
        return self._group_id

    @property
    def input(self) -> Union[str, List[ChatMessage]]:
        """Input from the `Sample`, should be considered immutable."""
        return self._input

    @property
    def input_text(self) -> str:
        """
        Convenience function for accessing the initial input from the `Sample` as a string.

        If the `input` is a `List[ChatMessage]`, this will return the text from
        the last chat message
        """
        if isinstance(self._input, str):
            return self._input
        else:
            return messages_pretty_str(self._input)

    @property
    def input_markdown(self) -> str:
        """Get the input text as markdown.

        For multi-modal content, images will be represented in markdown format.
        """
        if isinstance(self._input, str):
            return self._input
        else:
            return messages_to_markdown(self._input)

    @property
    def choices(self) -> Choices:
        """Choices for the sample, if applicable."""
        return self._choices

    @property
    def user_prompt(self) -> ChatMessageUser:
        """User prompt for this state.

        Tasks are very general and can have may types of inputs.
        However, in many cases solvers assume they can interact with
        the state as a "chat" in a predictable fashion (e.g. prompt
        engineering solvers). This property enables easy read and
        write access to the user chat prompt. Raises an
        exception if there is no user prompt
        """
        prompt = next((m for m in reversed(self.messages) if m.role == 'user'), None)
        if prompt:
            return prompt
        else:
            raise ValueError('user_prompt requested from TaskState but none available')

    @property
    def metadata(self) -> Dict[str, Any]:
        """Metadata from the `Sample` for this `TaskState`"""
        return self._metadata

    @metadata.setter
    def metadata(self, metadata: Dict[str, Any]) -> None:
        self._metadata = metadata

    @property
    def messages(self) -> List[ChatMessage]:
        """
        Chat conversation history for sample.

        This will generally get appended to every time a `generate` call is made
        to the model. Useful for both debug and for solvers/scorers to assess
        model performance or choose the next step.
        """
        return self._messages

    @messages.setter
    def messages(self, messages: List[ChatMessage]) -> None:
        self._messages = messages

    @property
    def output(self) -> ModelOutput:
        """
        The 'final' model output once we've completed all solving.

        For simple evals this may just be the last `message` from the
        conversation history, but more complex solvers may set this directly.
        """
        return self._output

    @output.setter
    def output(self, output: ModelOutput) -> None:
        self._output = output

    @property
    def completed(self) -> bool:
        """Is the task completed."""
        return self._completed

    @completed.setter
    def completed(self, completed: bool) -> None:
        """Set the completed status."""
        self._completed = completed

    @property
    def target(self) -> str:
        """The scoring target for this `Sample`."""
        return self._target.text
```
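`Target` and `Choices` are small enough to exercise standalone. The sketch below assumes the module path `evalscope.api.evaluator.state` (matching the file listing above); the values are made up for illustration.

```python
from random import Random

from evalscope.api.evaluator.state import Choices, Target

# Choices accepts plain strings; each one remembers its original position,
# even after the order is shuffled for the eval.
choices = Choices(['Paris', 'London', 'Berlin'])
choices.shuffle(Random(42))             # seeded Random so the example is reproducible
choices.mark_choice(0, correct=True)    # record a judgement on the first displayed choice

for choice in choices:
    print(choice.value, choice.correct, choice.original_position)

# Target normalises a single string or a list of strings; `.text` joins them.
target = Target(['Par', 'is'])
print(target.text)   # -> 'Paris'
print(len(target))   # -> 2
```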