evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py

@@ -1,12 +1,15 @@
+import os
 from itertools import product
 from tqdm import tqdm
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union

-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger

 if TYPE_CHECKING:
     from evalscope.report import Report
@@ -26,55 +29,66 @@ PROMPT_TEMPLATE = """Please read the following text and answer the question belo
 Don't give information outside the document or repeat your findings."""


-@
-[28 deleted lines not shown in the source diff]
+@register_benchmark(
+    BenchmarkMeta(
+        name='needle_haystack',
+        pretty_name='Needle-in-a-Haystack',
+        tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+        description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+        'It requires the model to find specific information within a large corpus of text. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
+        dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+        metric_list=['acc'],
+        subset_list=['english', 'chinese'],
+        eval_split='test',
+        system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={
+            'retrieval_question':
+            'What is the best thing to do in San Francisco?',
+            'needles':
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+            'context_lengths_min':
+            1000,
+            'context_lengths_max':
+            32000,
+            'context_lengths_num_intervals':
+            10,
+            'document_depth_percent_min':
+            0,
+            'document_depth_percent_max':
+            100,
+            'document_depth_percent_intervals':
+            10,
+            'tokenizer_path':
+            'Qwen/Qwen3-0.6B',
+            'show_score':
+            False,
+        }
+    )
+)
+class NeedleHaystackAdapter(DefaultDataAdapter):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-        self.
+        self._use_llm_judge = True
         # set extra params
-[3 deleted lines not shown in the source diff]
-        self.needles = extra_params.get(
+        self.retrieval_question = self.extra_params.get(
+            'retrieval_question', 'What is the best thing to do in San Francisco?'
+        )
+        self.needles = self.extra_params.get(
             'needles',
-            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
-[1 deleted line not shown in the source diff]
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+        )
+        self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+        self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+        self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+        self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+        self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+        self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+        self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = self.extra_params.get('show_score', False)

         self._init_tokenizer()
         self._init_length()
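The hunk above shows the registration pattern introduced in 1.0.x: a benchmark is declared by passing a `BenchmarkMeta` to the `@register_benchmark` decorator on a `DefaultDataAdapter` subclass, and `extra_params` surfaces on the adapter instance as `self.extra_params`. A minimal sketch of that pattern, using only the names visible in this diff; the benchmark name, dataset id, and prompt template below are hypothetical placeholders:

# Minimal sketch based on the registration block above; placeholder values are hypothetical.
from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags


@register_benchmark(
    BenchmarkMeta(
        name='my_retrieval_bench',            # hypothetical benchmark name
        pretty_name='My-Retrieval-Bench',
        tags=[Tags.RETRIEVAL],
        dataset_id='my-org/my-corpus',        # hypothetical ModelScope dataset id
        metric_list=['acc'],
        subset_list=['default'],
        eval_split='test',
        prompt_template='{context}\n\nQuestion: {question}',
        extra_params={'tokenizer_path': 'Qwen/Qwen3-0.6B'},
    )
)
class MyRetrievalAdapter(DefaultDataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # extra_params declared above is exposed on the adapter instance, as in the hunk
        self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')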
@@ -88,65 +102,97 @@ class NeedleHaystackAdapter(DataAdapter):
                 self.context_lengths_min,
                 self.context_lengths_max,
                 num=self.context_lengths_num_intervals,
-                endpoint=True
+                endpoint=True
+            )
+        ).astype(int)

         self.document_depth_percents = np.round(
             np.linspace(
                 self.document_depth_percent_min,
                 self.document_depth_percent_max,
                 num=self.document_depth_percent_intervals,
-                endpoint=True
+                endpoint=True
+            )
+        ).astype(int)

     def _init_tokenizer(self):
         """ Initialize the tokenizer based on the provided tokenizer path."""
         from modelscope import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

-    def load(self
-[24 deleted lines not shown in the source diff]
-        # Generate
-[1 deleted line not shown in the source diff]
+    def load(self):
+        """Load dataset from local disk or remote."""
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+            )
+
+        # Load datasets for both subsets
+        datasets = {}
+        file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+        for subset_name, files in file_structure.items():
+            if subset_name not in self.subset_list:
+                continue
+            file_path = os.path.join(dataset_path, files[0])
+            if os.path.exists(file_path):
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+
+                # Generate samples for all combinations of context length and depth
+                records = []
+                tokens_context = self._get_context_tokens(text)
                 for context_length, depth_percent in tqdm(
-[3 deleted lines not shown in the source diff]
+                    product(self.context_lengths, self.document_depth_percents),
+                    desc=f'Generating {subset_name} samples'
+                ):
                     context = self._insert_needles(tokens_context, depth_percent, context_length)
-[2 deleted lines not shown in the source diff]
+                    record = {
+                        'text': text,
                         'context_length': int(context_length),
                         'depth_percent': int(depth_percent),
                         'question': self.retrieval_question,
                         'answer': '\n'.join(self.needles),
                         'context': context,
                     }
-[5 deleted lines not shown in the source diff]
+                    records.append(record)
+
+                dataset = DictDataLoader(
+                    dict_list=records,
+                    limit=self.limit,
+                    repeats=self.repeats,
+                    sample_fields=self.record_to_sample,
+                    shuffle=self.shuffle,
+                ).load()
+
+                datasets[subset_name] = dataset
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=record['question'],
+            target=record['answer'],
+            metadata={
+                'context': record['context'],
+                'context_length': record['context_length'],
+                'depth_percent': record['depth_percent'],
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with context and question."""
+        context = sample.metadata['context']
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)

     def _get_context_tokens(self, input_context: str) -> list:
         """
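For orientation, the `_init_length` code at the top of this hunk expands the `context_lengths_*` and `document_depth_percent_*` extra params into integer grids with `np.linspace(...).astype(int)`, and `load` then builds one sample per (context_length, depth_percent) pair. A standalone illustration (not adapter code) of the grid the default settings produce:

# Standalone illustration of the default sample grid; mirrors the np.linspace calls above.
import numpy as np

context_lengths = np.round(np.linspace(1000, 32000, num=10, endpoint=True)).astype(int)
depth_percents = np.round(np.linspace(0, 100, num=10, endpoint=True)).astype(int)

print(context_lengths)  # [ 1000  4444  7889 ... 28556 32000]
print(depth_percents)   # [  0  11  22 ... 89 100]
print(len(context_lengths) * len(depth_percents))  # 100 (context_length, depth_percent) pairs per subset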
@@ -227,7 +273,8 @@ class NeedleHaystackAdapter(DataAdapter):
         # We want to make sure that we place our needle at a sentence break
         # so we first see what token a '.' is
         period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
-            '。'
+            '。'
+        )  # Handle both English and Chinese periods

         # Then we iteration backwards until we find the first period
         while tokens_new_context and tokens_new_context[-1] not in period_tokens:
@@ -240,8 +287,10 @@ class NeedleHaystackAdapter(DataAdapter):
             # Log
             insertion_percentage = (insertion_point / len(tokens_context)) * 100
             self.insertion_percentages.append(insertion_percentage)
-            logger.debug(
-[1 deleted line not shown in the source diff]
+            logger.debug(
+                f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                f'total length now: {len(tokens_context)} tokens'
+            )

             # Adjust depth for next needle
             depth_percent += depth_percent_interval
@@ -249,84 +298,78 @@ class NeedleHaystackAdapter(DataAdapter):
         new_context = self.tokenizer.decode(tokens_context)
         return new_context

-    def
-[5 deleted lines not shown in the source diff]
-        Returns:
-            A dictionary containing the prompt data
-        """
-        context = input_d.get('context')
-        question = input_d.get('question')
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        from evalscope.metrics import exact_match
+        from .utils import normalize_answer

-[1 deleted line not shown in the source diff]
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )

-[1 deleted line not shown in the source diff]
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)

-[3 deleted lines not shown in the source diff]
-        """
-        return input_d.get('answer', '').strip()
+        norm_gold = normalize_answer(reference)
+        norm_pred = normalize_answer(filtered_prediction)
+        accuracy = exact_match(gold=norm_gold, pred=norm_pred)

-[3 deleted lines not shown in the source diff]
-        """
-        return result
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name

-[1 deleted line not shown in the source diff]
-        """
-        Match the gold answer and the predicted answer.
-        """
-        from .utils import normalize_answer
-        norm_gold = normalize_answer(gold)
-        norm_pred = normalize_answer(pred)
-        # Use exact match for Needle in a Haystack
-        return exact_match(gold=norm_gold, pred=norm_pred)
+        return score

-    def
-[2 deleted lines not shown in the source diff]
-        """
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score

-[4 deleted lines not shown in the source diff]
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )

-        #
-[2 deleted lines not shown in the source diff]
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
+        question = task_state.input_text

-        #
-[2 deleted lines not shown in the source diff]
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)

-[2 deleted lines not shown in the source diff]
-        compute weighted mean of the bleu score of all samples
+        # Parse grading score with regex, [[score]]
+        accuracy = parse_score(orm_response) if orm_response else 0.0

-[2 deleted lines not shown in the source diff]
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+            'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+        }
+        score.main_score_name = metric_name

-[1 deleted line not shown in the source diff]
-            avg_res: List[dict]
+        return score

-[2 deleted lines not shown in the source diff]
-        return
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add aggregation name for needle haystack adapter
+        return super()._on_generate_report(scores, model_name, False)

-    def
+    def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         try:
             import os

             from .utils import draw_score_chat

-            report_path =
+            report_path = output_dir
             data_frame = report.to_dataframe()
             # split `Metric` to `Context` and `Depth`
             data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
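The `llm_match_score` path above delegates score extraction to `parse_score` from `needle_haystack/utils.py`, whose implementation is not part of this diff; going by the inline comment, it pulls a `[[score]]` marker out of the judge response. A hypothetical sketch of such a parser, for illustration only:

# Hypothetical sketch only; the real parse_score lives in needle_haystack/utils.py and is not shown here.
import re


def parse_score(judge_response: str) -> float:
    """Extract a numeric score wrapped as [[score]] from an LLM-judge response (illustrative)."""
    match = re.search(r'\[\[(\d+(?:\.\d+)?)\]\]', judge_response)
    return float(match.group(1)) if match else 0.0


print(parse_score('The prediction matches the gold answer. Rating: [[1]]'))  # 1.0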
@@ -336,13 +379,14 @@ class NeedleHaystackAdapter(DataAdapter):
            for subset in data_frame['Subset'].unique():
                sub_df = data_frame[data_frame['Subset'] == subset]
                # draw charts for each subset
-                pivot_table = sub_df.pivot_table(
-[1 deleted line not shown in the source diff]
+                pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
+                                                 aggfunc='mean').reset_index()
                pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
                draw_score_chat(
                    pivot_table,
                    outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
-                    show_score=self.show_score
+                    show_score=self.show_score
+                )

        except Exception as e:
            logger.error(f'Error generating charts: {e}')
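The report hook above names each cell's metric `Context#<length> Depth#<percent>`, then pivots the per-sample scores into a Depth x Context matrix before drawing the heatmap. A standalone pandas illustration of that reshaping, using made-up scores:

# Standalone illustration of the pivot logic in _on_generate_report_end; scores are made up.
import pandas as pd

data_frame = pd.DataFrame({
    'Subset': ['english'] * 4,
    'Metric': ['Context#1000 Depth#0', 'Context#1000 Depth#100',
               'Context#32000 Depth#0', 'Context#32000 Depth#100'],
    'Score': [1.0, 1.0, 0.0, 1.0],
})

# split `Metric` into `Context` and `Depth`, as the adapter does
data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)

pivot_table = data_frame.pivot_table(values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
print(pivot_table)
# Columns: Context#1000, Context#32000; rows: Depth#0, Depth#100; values are the mean scores,
# which draw_score_chat renders as the needle-in-a-haystack heatmap.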