evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their public registries; it is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py  (+192 -152)
(Removed 0.17.1 lines whose content was cut off in the source view are shown truncated and marked with "…"; consecutive removed lines that were lost entirely are collapsed and annotated with their original line range.)

@@ -1,12 +1,15 @@
+import os
 from itertools import product
 from tqdm import tqdm
-from typing import TYPE_CHECKING, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union
 
-from evalscope.…
-from evalscope.…
-from evalscope.…
-from evalscope.…
-from evalscope.…
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
 
 if TYPE_CHECKING:
     from evalscope.report import Report

@@ -26,55 +29,66 @@ PROMPT_TEMPLATE = """Please read the following text and answer the question below
 Don't give information outside the document or repeat your findings."""
 
 
-@…  (old lines 29-57, the previous registration block, truncated in the source view)
+@register_benchmark(
+    BenchmarkMeta(
+        name='needle_haystack',
+        pretty_name='Needle-in-a-Haystack',
+        tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+        description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+        'It requires the model to find specific information within a large corpus of text. '
+        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
+        dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+        metric_list=['acc'],
+        subset_list=['english', 'chinese'],
+        eval_split='test',
+        system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={
+            'retrieval_question':
+            'What is the best thing to do in San Francisco?',
+            'needles':
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+            'context_lengths_min':
+            1000,
+            'context_lengths_max':
+            32000,
+            'context_lengths_num_intervals':
+            10,
+            'document_depth_percent_min':
+            0,
+            'document_depth_percent_max':
+            100,
+            'document_depth_percent_intervals':
+            10,
+            'tokenizer_path':
+            'Qwen/Qwen3-0.6B',
+            'show_score':
+            False,
+        }
+    )
+)
+class NeedleHaystackAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.…
+        self._use_llm_judge = True
         # set extra params
-        …  (old lines 64-66 truncated in the source view)
-        self.needles = extra_params.get(
+        self.retrieval_question = self.extra_params.get(
+            'retrieval_question', 'What is the best thing to do in San Francisco?'
+        )
+        self.needles = self.extra_params.get(
             'needles',
-            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
-        …  (old lines 70-77 truncated in the source view)
+            ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+        )
+        self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+        self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+        self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+        self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+        self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+        self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+        self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = self.extra_params.get('show_score', False)
 
         self._init_tokenizer()
         self._init_length()

@@ -88,65 +102,93 @@ class NeedleHaystackAdapter(DataAdapter):
                 self.context_lengths_min,
                 self.context_lengths_max,
                 num=self.context_lengths_num_intervals,
-                endpoint=True…
+                endpoint=True
+            )
+        ).astype(int)
 
         self.document_depth_percents = np.round(
             np.linspace(
                 self.document_depth_percent_min,
                 self.document_depth_percent_max,
                 num=self.document_depth_percent_intervals,
-                endpoint=True…
+                endpoint=True
+            )
+        ).astype(int)
 
     def _init_tokenizer(self):
         """ Initialize the tokenizer based on the provided tokenizer path."""
         from modelscope import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
 
-    def load(self…  (old lines 105-131, the previous load/prompt generation logic, truncated in the source view)
+    def load(self):
+        """Load dataset from local disk or remote."""
+        dataset_name_or_path = self.dataset_id
+        if os.path.exists(dataset_name_or_path):
+            logger.info(f'Loading dataset from {dataset_name_or_path}')
+            dataset_path = dataset_name_or_path
+        else:
+            from modelscope import dataset_snapshot_download
+            logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+            )
+
+        # Load datasets for both subsets
+        datasets = {}
+        file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+        for subset_name, files in file_structure.items():
+            if subset_name not in self.subset_list:
+                continue
+            file_path = os.path.join(dataset_path, files[0])
+            if os.path.exists(file_path):
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+
+                # Generate samples for all combinations of context length and depth
+                records = []
+                tokens_context = self._get_context_tokens(text)
                 for context_length, depth_percent in tqdm(
-                    …  (old lines 133-135 truncated in the source view)
+                    product(self.context_lengths, self.document_depth_percents),
+                    desc=f'Generating {subset_name} samples'
+                ):
                     context = self._insert_needles(tokens_context, depth_percent, context_length)
-                    …  (old lines 137-138 truncated in the source view)
+                    record = {
+                        'text': text,
                         'context_length': int(context_length),
                         'depth_percent': int(depth_percent),
                         'question': self.retrieval_question,
                         'answer': '\n'.join(self.needles),
                         'context': context,
                     }
-                    …  (old lines 145-149 truncated in the source view)
+                    records.append(record)
+
+                dataset = DictDataLoader(
+                    dict_list=records, limit=self.limit, repeats=self.repeats, sample_fields=self.record_to_sample
+                ).load()
+
+                datasets[subset_name] = dataset
+
+        test_dataset = DatasetDict(datasets)
+        return test_dataset, None
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a data record to a Sample object."""
+        return Sample(
+            input=record['question'],
+            target=record['answer'],
+            metadata={
+                'context': record['context'],
+                'context_length': record['context_length'],
+                'depth_percent': record['depth_percent'],
+            }
+        )
+
+    def format_prompt_template(self, sample):
+        """Format the prompt template with context and question."""
+        context = sample.metadata['context']
+        question = sample.input
+        return self.prompt_template.format(context=context, question=question)
 
     def _get_context_tokens(self, input_context: str) -> list:
         """

@@ -227,7 +269,8 @@ class NeedleHaystackAdapter(DataAdapter):
         # We want to make sure that we place our needle at a sentence break
         # so we first see what token a '.' is
         period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
-            '。'…
+            '。'
+        )  # Handle both English and Chinese periods
 
         # Then we iteration backwards until we find the first period
         while tokens_new_context and tokens_new_context[-1] not in period_tokens:

@@ -240,8 +283,10 @@ class NeedleHaystackAdapter(DataAdapter):
             # Log
             insertion_percentage = (insertion_point / len(tokens_context)) * 100
             self.insertion_percentages.append(insertion_percentage)
-            logger.debug(…  (old lines 243-244 truncated in the source view)
+            logger.debug(
+                f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                f'total length now: {len(tokens_context)} tokens'
+            )
 
             # Adjust depth for next needle
             depth_percent += depth_percent_interval

@@ -249,84 +294,78 @@ class NeedleHaystackAdapter(DataAdapter):
         new_context = self.tokenizer.decode(tokens_context)
         return new_context
 
-    def …  (old lines 252-257 truncated in the source view)
-        Returns:
-            A dictionary containing the prompt data
-        """
-        context = input_d.get('context')
-        question = input_d.get('question')
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Calculate evaluation scores by comparing prediction with reference."""
+        from evalscope.metrics import exact_match
+        from .utils import normalize_answer
 
-        …
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
 
-        …
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
 
-        …  (old lines 268-270 truncated in the source view)
-        """
-        return input_d.get('answer', '').strip()
+        norm_gold = normalize_answer(reference)
+        norm_pred = normalize_answer(filtered_prediction)
+        accuracy = exact_match(gold=norm_gold, pred=norm_pred)
 
-        …  (old lines 274-276 truncated in the source view)
-        """
-        return result
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.main_score_name = metric_name
 
-        …
-        """
-        Match the gold answer and the predicted answer.
-        """
-        from .utils import normalize_answer
-        norm_gold = normalize_answer(gold)
-        norm_pred = normalize_answer(pred)
-        # Use exact match for Needle in a Haystack
-        return exact_match(gold=norm_gold, pred=norm_pred)
+        return score
 
-    def …  (old lines 290-292 truncated in the source view)
-        """
+    def llm_match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
         from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score
 
-        …  (old lines 296-299 truncated in the source view)
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
 
-        # …  (old lines 301-303 truncated in the source view)
+        # Get metadata from task state
+        context_length = task_state.metadata.get('context_length', 0)
+        depth_percent = task_state.metadata.get('depth_percent', 0)
+        question = task_state.input_text
 
-        # …  (old lines 305-307 truncated in the source view)
+        # Get grading response
+        prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+        orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)
 
-        compute weighted mean of the bleu score of all samples  (old lines 309-311 truncated in the source view)
+        # Parse grading score with regex, [[score]]
+        accuracy = parse_score(orm_response) if orm_response else 0.0
 
-        …  (old lines 313-314 truncated in the source view)
+        metric_name = f'Context#{context_length} Depth#{depth_percent}'
+        score.value = {metric_name: accuracy}
+        score.explanation = f'LLM judge: {orm_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+            'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+        }
+        score.main_score_name = metric_name
 
-        avg_res: List[dict]  (old lines 316-317 truncated in the source view)
+        return score
 
-        return …  (old lines 319-321 truncated in the source view)
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add aggregation name for needle haystack adapter
+        return super()._on_generate_report(scores, model_name, False)
 
-    def …
+    def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
         try:
             import os
 
             from .utils import draw_score_chat
 
-            report_path = …
+            report_path = output_dir
             data_frame = report.to_dataframe()
             # split `Metric` to `Context` and `Depth`
             data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)

@@ -336,13 +375,14 @@ class NeedleHaystackAdapter(DataAdapter):
             for subset in data_frame['Subset'].unique():
                 sub_df = data_frame[data_frame['Subset'] == subset]
                 # draw charts for each subset
-                pivot_table = sub_df.pivot_table(…  (old lines 339-340 truncated in the source view)
+                pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
+                                                 aggfunc='mean').reset_index()
                 pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
                 draw_score_chat(
                     pivot_table,
                     outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
-                    show_score=self.show_score…
+                    show_score=self.show_score
+                )
 
         except Exception as e:
             logger.error(f'Error generating charts: {e}')