evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py

@@ -1,12 +1,15 @@
+ import os
  from itertools import product
  from tqdm import tqdm
- from typing import TYPE_CHECKING, List, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Union

- from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import LLMJudge, exact_match
- from evalscope.metrics.metrics import mean
- from evalscope.utils import get_logger
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import DatasetDict, DictDataLoader, MemoryDataset, Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.metric import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.logger import get_logger

  if TYPE_CHECKING:
      from evalscope.report import Report
@@ -26,55 +29,66 @@ PROMPT_TEMPLATE = """Please read the following text and answer the question belo
  Don't give information outside the document or repeat your findings."""


- @Benchmark.register(
-     name='needle_haystack',
-     pretty_name='Needle-in-a-Haystack',
-     tags=['Retrieval', 'Long Context'],
-     description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
-     'It requires the model to find specific information within a large corpus of text. '
-     '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
-     dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
-     metric_list=['AverageAccuracy'],
-     subset_list=['english', 'chinese'],
-     few_shot_num=0,
-     train_split=None,
-     eval_split='test',
-     system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
-     prompt_template=PROMPT_TEMPLATE,
-     extra_params={
-         'retrieval_question': 'What is the best thing to do in San Francisco?',
-         'needles':
-         ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
-         'context_lengths_min': 1000,
-         'context_lengths_max': 32000,
-         'context_lengths_num_intervals': 10,
-         'document_depth_percent_min': 0,
-         'document_depth_percent_max': 100,
-         'document_depth_percent_intervals': 10,
-         'tokenizer_path': 'Qwen/Qwen3-0.6B',
-         'show_score': False,
-     })
- class NeedleHaystackAdapter(DataAdapter):
+ @register_benchmark(
+     BenchmarkMeta(
+         name='needle_haystack',
+         pretty_name='Needle-in-a-Haystack',
+         tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
+         description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+         'It requires the model to find specific information within a large corpus of text. '
+         '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)',  # noqa: E501
+         dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
+         metric_list=['acc'],
+         subset_list=['english', 'chinese'],
+         eval_split='test',
+         system_prompt='You are a helpful AI bot that answers questions for a user. Keep your response short and direct',
+         prompt_template=PROMPT_TEMPLATE,
+         extra_params={
+             'retrieval_question':
+             'What is the best thing to do in San Francisco?',
+             'needles':
+             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'],
+             'context_lengths_min':
+             1000,
+             'context_lengths_max':
+             32000,
+             'context_lengths_num_intervals':
+             10,
+             'document_depth_percent_min':
+             0,
+             'document_depth_percent_max':
+             100,
+             'document_depth_percent_intervals':
+             10,
+             'tokenizer_path':
+             'Qwen/Qwen3-0.6B',
+             'show_score':
+             False,
+         }
+     )
+ )
+ class NeedleHaystackAdapter(DefaultDataAdapter):

      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.llm_as_a_judge = True
+         self._use_llm_judge = True
          # set extra params
-         extra_params = kwargs.get('extra_params', {})
-         self.retrieval_question = extra_params.get('retrieval_question',
-                                                     'What is the best thing to do in San Francisco?')
-         self.needles = extra_params.get(
+         self.retrieval_question = self.extra_params.get(
+             'retrieval_question', 'What is the best thing to do in San Francisco?'
+         )
+         self.needles = self.extra_params.get(
              'needles',
-             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n'])
-         self.context_lengths_min = extra_params.get('context_lengths_min', 1000)
-         self.context_lengths_max = extra_params.get('context_lengths_max', 32000)
-         self.context_lengths_num_intervals = extra_params.get('context_lengths_num_intervals', 10)
-         self.document_depth_percent_min = extra_params.get('document_depth_percent_min', 0)
-         self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
-         self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
-         self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
-         self.show_score = extra_params.get('show_score', False)
+             ['\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n']
+         )
+         self.context_lengths_min = self.extra_params.get('context_lengths_min', 1000)
+         self.context_lengths_max = self.extra_params.get('context_lengths_max', 32000)
+         self.context_lengths_num_intervals = self.extra_params.get('context_lengths_num_intervals', 10)
+         self.document_depth_percent_min = self.extra_params.get('document_depth_percent_min', 0)
+         self.document_depth_percent_max = self.extra_params.get('document_depth_percent_max', 100)
+         self.document_depth_percent_intervals = self.extra_params.get('document_depth_percent_intervals', 10)
+         self.tokenizer_path = self.extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+         self.show_score = self.extra_params.get('show_score', False)

          self._init_tokenizer()
          self._init_length()
@@ -88,65 +102,97 @@ class NeedleHaystackAdapter(DataAdapter):
                  self.context_lengths_min,
                  self.context_lengths_max,
                  num=self.context_lengths_num_intervals,
-                 endpoint=True)).astype(int)
+                 endpoint=True
+             )
+         ).astype(int)

          self.document_depth_percents = np.round(
              np.linspace(
                  self.document_depth_percent_min,
                  self.document_depth_percent_max,
                  num=self.document_depth_percent_intervals,
-                 endpoint=True)).astype(int)
+                 endpoint=True
+             )
+         ).astype(int)

      def _init_tokenizer(self):
          """ Initialize the tokenizer based on the provided tokenizer path."""
          from modelscope import AutoTokenizer
          self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

-     def load(self, **kwargs):
-         # default load with snapshot
-         kwargs['file_structure'] = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
-         data_dict = super().load_with_snapshot(**kwargs)
-         return data_dict
-
-     def gen_prompts(self, data_dict: dict) -> dict:
-         """
-         Generate dataset prompts from raw input, unify the prompt format for different datasets.
-
-         Args:
-             data_dict: {'english': {'test': [sample_d_1, sample_d_2, ...]},
-                         'chinese': {'test': [sample_d_1, sample_d_2, ...]}}
-
-         Returns:
-             {'subset_name': [prompt_d_1, prompt_d_2, ...]}
-             prompt_d_i (dict): refer to the output of gen_prompt method.
-
-         e.g. train -- few-shot data, test -- target dataset to evaluate.
-         """
-         res_dict: dict = {}
-
-         for sub_name, sub_data_dict in data_dict.items():
-             res_dict[sub_name] = []
-             for sample_d in sub_data_dict[self.eval_split]:
-                 # Generate prompts for each sample in the dataset
-                 tokens_context = self._get_context_tokens(sample_d['text'])
+     def load(self):
+         """Load dataset from local disk or remote."""
+         dataset_name_or_path = self.dataset_id
+         if os.path.exists(dataset_name_or_path):
+             logger.info(f'Loading dataset from {dataset_name_or_path}')
+             dataset_path = dataset_name_or_path
+         else:
+             from modelscope import dataset_snapshot_download
+             logger.info(f'Loading dataset from modelscope: > dataset_name: {dataset_name_or_path}')
+             dataset_path = dataset_snapshot_download(
+                 dataset_name_or_path, allow_file_pattern=['PaulGraham_Essays.txt', 'Journey_to_the_West.txt']
+             )
+
+         # Load datasets for both subsets
+         datasets = {}
+         file_structure = {'english': ['PaulGraham_Essays.txt'], 'chinese': ['Journey_to_the_West.txt']}
+
+         for subset_name, files in file_structure.items():
+             if subset_name not in self.subset_list:
+                 continue
+             file_path = os.path.join(dataset_path, files[0])
+             if os.path.exists(file_path):
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     text = f.read()
+
+                 # Generate samples for all combinations of context length and depth
+                 records = []
+                 tokens_context = self._get_context_tokens(text)
                  for context_length, depth_percent in tqdm(
-                         product(self.context_lengths, self.document_depth_percents),
-                         desc=f'Generating {sub_name} prompts'):
-                     # Insert needles into the context at the specified depth percentage
+                     product(self.context_lengths, self.document_depth_percents),
+                     desc=f'Generating {subset_name} samples'
+                 ):
                      context = self._insert_needles(tokens_context, depth_percent, context_length)
-                     # Build the input dictionary for the prompt
-                     input_d = {
+                     record = {
+                         'text': text,
                          'context_length': int(context_length),
                          'depth_percent': int(depth_percent),
                          'question': self.retrieval_question,
                          'answer': '\n'.join(self.needles),
                          'context': context,
                      }
-                     prompt_d = self.gen_prompt(input_d=input_d)
-                     prompt_d[AnswerKeys.RAW_INPUT] = input_d
-                     res_dict[sub_name].append(prompt_d)
-
-         return res_dict
+                     records.append(record)
+
+                 dataset = DictDataLoader(
+                     dict_list=records,
+                     limit=self.limit,
+                     repeats=self.repeats,
+                     sample_fields=self.record_to_sample,
+                     shuffle=self.shuffle,
+                 ).load()
+
+                 datasets[subset_name] = dataset
+
+         test_dataset = DatasetDict(datasets)
+         return test_dataset, None
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Convert a data record to a Sample object."""
+         return Sample(
+             input=record['question'],
+             target=record['answer'],
+             metadata={
+                 'context': record['context'],
+                 'context_length': record['context_length'],
+                 'depth_percent': record['depth_percent'],
+             }
+         )
+
+     def format_prompt_template(self, sample):
+         """Format the prompt template with context and question."""
+         context = sample.metadata['context']
+         question = sample.input
+         return self.prompt_template.format(context=context, question=question)

      def _get_context_tokens(self, input_context: str) -> list:
          """
@@ -227,7 +273,8 @@ class NeedleHaystackAdapter(DataAdapter):
              # We want to make sure that we place our needle at a sentence break
              # so we first see what token a '.' is
              period_tokens = self.tokenizer.encode('.') + self.tokenizer.encode(
-                 '。')  # Handle both English and Chinese periods
+                 '。'
+             )  # Handle both English and Chinese periods

              # Then we iteration backwards until we find the first period
              while tokens_new_context and tokens_new_context[-1] not in period_tokens:
@@ -240,8 +287,10 @@ class NeedleHaystackAdapter(DataAdapter):
              # Log
              insertion_percentage = (insertion_point / len(tokens_context)) * 100
              self.insertion_percentages.append(insertion_percentage)
-             logger.debug(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
-                          f'total length now: {len(tokens_context)} tokens')
+             logger.debug(
+                 f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, "
+                 f'total length now: {len(tokens_context)} tokens'
+             )

              # Adjust depth for next needle
              depth_percent += depth_percent_interval
@@ -249,84 +298,78 @@ class NeedleHaystackAdapter(DataAdapter):
          new_context = self.tokenizer.decode(tokens_context)
          return new_context

-     def gen_prompt(self, input_d: dict, **kwargs) -> dict:
-         """
-         Generate the prompt for each sample in the dataset.
-         Args:
-             input_d: A dictionary containing the input data for the prompt.
-                 It should contain 'context' and optionally 'question'.
-         Returns:
-             A dictionary containing the prompt data
-         """
-         context = input_d.get('context')
-         question = input_d.get('question')
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """Calculate evaluation scores by comparing prediction with reference."""
+         from evalscope.metrics import exact_match
+         from .utils import normalize_answer

-         prompt = self.prompt_template.format(context=context, question=question)
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )

-         return self.gen_prompt_data(prompt, system_prompt=self.system_prompt)
+         # Get metadata from task state
+         context_length = task_state.metadata.get('context_length', 0)
+         depth_percent = task_state.metadata.get('depth_percent', 0)

-     def get_gold_answer(self, input_d: dict) -> str:
-         """
-         Parse the raw input labels (gold).
-         """
-         return input_d.get('answer', '').strip()
+         norm_gold = normalize_answer(reference)
+         norm_pred = normalize_answer(filtered_prediction)
+         accuracy = exact_match(gold=norm_gold, pred=norm_pred)

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-         """
-         Parse the predicted result and extract proper answer.
-         """
-         return result
+         metric_name = f'Context#{context_length} Depth#{depth_percent}'
+         score.value = {metric_name: accuracy}
+         score.main_score_name = metric_name

-     def match(self, gold: str, pred: str) -> float:
-         """
-         Match the gold answer and the predicted answer.
-         """
-         from .utils import normalize_answer
-         norm_gold = normalize_answer(gold)
-         norm_pred = normalize_answer(pred)
-         # Use exact match for Needle in a Haystack
-         return exact_match(gold=norm_gold, pred=norm_pred)
+         return score

-     def llm_match(self, gold: str, pred: str, judge: LLMJudge, **kwargs) -> dict:
-         """
-         Use LLM as a judge to evaluate the predicted answer against the gold answer.
-         """
+     def llm_match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+         """Use LLM as a judge to evaluate the predicted answer against the gold answer."""
          from .utils import GENERAL_ORM_PROMPT, ORM_USER_TEMPLATE, parse_score

-         raw_input = kwargs.get('raw_input', None)
-         question = raw_input.get('question')
-         context_length = raw_input.get('context_length')
-         depth_percent = raw_input.get('depth_percent')
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )

-         # get grading response
-         prompt = ORM_USER_TEMPLATE.format(question=question, gold=gold, pred=pred)
-         orm_response = judge(prompt=prompt, system_prompt=GENERAL_ORM_PROMPT)
+         # Get metadata from task state
+         context_length = task_state.metadata.get('context_length', 0)
+         depth_percent = task_state.metadata.get('depth_percent', 0)
+         question = task_state.input_text

-         # parse grading score with regex, [[score]]
-         score = parse_score(orm_response) if orm_response else 0.0
-         return {f'Context#{context_length} Depth#{depth_percent}': score}
+         # Get grading response
+         prompt = ORM_USER_TEMPLATE.format(question=question, gold=reference, pred=filtered_prediction)
+         orm_response = self.llm_judge.judge(prompt, system_prompt=GENERAL_ORM_PROMPT)

-     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-         """
-         compute weighted mean of the bleu score of all samples
+         # Parse grading score with regex, [[score]]
+         accuracy = parse_score(orm_response) if orm_response else 0.0

-         Args:
-             review_res_list: [score1, score2, ...]
+         metric_name = f'Context#{context_length} Depth#{depth_percent}'
+         score.value = {metric_name: accuracy}
+         score.explanation = f'LLM judge: {orm_response}'
+         score.metadata = {
+             'source': 'llm_judge',
+             'judge_strategy': getattr(self, 'judge_strategy', 'default'),
+             'model': self.llm_judge.model_id if hasattr(self.llm_judge, 'model_id') else 'unknown'
+         }
+         score.main_score_name = metric_name

-         Returns:
-             avg_res: List[dict]
+         return score

-         """
-         items = super().compute_dict_metric(review_res_list, **kwargs)
-         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
+     def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+         # Don't add aggregation name for needle haystack adapter
+         return super()._on_generate_report(scores, model_name, False)

-     def post_process_report(self, report: 'Report', **kwargs):
+     def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
          try:
              import os

              from .utils import draw_score_chat

-             report_path = kwargs.get('report_path')
+             report_path = output_dir
              data_frame = report.to_dataframe()
              # split `Metric` to `Context` and `Depth`
              data_frame[['Context', 'Depth']] = data_frame['Metric'].str.split(' ', n=1, expand=True)
@@ -336,13 +379,14 @@ class NeedleHaystackAdapter(DataAdapter):
              for subset in data_frame['Subset'].unique():
                  sub_df = data_frame[data_frame['Subset'] == subset]
                  # draw charts for each subset
-                 pivot_table = sub_df.pivot_table(
-                     values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
+                 pivot_table = sub_df.pivot_table(values='Score', index=['Depth', 'Context'],
                                                   aggfunc='mean').reset_index()
                  pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
                  draw_score_chat(
                      pivot_table,
                      outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
-                     show_score=self.show_score)
+                     show_score=self.show_score
+                 )

          except Exception as e:
              logger.error(f'Error generating charts: {e}')
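
Read together, the hunks above illustrate the adapter migration that runs through most of the benchmark changes in this release: the removed @Benchmark.register(...) decorator on a DataAdapter subclass becomes @register_benchmark(BenchmarkMeta(...)) on a DefaultDataAdapter subclass, raw records are turned into Sample objects via record_to_sample, and matching returns Score objects instead of plain floats or dicts. The following is a minimal sketch of a custom adapter against the new API, using only names that appear in this diff; the benchmark name, dataset id, and the assumption that the stock DefaultDataAdapter loading and prompting paths are sufficient are hypothetical, not taken from the package.

# Hypothetical adapter sketch based on the 1.0 API surface visible in this diff.
from typing import Any, Dict

from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
from evalscope.api.dataset import Sample
from evalscope.api.registry import register_benchmark


@register_benchmark(
    BenchmarkMeta(
        name='my_qa_benchmark',              # hypothetical benchmark name
        dataset_id='my-org/my-qa-dataset',   # hypothetical ModelScope dataset id
        metric_list=['acc'],
        subset_list=['default'],
        eval_split='test',
        prompt_template='{question}',        # assumed to be applied by the default adapter
    )
)
class MyQABenchmarkAdapter(DefaultDataAdapter):

    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
        # Mirror NeedleHaystackAdapter.record_to_sample above: map one raw record
        # to the Sample(input/target/metadata) structure consumed by the evaluator.
        return Sample(
            input=record['question'],
            target=record['answer'],
            metadata={k: v for k, v in record.items() if k not in ('question', 'answer')},
        )

Overriding load(), format_prompt_template(), match_score(), or llm_match_score(), as NeedleHaystackAdapter does in this diff, appears to be necessary only when the default loading, prompting, or scoring behaviour is not enough for the benchmark.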