evalscope-0.7.2-py3-none-any.whl → evalscope-0.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of evalscope has been flagged as a potentially problematic release.
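The per-file summary below can be reproduced locally. The following is a minimal sketch, assuming both wheels have already been downloaded (for example with pip download evalscope==0.7.2 --no-deps and pip download evalscope==0.8.0 --no-deps); the local filenames used here are the standard wheel names and are otherwise assumptions.

    # Compare the file lists of the two wheels with only the standard library.
    import zipfile

    old = set(zipfile.ZipFile('evalscope-0.7.2-py3-none-any.whl').namelist())
    new = set(zipfile.ZipFile('evalscope-0.8.0-py3-none-any.whl').namelist())

    print('added:', sorted(new - old))        # files new in 0.8.0
    print('removed:', sorted(old - new))      # files deleted from 0.7.2
    print('common (possibly modified):', len(old & new))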
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/run_ms.py
DELETED
@@ -1,140 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# flake8: noqa
-
-import argparse
-import torch
-
-from evalscope.benchmarks.ceval import DATASET_ID as CEVAL_EXAM
-from evalscope.benchmarks.mmlu import DATASET_ID as MMLU
-from evalscope.benchmarks.hellaswag import DATASET_ID as HELLA_SWAG
-from evalscope.benchmarks.arc import DATASET_ID as ARC
-from evalscope.benchmarks.truthful_qa import DATASET_ID as TRUTHFUL_QA
-from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
-from evalscope.evaluator import Evaluator
-from evalscope.models.model_adapter import MultiChoiceModelAdapter, ContinuationLogitsModelAdapter
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-# TODO: add more precision
-MODEL_PRECISION_MAP = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}
-
-"""
-Run evaluation process for ModelScope Leaderboard.
-"""
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Run evaluation on a model')
-
-    parser.add_argument('--model', help='Model id from modelscope or huggingface.', required=True)
-    parser.add_argument('--revision', help='Model revision.', required=False, default=None)
-    parser.add_argument('--precision', help='Model precision.', default='bf16')
-    parser.add_argument('--work-dir', help='root work cache dir.', default=None)
-    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
-    parser.add_argument('--datasets-dir', help='Datasets dir.', default=DEFAULT_ROOT_CACHE_DIR)
-    parser.add_argument('--device-map', help='device map.', default='auto')
-    parser.add_argument('--max-eval-size', type=int, help='Max evaluation samples num for each subset', default=None)
-    parser.add_argument('--dataset-id', help='Dataset id on modelscope', required=False, default=None)
-
-    parser.add_argument('--debug',
-                        help='Debug mode, will print information for debugging.',
-                        action='store_true',
-                        default=False)
-    parser.add_argument('--dry-run',
-                        help='Dry run in single processing mode.',
-                        action='store_true',
-                        default=False)
-    parser.add_argument('--mem-cache',
-                        help='To use memory cache or not.',
-                        action='store_true',
-                        default=False)
-
-    args = parser.parse_args()
-
-    return args
-
-
-def main():
-    args = parse_args()
-    logger.info(args)
-
-    # Customize your target datasets here
-    all_benchmarks = [CEVAL_EXAM, MMLU, ARC, HELLA_SWAG, TRUTHFUL_QA]
-
-    dataset_id = args.dataset_id
-    if dataset_id is None:
-        datasets = all_benchmarks
-    elif dataset_id in all_benchmarks:
-        datasets = [dataset_id]
-    else:
-        raise ValueError(f'Unknown dataset: {dataset_id}, Supported datasets: {all_benchmarks}')
-
-    # Get model instance
-    if args.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        model_adapter = DummyChatModel(model_cfg=dict())  # TODO
-        model_id: str = 'dummy'
-        model_revision: str = 'v1.0.0'
-        model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
-    else:
-        model_id: str = args.model
-        model_revision: str = args.revision
-        model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
-
-        model_adapter = MultiChoiceModelAdapter(model_id=model_id,
-                                                device_map=args.device_map,
-                                                torch_dtype=model_precision,
-                                                model_revision=model_revision,)
-
-    # Evaluate on each dataset
-    for dataset_name in datasets:
-        if dataset_name == CEVAL_EXAM:
-            from evalscope.benchmarks.ceval import CEVALAdapter
-            data_adapter = CEVALAdapter()
-        elif dataset_name == MMLU:
-            from evalscope.benchmarks.mmlu import MMLUAdapter
-            data_adapter = MMLUAdapter()
-        elif dataset_name == ARC:
-            from evalscope.benchmarks.arc import ARCAdapter
-            data_adapter = ARCAdapter()
-        elif dataset_name == HELLA_SWAG:
-            # Note: HellaSwag should run few-shot eval
-            from evalscope.benchmarks.hellaswag import HellaSwagAdapter
-            data_adapter = HellaSwagAdapter()
-        elif dataset_name == TRUTHFUL_QA:
-            from evalscope.benchmarks.truthful_qa import TruthfulQaAdapter
-            data_adapter = TruthfulQaAdapter()
-
-        # TODO: add more datasets here
-        else:
-            raise ValueError(f'Unknown dataset: {dataset_name}')
-
-        # TODO: add mapping
-        if dataset_name in {TRUTHFUL_QA, HELLA_SWAG} and not args.dry_run:
-            model_adapter = ContinuationLogitsModelAdapter(model_id=model_id,
-                                                           device_map=args.device_map,
-                                                           torch_dtype=model_precision,
-                                                           model_revision=model_revision, )
-
-        root_work_dir = args.work_dir if args.work_dir is not None else DEFAULT_ROOT_CACHE_DIR
-        evaluator = Evaluator(dataset_name_or_path=dataset_name,
-                              subset_list=None,
-                              data_adapter=data_adapter,
-                              model_adapter=model_adapter,
-                              use_cache=args.mem_cache,
-                              root_cache_dir=root_work_dir,
-                              outputs_dir=args.outputs_dir,
-                              is_custom_outputs_dir=True,
-                              datasets_dir=args.datasets_dir, )
-
-        infer_cfg = dict(max_length=2048, limit=args.max_eval_size)
-        evaluator.eval(infer_cfg=infer_cfg, debug=args.debug)
-
-
-if __name__ == '__main__':
-    main()
-
-# Usage:
-# python evalscope/run_ms.py --model ZhipuAI/chatglm2-6b --precision fp16 --dry-run --dataset-id modelscope/mmlu --limit 10
-
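With this standalone leaderboard script removed, evaluation in 0.8.0 is expected to go through the unified entry point in evalscope/run.py, which was rewritten in this release (+184 -375) alongside the new evalscope/arguments.py and evalscope/cli/start_eval.py. The following is a hypothetical migration sketch only: the run_task entry point and the config keys shown are assumptions for illustration and have not been verified against the 0.8.0 API.

    # Hypothetical sketch, not verified 0.8.0 API: replacing the old run_ms.py flow
    # with the unified task runner.
    from evalscope.run import run_task   # assumed entry point

    task_cfg = {
        'model': 'ZhipuAI/chatglm2-6b',  # model id, as in the old --model flag
        'datasets': ['mmlu', 'ceval'],   # replaces the hard-coded benchmark dispatch above
        'limit': 10,                     # replaces --max-eval-size in the old script
    }
    run_task(task_cfg=task_cfg)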
evalscope/utils/task_utils.py
DELETED
@@ -1,22 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from enum import Enum
-
-
-class EvalBackend(Enum):
-    # Use native evaluation pipeline of EvalScope
-    NATIVE = 'Native'
-
-    # Use OpenCompass framework as the evaluation backend
-    OPEN_COMPASS = 'OpenCompass'
-
-    # Use VLM Eval Kit as the multi-modal model evaluation backend
-    VLM_EVAL_KIT = 'VLMEvalKit'
-
-    # Use RAGEval as the RAG evaluation backend
-    RAG_EVAL = 'RAGEval'
-
-    # Use third-party evaluation backend/modules
-    THIRD_PARTY = 'ThirdParty'
-
-
-
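For context, this enum is how a task selects which evaluation backend to run. A minimal sketch of the typical usage is shown below; the enum body is copied from the deleted file, while the 'eval_backend' config key and the enum's new home in 0.8.0 (plausibly evalscope/constants.py, which grew in this release) are assumptions.

    # Minimal sketch of how the removed EvalBackend enum is typically consumed.
    from enum import Enum

    class EvalBackend(Enum):
        NATIVE = 'Native'
        OPEN_COMPASS = 'OpenCompass'
        VLM_EVAL_KIT = 'VLMEvalKit'
        RAG_EVAL = 'RAGEval'
        THIRD_PARTY = 'ThirdParty'

    # Task configs carry the backend as a plain string ('eval_backend' key is assumed).
    task_cfg = {'eval_backend': EvalBackend.OPEN_COMPASS.value}

    if task_cfg['eval_backend'] == EvalBackend.NATIVE.value:
        print('use the built-in EvalScope pipeline')
    else:
        print(f"delegate to the {task_cfg['eval_backend']} backend")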
evalscope-0.7.2.dist-info/RECORD
DELETED
@@ -1,286 +0,0 @@
(All 286 manifest entries of the 0.7.2 wheel were removed. Each entry is one CSV row of the form path,sha256=<digest>,<size>, covering every module, benchmark prompt, registry config, resource file, and test shipped in 0.7.2, and ending with the self-referencing row evalscope-0.7.2.dist-info/RECORD,, . The listing is superseded by the new evalscope-0.8.0.dist-info/RECORD (+285 entries) noted in the summary above.)
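For reference, a wheel's RECORD is just a CSV manifest, one row per installed file with its path, sha256 digest, and size in bytes. The short standard-library sketch below reads the deleted manifest straight out of the 0.7.2 wheel; the local wheel filename is an assumption.

    # Read the RECORD manifest out of a wheel (a wheel is a zip archive).
    import csv
    import io
    import zipfile

    with zipfile.ZipFile('evalscope-0.7.2-py3-none-any.whl') as whl:
        record_name = next(n for n in whl.namelist() if n.endswith('.dist-info/RECORD'))
        rows = list(csv.reader(io.TextIOWrapper(whl.open(record_name), encoding='utf-8')))

    for path, digest, size in rows[:5]:   # e.g. evalscope/__init__.py, sha256=..., 105
        print(path, digest, size)
    print(len(rows), 'entries')           # 286 for the 0.7.2 wheel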
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE: file without changes
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL: file without changes
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt: file without changes
{evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt: file without changes