evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py
CHANGED
evalscope/arguments.py
ADDED
@@ -0,0 +1,73 @@
+import argparse
+import json
+
+
+class ParseStrArgsAction(argparse.Action):
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        assert isinstance(values, str), 'args should be a string.'
+
+        arg_dict = {}
+        for arg in values.strip().split(','):
+            key, value = map(str.strip, arg.split('=', 1))  # Use maxsplit=1 to handle multiple '='
+            try:
+                # Safely evaluate the value using eval
+                arg_dict[key] = eval(value)
+            except Exception:
+                # If eval fails, check if it's a boolean value
+                value_lower = value.lower()
+                if value_lower == 'true':
+                    arg_dict[key] = True
+                elif value_lower == 'false':
+                    arg_dict[key] = False
+                else:
+                    # If not a boolean, keep the original string
+                    arg_dict[key] = value
+
+        setattr(namespace, self.dest, arg_dict)
+
+
+def add_argument(parser: argparse.ArgumentParser):
+    # yapf: disable
+    # Model-related arguments
+    parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+    parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+
+    # Template-related arguments
+    parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
+    parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.')  # noqa: E501
+
+    # Dataset-related arguments
+    parser.add_argument('--datasets', type=str, nargs='+', required=False, help='Dataset id list, align to the module name in evalscope.benchmarks')  # noqa: E501
+    parser.add_argument('--dataset-args', type=json.loads, default='{}', help='The dataset args, should be a json string.')  # noqa: E501
+    parser.add_argument('--dataset-dir', help='The datasets dir.')
+    parser.add_argument('--dataset-hub', help='The datasets hub.')
+
+    # Generation configuration arguments
+    parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
+
+    # Evaluation-related arguments
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
+    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+    parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
+    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+    parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+
+    # Cache and working directory arguments
+    parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
+    parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+    parser.add_argument('--work-dir', type=str, help='The root cache dir.')
+
+    # Debug and runtime mode arguments
+    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
+    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
+    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+    # yapf: enable
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Run evaluation on benchmarks for LLMs.')
+    add_argument(parser)
+
+    args = parser.parse_args()
+    return args
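A quick sketch of what the new `ParseStrArgsAction` does with a `--model-args` string (the flag and parsing rules are from the file above; the key/value pairs are illustrative):

```python
import argparse

from evalscope.arguments import ParseStrArgsAction

parser = argparse.ArgumentParser()
parser.add_argument('--model-args', type=str, action=ParseStrArgsAction)

# Pairs are split on ',' and each value goes through eval();
# on failure, 'true'/'false' become booleans and anything else stays a string.
ns = parser.parse_args(['--model-args', 'revision=master,precision=16,trust_remote_code=true'])
print(ns.model_args)  # {'revision': 'master', 'precision': 16, 'trust_remote_code': True}
```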
evalscope/backend/base.py
CHANGED
@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from typing import Union

+from evalscope.config import TaskConfig
 from evalscope.utils import yaml_to_dict


 class BackendManager:
-    def __init__(self, config: Union[str, dict], **kwargs):
+
+    def __init__(self, config: Union[str, dict, TaskConfig], **kwargs):
         """
         BackendManager is the base class for the evaluation backend manager.
         It provides the basic configuration parsing, command generation, task submission, and result fetching.
@@ -15,6 +17,8 @@ class BackendManager:
         """
         if isinstance(config, str):
             self.config_d = yaml_to_dict(config)
+        elif isinstance(config, TaskConfig):
+            self.config_d = config.eval_config
         else:
             self.config_d = config

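In practice the widened signature means a backend manager can now be constructed from any of the three forms; a minimal sketch (file name and keys are illustrative):

```python
from evalscope.backend.base import BackendManager

BackendManager(config='backend_task.yaml')  # str: parsed via yaml_to_dict
BackendManager(config={'tool': 'mteb'})     # dict: stored on self.config_d as-is
# TaskConfig: only its eval_config dict is used (the new branch above)
# BackendManager(config=TaskConfig(...))
```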
evalscope/backend/opencompass/api_meta_template.py
CHANGED
@@ -1,6 +1,6 @@
+# isort: skip_file
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from typing import
-
+from typing import Any, Dict, List
 """
 The API meta template for OpenCompass.

@@ -26,18 +26,16 @@ class MetaTemplateType:
 TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}


-def register_template(name: str,
-                      template: Dict[str, Any],
-                      exists_ok: bool = False):
+def register_template(name: str, template: Dict[str, Any], exists_ok: bool = False):
     if not exists_ok and name in TEMPLATE_MAPPING:
-        raise ValueError(f
+        raise ValueError(f'The `{name}` has already been registered in the TEMPLATE_MAPPING.')

     TEMPLATE_MAPPING[name] = template


 def get_template(name: str) -> Dict[str, Any]:
     if name not in TEMPLATE_MAPPING:
-        raise ValueError(f
+        raise ValueError(f'The `{name}` has not been registered in the TEMPLATE_MAPPING.')

     return TEMPLATE_MAPPING[name]

@@ -46,16 +44,12 @@ def get_template(name: str) -> Dict[str, Any]:
 register_template(
     name=MetaTemplateType.default_api_meta_template_oc,
     template=dict(
-        round=[
-
-            dict(role='BOT', api_role='BOT', generate=True)
-        ],
+        round=[dict(role='HUMAN', api_role='HUMAN'),
+               dict(role='BOT', api_role='BOT', generate=True)],
         reserved_roles=[
            dict(role='SYSTEM', api_role='SYSTEM'),
         ],
-    )
-)
-
+    ))

 if __name__ == '__main__':
     res = MetaTemplateType.get_template_name_list()
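For orientation, the registry round-trip these helpers implement, sketched with a made-up template name:

```python
from evalscope.backend.opencompass.api_meta_template import get_template, register_template

tpl = dict(
    round=[dict(role='HUMAN', api_role='HUMAN'),
           dict(role='BOT', api_role='BOT', generate=True)],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

register_template('my-api-meta-template', tpl)  # hypothetical name
assert get_template('my-api-meta-template') is tpl
# Registering the same name again raises ValueError unless exists_ok=True:
register_template('my-api-meta-template', tpl, exists_ok=True)
```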
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -1,13 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from enum import Enum
-from typing import Optional, Union
 import subprocess
-from dataclasses import asdict
 import tempfile
+from dataclasses import asdict
+from enum import Enum
+from typing import Optional, Union

-from evalscope.utils import is_module_installed, get_module_path, get_valid_list
 from evalscope.backend.base import BackendManager
 from evalscope.backend.opencompass.api_meta_template import get_template
+from evalscope.utils import get_module_path, get_valid_list, is_module_installed
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -107,8 +107,8 @@ class OpenCompassBackendManager(BackendManager):

     @staticmethod
     def list_datasets(return_details: bool = False):
-        from opencompass.utils.run import get_config_from_arg
         from dataclasses import dataclass
+        from opencompass.utils.run import get_config_from_arg

         @dataclass
         class TempArgs:
@@ -160,18 +160,18 @@ class OpenCompassBackendManager(BackendManager):
             None
         """
         if run_mode == RunMode.FUNCTION:
-            from opencompass.cli.main import run_task
             from opencompass.cli.arguments import ApiModelConfig
+            from opencompass.cli.main import run_task

             assert isinstance(self.args.models, list) and len(self.args.models) > 0, 'The models are required.'

             tmp_model_d: dict = self.args.models[0]
             assert 'path' in tmp_model_d and 'openai_api_base' in tmp_model_d, \
-                f
+                f'Got invalid model config: {tmp_model_d}. \nTo get valid format: ' \
                 "{'path': 'qwen-7b-chat', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}"

             # Get valid datasets
-            dataset_names = self.args.datasets
+            dataset_names = self.args.datasets  # e.g. ['mmlu', 'ceval']
             dataset_names_all, real_dataset_all = self.list_datasets(return_details=True)

             if not dataset_names:
@@ -185,7 +185,9 @@ class OpenCompassBackendManager(BackendManager):
             assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                 f'To get the valid datasets, please refer to {dataset_names_all}'

-            valid_datasets = [
+            valid_datasets = [
+                _dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names
+            ]
             for _dataset in valid_datasets:
                 _dataset.pop('dataset_name')
                 _dataset['reader_cfg']['test_range'] = self.args.limit
@@ -232,16 +234,23 @@ class OpenCompassBackendManager(BackendManager):
 if __name__ == '__main__':

     # OpenCompassBackendManager.list_datasets()
-    # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC',
+    # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC',
+    # 'GaokaoBench', 'obqa', 'math', 'agieval', 'hellaswag', 'RTE', 'race', 'flores', 'ocnli', 'strategyqa',
+    # 'triviaqa', 'WiC', 'COPA', 'commonsenseqa', 'piqa', 'nq', 'mbpp', 'csl', 'Xsum', 'CB', 'tnews', 'ARC_c',
+    # 'afqmc', 'eprstmt', 'ReCoRD', 'bbh', 'TheoremQA', 'CMRC', 'AX_b', 'siqa', 'storycloze', 'humaneval',
+    # 'cluewsc', 'winogrande', 'lambada', 'ceval', 'bustm', 'C3', 'lcsts']

     # 'meta_template': 'default-api-meta-template-oc',
     # models: llama3-8b-instruct, qwen-7b-chat
     oc_backend_manager = OpenCompassBackendManager(
-        config={
-
-
-
-
+        config={
+            'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
+            'models': [{
+                'path': 'llama3-8b-instruct',
+                'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
+            }],
+            'limit': 5
+        })
     all_datasets = OpenCompassBackendManager.list_datasets()
     print(f'all_datasets: {all_datasets}')
     oc_backend_manager.run()
evalscope/backend/opencompass/tasks/eval_api.py
CHANGED
@@ -4,7 +4,6 @@ from opencompass.partitioners import NaivePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask

-
 with read_base():
     # from opencompass.configs.summarizers.medium import summarizer
     # from opencompass.configs.summarizers.PMMEval import summarizer
@@ -17,7 +16,6 @@ for _dataset in datasets:
         from opencompass.datasets.humaneval import humaneval_gpt_postprocess
         _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess

-
 # 2. Get models, only for placeholder, you should fill in the real model information from command line
 # See more templates in `opencompass.cli.arguments.ApiModelConfig`
 models = []
@@ -25,8 +23,5 @@ models = []
 # 3. Get infer config
 infer = dict(
     partitioner=dict(type=NaivePartitioner),
-    runner=dict(
-        type=LocalRunner,
-        max_num_workers=4,
-        task=dict(type=OpenICLInferTask)),
+    runner=dict(type=LocalRunner, max_num_workers=4, task=dict(type=OpenICLInferTask)),
 )
evalscope/backend/opencompass/tasks/eval_datasets.py
CHANGED
@@ -2,18 +2,18 @@
 from mmengine.config import read_base

 with read_base():
-    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
-    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
     from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import agieval_datasets
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.
+    from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+    from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+    from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
     from opencompass.configs.datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+    from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
     from opencompass.configs.datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from opencompass.configs.datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
-    from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
-    from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
     from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+    from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
     from opencompass.configs.datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
     from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
     from opencompass.configs.datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
@@ -21,37 +21,37 @@ with read_base():
     from opencompass.configs.datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
     from opencompass.configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
     from opencompass.configs.datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
-    from opencompass.configs.datasets.
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
     from opencompass.configs.datasets.lambada.lambada_gen_217e11 import lambada_datasets
+    from opencompass.configs.datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
+    from opencompass.configs.datasets.mbpp.mbpp_gen_830460 import mbpp_datasets
+    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
+    from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
+    from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
+    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
+    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
+    from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
     from opencompass.configs.datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets
+    from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+    from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
     from opencompass.configs.datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
     from opencompass.configs.datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
     from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
     from opencompass.configs.datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
     from opencompass.configs.datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
     from opencompass.configs.datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
-    from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
     from opencompass.configs.datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
     from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
     from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
-    from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
-    from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets
-    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
-    from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
-    from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
-    from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
-    from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
-    from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
-    from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
-    from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
-    from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
-    from opencompass.configs.datasets.winogrande.winogrande_gen_458220 import winogrande_datasets
-    from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
-    from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.
-    from opencompass.configs.datasets.PMMEval.pmmeval_gen import PMMEval_datasets
+    from opencompass.configs.datasets.winogrande.winogrande_gen_458220 import winogrande_datasets
+    from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets

 # Note: to be supported
 # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
@@ -59,7 +59,6 @@ with read_base():
 # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
 # from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

-
 datasets = []
 _locals = {k: v for k, v in locals().items() if k.endswith('_datasets')}

@@ -68,7 +67,6 @@ for k, v in _locals.items():
         _dataset['dataset_name'] = k.replace('_datasets', '')
         datasets.append(_dataset)

-
 if __name__ == '__main__':
     for _dataset in datasets:
         print(_dataset)
evalscope/backend/rag_eval/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
-from evalscope.backend.rag_eval.utils.llm import LLM, LocalLLM, ChatOpenAI
-from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.utils.clip import VisionModel
+from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
+from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py
CHANGED
@@ -1,14 +1,15 @@
 import os
 from typing import Optional, Union
-
+
 from evalscope.backend.base import BackendManager
+from evalscope.utils import get_valid_list, is_module_installed
 from evalscope.utils.logger import get_logger

-
 logger = get_logger()


 class RAGEvalBackendManager(BackendManager):
+
     def __init__(self, config: Union[str, dict], **kwargs):
         """BackendManager for VLM Evaluation Kit

@@ -20,17 +21,16 @@ class RAGEvalBackendManager(BackendManager):
     @staticmethod
     def _check_env(module_name: str):
         if is_module_installed(module_name):
-            logger.info(f
+            logger.info(f'Check `{module_name}` Installed')
         else:
-            logger.error(f
+            logger.error(f'Please install `{module_name}` first')

     @staticmethod
     def run_mteb(model_args, eval_args):
-        from evalscope.backend.rag_eval.cmteb import ModelArguments,
-        from evalscope.backend.rag_eval.cmteb import one_stage_eval, two_stage_eval
+        from evalscope.backend.rag_eval.cmteb import EvalArguments, ModelArguments, one_stage_eval, two_stage_eval

         if len(model_args) > 2:
-            raise ValueError(
+            raise ValueError('Not support multiple models yet')

         # Convert arguments to dictionary
         model_args_list = [ModelArguments(**args).to_dict() for args in model_args]
@@ -43,12 +43,8 @@ class RAGEvalBackendManager(BackendManager):

     @staticmethod
     def run_ragas(testset_args, eval_args):
-        from evalscope.backend.rag_eval.ragas import rag_eval
+        from evalscope.backend.rag_eval.ragas import EvaluationArguments, TestsetGenerationArguments, rag_eval
         from evalscope.backend.rag_eval.ragas.tasks import generate_testset
-        from evalscope.backend.rag_eval.ragas import (
-            TestsetGenerationArguments,
-            EvaluationArguments,
-        )

         if testset_args is not None:
             generate_testset(TestsetGenerationArguments(**testset_args))
@@ -62,19 +58,19 @@ class RAGEvalBackendManager(BackendManager):
         evaluate(Arguments(**args))

     def run(self, *args, **kwargs):
-        tool = self.config_d.pop(
-        if tool.lower() ==
-            self._check_env(
-            model_args = self.config_d[
-            eval_args = self.config_d[
+        tool = self.config_d.pop('tool')
+        if tool.lower() == 'mteb':
+            self._check_env('mteb')
+            model_args = self.config_d['model']
+            eval_args = self.config_d['eval']
             self.run_mteb(model_args, eval_args)
-        elif tool.lower() ==
-            self._check_env(
-            testset_args = self.config_d.get(
-            eval_args = self.config_d.get(
+        elif tool.lower() == 'ragas':
+            self._check_env('ragas')
+            testset_args = self.config_d.get('testset_generation', None)
+            eval_args = self.config_d.get('eval', None)
             self.run_ragas(testset_args, eval_args)
-        elif tool.lower() ==
-            self._check_env(
-            self.run_clip_benchmark(self.config_d[
+        elif tool.lower() == 'clip_benchmark':
+            self._check_env('webdataset')
+            self.run_clip_benchmark(self.config_d['eval'])
         else:
-            raise ValueError(f
+            raise ValueError(f'Unknown tool: {tool}')
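The rewritten `run()` makes the dispatch contract explicit: `config_d['tool']` selects the runner and the sibling keys carry its arguments. A sketch of the expected shape (top-level keys are from the code above; the nested values are illustrative placeholders):

```python
from evalscope.backend.rag_eval import RAGEvalBackendManager

config = {
    'tool': 'mteb',  # or 'ragas' / 'clip_benchmark'
    'model': [{'model_name_or_path': 'path/to/embedding-model'}],  # illustrative fields
    'eval': {'tasks': ['SomeTask']},                               # illustrative fields
}
# For 'ragas', pass 'testset_generation' and 'eval' sections instead;
# for 'clip_benchmark', only 'eval' is read (and webdataset must be installed).
# RAGEvalBackendManager(config=config).run()
```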
evalscope/backend/rag_eval/clip_benchmark/arguments.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import
+from typing import Dict, List


 @dataclass
@@ -13,7 +13,7 @@ class Arguments:
     model_name: str
     revision: str = "master"
     hub: str = "modelscope"
-
+
     For API VLM model support, you can use the following fields, (image caption only):
     model_name="gpt-4o-mini"
     api_base: str = "",
@@ -23,12 +23,12 @@ class Arguments:
     models: List[Dict] = field(default_factory=dict)  # List of paths to the pre-trained models or model identifiers
     dataset_name: List[str] = field(default_factory=list)  # List of dataset names to be used
     data_dir: str = None  # Root directory where the datasets are stored
-    split: str =
+    split: str = 'test'  # Split of the dataset to be used (e.g., 'train', 'validation', 'test')
     task: str = None
     batch_size: int = 128  # Batch size for data loading
     num_workers: int = 1  # Number of workers for data loading
     verbose: bool = True  # Flag to enable verbose logging
-    output_dir: str =
-    cache_dir: str =
+    output_dir: str = 'outputs'  # Directory where the outputs (e.g., predictions, logs) will be saved
+    cache_dir: str = 'cache'  # Directory where the dataset cache will be stored
     skip_existing: bool = False  # Flag to skip processing if outputs already exist
-    limit: int = None
+    limit: int = None  # Limit the number of samples to be processed