evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/run_arena.py
CHANGED
@@ -3,16 +3,16 @@
 
 import argparse
 import os
-from pathlib import Path
 import torch
+from modelscope.utils.hf_util import GenerationConfig
+from pathlib import Path
 from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
 from evalscope.models.model_adapter import ChatGenerationModelAdapter
-from evalscope.utils import
+from evalscope.utils import dump_jsonl_data, get_obj_from_cfg, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
-from modelscope.utils.hf_util import GenerationConfig
 
 logger = get_logger()
 
@@ -41,8 +41,7 @@ class ArenaWorkflow:
     def _get_obj_from_cfg(obj_cfg: dict):
         cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
         if not cls_ref:
-            logger.warning(
-                f'Class reference is not specified in config: {obj_cfg}')
+            logger.warning(f'Class reference is not specified in config: {obj_cfg}')
             return obj_cfg
 
         cls = get_obj_from_cfg(cls_ref)
@@ -50,19 +49,16 @@ class ArenaWorkflow:
 
         return obj_cfg
 
-    def _predict_answers(self,
-
-                         model_revision: str,
-                         precision: torch.dtype,
-                         generation_config: GenerationConfig,
-                         template_type: str) -> list:
+    def _predict_answers(self, model_id_or_path: str, model_revision: str, precision: torch.dtype,
+                         generation_config: GenerationConfig, template_type: str) -> list:
 
         # TODO: multi-task to be supported
-        model_adapter = ChatGenerationModelAdapter(
-
-
-
-
+        model_adapter = ChatGenerationModelAdapter(
+            model_id=model_id_or_path,
+            model_revision=model_revision,
+            torch_dtype=precision,
+            generation_config=generation_config,
+            template_type=template_type)
         res_list = []
         questions_list = jsonl_to_list(self.question_file)
         for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
@@ -92,8 +88,7 @@ class ArenaWorkflow:
         for model_name, cfg_d in self.answers_gen.items():
             enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
             if not enable:
-                logger.warning(
-                    f'Skip model {model_name} because it is not enabled.')
+                logger.warning(f'Skip model {model_name} because it is not enabled.')
                 continue
 
             model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
@@ -105,11 +100,12 @@ class ArenaWorkflow:
             ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
             template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
 
-            answers_list = self._predict_answers(
-
-
-
-
+            answers_list = self._predict_answers(
+                model_id_or_path=model_id_or_path,
+                model_revision=model_revision,
+                precision=precision,
+                generation_config=custom_generation_config,
+                template_type=template_type)
 
             os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
             dump_jsonl_data(answers_list, ans_output_file)
@@ -163,8 +159,7 @@ class ArenaWorkflow:
         if enable:
             report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
             metrics = self.rating_gen.get('metrics', ['elo'])
-            baseline_model = self.rating_gen.get(
-                'baseline_model') if metrics[0] == 'pairwise' else None
+            baseline_model = self.rating_gen.get('baseline_model') if metrics[0] == 'pairwise' else None
             ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
             res_list = ae.run(self.review_file)
             rating_df = res_list[0]
evalscope/summarizer.py
CHANGED
@@ -1,14 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import json
 import os
-import glob
 from typing import List, Union
 
 from evalscope.config import TaskConfig
-from evalscope.constants import OutputsStructure
+from evalscope.constants import EvalBackend, OutputsStructure
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import
-    csv_to_list
+from evalscope.utils import csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -20,8 +19,8 @@ class Summarizer:
     def get_report(outputs_dir: str) -> List[dict]:
         res_list: list = []
 
-        outputs_structure
-        reports_dir: str = outputs_structure.
+        outputs_structure = OutputsStructure(outputs_dir, is_make=False)
+        reports_dir: str = outputs_structure.reports_dir
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')
 
@@ -70,9 +69,9 @@ class Summarizer:
 
        for candidate_task in candidate_task_cfgs:
            logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE
+            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE
 
-            if eval_backend == EvalBackend.NATIVE
+            if eval_backend == EvalBackend.NATIVE:
                 outputs_dir: str = candidate_task.get('outputs')
                 outputs_dir: str = os.path.expanduser(outputs_dir)
                 if outputs_dir is None:
@@ -80,7 +79,7 @@ class Summarizer:
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
                 final_res_list.extend(res_list)
 
-            elif eval_backend == EvalBackend.OPEN_COMPASS
+            elif eval_backend == EvalBackend.OPEN_COMPASS:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
                 work_dir = eval_config.get('work_dir') or 'outputs/default'
@@ -93,25 +92,25 @@ class Summarizer:
                     raise ValueError(f'No summary files in {res_folder_path}')
 
                 summary_file_path = summary_files[0]
-                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}
+                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
                 summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
                 final_res_list.extend(summary_res)
-            elif eval_backend == EvalBackend.VLM_EVAL_KIT
+            elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
                 work_dir = eval_config.get('work_dir') or 'outputs'
                 if not os.path.exists(work_dir):
                     raise ValueError(f'work_dir {work_dir} does not exist.')
-
+
                 for model in eval_config['model']:
                     if model['name'] == 'CustomAPIModel':
                         model_name = model['type']
                     else:
                         model_name = model['name']
-
+
                     csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                     json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
-
+
                     summary_files = csv_files + json_files
                     for summary_file_path in summary_files:
                         if summary_file_path.endswith('csv'):
@@ -120,9 +119,9 @@ class Summarizer:
                             summary_res: dict = json_to_dict(summary_file_path)
                             file_name = os.path.basename(summary_file_path).split('.')[0]
                             final_res_list.append({file_name: summary_res})
-
-            elif eval_backend == EvalBackend.THIRD_PARTY
-                raise ValueError(
+
+            elif eval_backend == EvalBackend.THIRD_PARTY:
+                raise ValueError('*** The summarizer for Third party evaluation backend is not supported yet ***')
             else:
                 raise ValueError(f'Invalid eval_backend: {eval_backend}')
 
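For reference, a small usage sketch (not part of the diff) of the `Summarizer.get_report` entry point whose body is reworked above; the outputs directory is a placeholder and must point at the reports structure produced by a native evalscope run.

```python
# Sketch only: collects report dicts from a finished native run.
# The outputs path below is a placeholder assumption.
from evalscope.summarizer import Summarizer

report_list = Summarizer.get_report(outputs_dir='./outputs/20240101_000000')
for report in report_list:
    print(report)
```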
evalscope/third_party/longbench_write/README.md
CHANGED
@@ -3,14 +3,18 @@
 The LongWriter supports 10,000+ Word Generation From Long Context LLMs.
 We can use the benchmark LongBench-Write focuses more on measuring the long output quality as well as the output length.
 
-
+GitHub: [LongWriter](https://github.com/THUDM/LongWriter)
+
+Technical Report: [Minimum Tuning to Unlock Long Output from LLMs with High Quality Data as the Key](https://arxiv.org/abs/2410.10210)
+
 
 ## Usage
 
 ### Installation
 
 ```bash
-pip install evalscope[framework]
+pip install evalscope[framework] -U
+pip install vllm -U
 ```
 
 ### Task configuration
@@ -24,53 +28,79 @@ task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
                 model='ZhipuAI/LongWriter-glm4-9b',
                 input_data_path=None,
                 output_dir='./outputs',
-
-
-
-
-
-
-
-
-
-
+                infer_config={
+                    'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
+                    'is_chat': True,
+                    'verbose': False,
+                    'generation_kwargs': {
+                        'max_new_tokens': 32768,
+                        'temperature': 0.5,
+                        'repetition_penalty': 1.0
+                    },
+                    'proc_num': 16,
                 },
-
+                eval_config={
+                    # No need to set OpenAI info if skipping the stage `eval_q`
+                    'openai_api_key': None,
+                    'openai_api_base': 'https://api.openai.com/v1/chat/completions',
+                    'openai_gpt_model': 'gpt-4o-2024-05-13',
+                    'generation_kwargs': {
+                        'max_new_tokens': 1024,
+                        'temperature': 0.5,
+                        'stop': None
+                    },
+                    'proc_num': 8
+                }
+                )
 
 ```
 - Arguments:
-  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process.
-  - `model`: model id on the ModelScope hub, or local model dir.
+  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process with the model-as-judge.
+  - `model`: model id on the ModelScope hub, or local model dir. Refer to [LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary) for more details.
   - `input_data_path`: input data path, default to `None`, it means to use [longbench_write](resources/longbench_write.jsonl)
   - `output_dir`: output root directory.
   - `openai_api_key`: openai_api_key when enabling the stage `eval_q` to use `Model-as-Judge`. Default to None if not needed.
   - `openai_gpt_model`: Judge model name from OpenAI. Default to `gpt-4o-2024-05-13`
-  - `
-  - `
-  - `proc_num`: proc num.
+  - `generation_kwargs`: The generation configs.
+  - `proc_num`: process number for inference and evaluation.
 
 
 2. Configuration with json (Optional):
 
 ```json
 {
-    "stage": [
+    "stage": [
+        "infer",
+        "eval_l",
+        "eval_q"
+    ],
     "model": "ZhipuAI/LongWriter-glm4-9b",
     "input_data_path": null,
     "output_dir": "./outputs",
-
-
-
-
-
+    "infer_config": {
+        "openai_api_base": "http://127.0.0.1:8000/v1/chat/completions",
+        "is_chat": true,
+        "verbose": false,
+        "generation_kwargs": {
+            "max_new_tokens": 32768,
+            "temperature": 0.5,
+            "repetition_penalty": 1.0
+        },
+        "proc_num": 16
    },
-
-
-
-
-
-
+    "eval_config": {
+        "openai_api_key": null,
+        "openai_api_base": "https://api.openai.com/v1/chat/completions",
+        "openai_gpt_model": "gpt-4o-2024-05-13",
+        "generation_kwargs": {
+            "max_new_tokens": 1024,
+            "temperature": 0.5,
+            "stop": null
+        },
+        "proc_num": 8
+    }
 }
+
 ```
 Refer to [default_task.json](default_task.json) for more details.
 
@@ -82,24 +112,51 @@ stage:
   - infer
  - eval_l
   - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir: ./outputs
-
-
-
-
-
-
-
-
-
-
+output_dir: "./outputs"
+infer_config:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
+  is_chat: true
+  verbose: false
+  generation_kwargs:
+    max_new_tokens: 32768
+    temperature: 0.5
+    repetition_penalty: 1.0
+  proc_num: 16
+eval_config:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
+  generation_kwargs:
+    max_new_tokens: 1024
+    temperature: 0.5
+    stop: null
+  proc_num: 8
 
 ```
 Refer to [default_task.yaml](default_task.yaml) for more details.
 
 
+### Run Model Inference
+We recommend to use the [vLLM](https://github.com/vllm-project/vllm) to deploy the model.
+
+Environment:
+* A100(80G) x 1
+
+
+To start vLLM server, run the following command:
+```shell
+CUDA_VISIBLE_DEVICES=0 VLLM_USE_MODELSCOPE=True vllm serve --max-model-len=65536 --gpu_memory_utilization=0.95 --trust-remote-code ZhipuAI/LongWriter-glm4-9b
+```
+- Arguments:
+  - `max-model-len`: The maximum length of the model input.
+  - `gpu_memory_utilization`: The GPU memory utilization.
+  - `trust-remote-code`: Whether to trust the remote code.
+  - `model`: Could be a model id on the ModelScope/HuggingFace hub, or a local model dir.
+
+* Note: You can use multiple GPUs by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` alternatively.
+
 
 
 ### Run the task
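To make the `infer_config` in the README above concrete, here is a hedged sketch of the kind of OpenAI-compatible chat request the inference stage can send to the locally served model. The exact payload built by `infer.py` is not shown in this diff, so the fields below are assumptions modeled on the configured `openai_api_base` and on the request shape visible in `eval.py` further down.

```python
# Assumed request shape for the vLLM endpoint configured above; infer.py's actual
# payload may differ (it is not part of this diff).
import requests

resp = requests.post(
    'http://127.0.0.1:8000/v1/chat/completions',  # openai_api_base from infer_config
    json={
        'model': 'ZhipuAI/LongWriter-glm4-9b',
        'messages': [{'role': 'user', 'content': 'Write a 1000-word blog post about tea culture.'}],
        'temperature': 0.5,
        'max_tokens': 32768,                      # mirrors max_new_tokens above
    },
    timeout=600)
resp.raise_for_status()
print(resp.json()['choices'][0]['message']['content'])
```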
evalscope/third_party/longbench_write/default_task.yaml
CHANGED
@@ -2,23 +2,24 @@ stage:
   - infer
   - eval_l
   - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir:
+output_dir: "./outputs"
 infer_config:
-  openai_api_base:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
   is_chat: true
   verbose: false
   generation_kwargs:
     max_new_tokens: 32768
     temperature: 0.5
     repetition_penalty: 1.0
+  proc_num: 16
 eval_config:
-  openai_api_key:
-  openai_api_base:
-  openai_gpt_model:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
   generation_kwargs:
     max_new_tokens: 1024
     temperature: 0.5
     stop: null
-  proc_num:
+  proc_num: 8
evalscope/third_party/longbench_write/eval.py
CHANGED
@@ -1,19 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) ZhipuAI, Inc. and its affiliates.
-import multiprocessing
-import os
 import json
-import random
-import re
-from concurrent.futures import ThreadPoolExecutor
-
 import matplotlib.pyplot as plt
 import numpy as np
+import os
+import random
+import re
 import requests
+from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-from evalscope.utils import jsonl_to_list
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger, jsonl_to_list
 
 logger = get_logger()
 
@@ -52,14 +49,16 @@ class EvalLength:
         return 100 * max(0, 1. - (x / y - 1) / 2)
 
     def eval(self, dump_res: bool = True):
-        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        #            "type": "Community Forum", "length": 100, "response_length": 103,
+        #            "response": "I. Introduction A. xxx"}
         predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
         x, y, scores = [], [], []
 
-        for pred in tqdm(predictions, total=len(predictions), desc=
-            x.append(pred[
-            y.append(pred[
-            scores.append(self.score(pred[
+        for pred in tqdm(predictions, total=len(predictions), desc='[Processing eval_l]'):
+            x.append(pred['length'])
+            y.append(pred['response_length'])
+            scores.append(self.score(pred['length'], pred['response_length']))
 
         avg_score_l = np.mean(scores)
         logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
@@ -105,7 +104,7 @@ class EvalQuality:
 
     EVAL_Q = 'eval_quality'
     OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
-    DIMS = [
+    DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']
 
     def __init__(self,
                  model: str,
@@ -144,7 +143,8 @@ class EvalQuality:
 
         self.openai_api_key: str = openai_api_key
         self.openai_gpt_model = openai_gpt_model
-
+        if not self.openai_api_key:
+            logger.error('Please set `OPENAI_API_KEY` in the envs when stage `eval_q` is activated!')
 
     def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=None):
         tries = 0
@@ -152,17 +152,17 @@ class EvalQuality:
             tries += 1
             try:
                 headers = {
-                    'Authorization':
+                    'Authorization': 'Bearer {}'.format(self.openai_api_key),
                 }
                 messages = [
                     {'role': 'user', 'content': prompt},
                 ]
                 resp = requests.post(self.openai_api_base, json={
-
-
-
-
-
+                    'model': self.openai_gpt_model,
+                    'messages': messages,
+                    'temperature': temperature,
+                    'max_tokens': max_new_tokens,
+                    'stop': stop,
                 }, headers=headers, timeout=600)
                 if resp.status_code != 200:
                     raise Exception(resp.text)
@@ -172,16 +172,16 @@ class EvalQuality:
             except KeyboardInterrupt as e:
                 raise e
             except Exception as e:
-                if
+                if 'maximum context length' in str(e):
                     raise e
-                elif
+                elif 'triggering' in str(e):
                     return 'Trigger OpenAI\'s content management policy'
                 logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
         else:
-            logger.error(
-            return
+            logger.error('Max tries. Failed.')
+            return 'Max tries. Failed.'
         try:
-            return resp[
+            return resp['choices'][0]['message']['content']
         except:
             return ''
 
@@ -195,7 +195,7 @@ class EvalQuality:
 
     def process_data(self, item):
         # for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
-        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item[
+        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item['response'])
         scores = None
         output = self.get_response_gpt4(prompt, **self.generation_kwargs)
         try:
@@ -235,7 +235,8 @@ class EvalQuality:
         total_score = dict()
         for dim in self.DIMS:
             # scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
-            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+                      else 3 for item in self.eval_scores]
             total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
         total_score['total'] = sum(total_score.values()) / len(total_score)
         logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
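
As a quick sanity check on the scoring logic visible in the hunks above, a hedged sketch that restates the two formulas exactly as shown: the length-score branch `100 * max(0, 1. - (x / y - 1) / 2)` from `EvalLength.score` (other branches, if any, are not part of this diff) and the per-dimension quality rescaling `((mean - 1) * 25)` from `EvalQuality`. The sample numbers are invented.

```python
# Sketch only: restates the formulas shown in the eval.py hunks; sample inputs are made up.

def length_score_branch(x: float, y: float) -> float:
    # Return statement shown in EvalLength.score: x is the requested length,
    # y is the produced response length (per the eval() loop above).
    return 100 * max(0, 1. - (x / y - 1) / 2)

def quality_dim_score(judge_scores: list) -> float:
    # EvalQuality: a 1-5 judge rating per dimension, rescaled to 0-100
    # (missing scores default to 3 in the hunk above).
    return (sum(judge_scores) / len(judge_scores) - 1) * 25

print(length_score_branch(1000, 800))  # 87.5: response shorter than requested
print(quality_dim_score([4, 5, 3]))    # 75.0
```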