evalscope-0.7.2-py3-none-any.whl → evalscope-0.8.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/run_arena.py
CHANGED
@@ -3,16 +3,17 @@

 import argparse
 import os
-from pathlib import Path
 import torch
+from modelscope.utils.hf_util import GenerationConfig
+from pathlib import Path
 from tqdm import tqdm

 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
 from evalscope.models.model_adapter import ChatGenerationModelAdapter
-from evalscope.utils import get_obj_from_cfg
+from evalscope.utils import get_obj_from_cfg
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
-from modelscope.utils.hf_util import GenerationConfig

 logger = get_logger()

@@ -41,8 +42,7 @@ class ArenaWorkflow:
     def _get_obj_from_cfg(obj_cfg: dict):
         cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
         if not cls_ref:
-            logger.warning(
-                f'Class reference is not specified in config: {obj_cfg}')
+            logger.warning(f'Class reference is not specified in config: {obj_cfg}')
             return obj_cfg

         cls = get_obj_from_cfg(cls_ref)
@@ -50,19 +50,16 @@ class ArenaWorkflow:

         return obj_cfg

-    def _predict_answers(self,
-
-                         model_revision: str,
-                         precision: torch.dtype,
-                         generation_config: GenerationConfig,
-                         template_type: str) -> list:
+    def _predict_answers(self, model_id_or_path: str, model_revision: str, precision: torch.dtype,
+                         generation_config: GenerationConfig, template_type: str) -> list:

         # TODO: multi-task to be supported
-        model_adapter = ChatGenerationModelAdapter(
-
-
-
-
+        model_adapter = ChatGenerationModelAdapter(
+            model_id=model_id_or_path,
+            model_revision=model_revision,
+            torch_dtype=precision,
+            generation_config=generation_config,
+            template_type=template_type)
         res_list = []
         questions_list = jsonl_to_list(self.question_file)
         for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
@@ -92,8 +89,7 @@ class ArenaWorkflow:
         for model_name, cfg_d in self.answers_gen.items():
             enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
             if not enable:
-                logger.warning(
-                    f'Skip model {model_name} because it is not enabled.')
+                logger.warning(f'Skip model {model_name} because it is not enabled.')
                 continue

             model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
@@ -105,11 +101,12 @@ class ArenaWorkflow:
             ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
             template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)

-            answers_list = self._predict_answers(
-
-
-
-
+            answers_list = self._predict_answers(
+                model_id_or_path=model_id_or_path,
+                model_revision=model_revision,
+                precision=precision,
+                generation_config=custom_generation_config,
+                template_type=template_type)

             os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
             dump_jsonl_data(answers_list, ans_output_file)
@@ -163,8 +160,7 @@ class ArenaWorkflow:
         if enable:
             report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
             metrics = self.rating_gen.get('metrics', ['elo'])
-            baseline_model = self.rating_gen.get(
-                'baseline_model') if metrics[0] == 'pairwise' else None
+            baseline_model = self.rating_gen.get('baseline_model') if metrics[0] == 'pairwise' else None
             ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
             res_list = ae.run(self.review_file)
             rating_df = res_list[0]
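A recurring change across these diffs is that the JSONL/YAML I/O helpers now live in the new `evalscope.utils.io_utils` module (see `evalscope/utils/io_utils.py +162 -0` in the file list) rather than in `evalscope.utils`. A minimal sketch of the 0.8.1 import path, using the same call shapes as `run_arena.py` above; the file names are illustrative placeholders, not files shipped with the package:

```python
# Minimal sketch of the 0.8.1 helper locations (assumption: 'questions.jsonl' / 'answers.jsonl'
# are placeholder paths you provide; only the import path and call shapes come from the diff).
from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list

questions = jsonl_to_list('questions.jsonl')   # read a JSONL file into a list of dicts
answers = [{'input': q, 'answer': ''} for q in questions]   # illustrative transform
dump_jsonl_data(answers, 'answers.jsonl')      # write the list back out as JSONL
```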
evalscope/summarizer.py
CHANGED
@@ -1,14 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import json
 import os
-import glob
 from typing import List, Union

-from evalscope.config import TaskConfig
-from evalscope.constants import
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import EvalBackend
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import
-
+from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -20,12 +20,12 @@ class Summarizer:
     def get_report(outputs_dir: str) -> List[dict]:
         res_list: list = []

-        outputs_structure
-        reports_dir: str = outputs_structure.
+        outputs_structure = OutputsStructure(outputs_dir, is_make=False)
+        reports_dir: str = outputs_structure.reports_dir
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')

-        report_files: list = glob.glob(os.path.join(reports_dir, '
+        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
         for report_file in report_files:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))
@@ -48,39 +48,26 @@ class Summarizer:
         A report dict is overall report on a benchmark for specific model.
         """
         final_res_list: List[dict] = []
-        candidate_task_cfgs: List[
-
-        if isinstance(task_cfg,
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, str):
-            task_cfg: dict = yaml_to_dict(task_cfg)
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, TaskConfig):
-            task_cfg: dict = task_cfg.to_dict()
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, list):
+        candidate_task_cfgs: List[TaskConfig] = []
+
+        if isinstance(task_cfg, list):
             for task_cfg_item in task_cfg:
-
-                    task_cfg_item: dict = yaml_to_dict(task_cfg_item)
-                elif isinstance(task_cfg_item, TaskConfig):
-                    task_cfg_item: dict = task_cfg_item.to_dict()
-                candidate_task_cfgs.append(task_cfg_item)
+                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
         else:
-
+            candidate_task_cfgs.append(parse_task_config(task_cfg))

         for candidate_task in candidate_task_cfgs:
             logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.
+            eval_backend = candidate_task.eval_backend

-            if eval_backend == EvalBackend.NATIVE
-                outputs_dir: str = candidate_task.
-                outputs_dir: str = os.path.expanduser(outputs_dir)
+            if eval_backend == EvalBackend.NATIVE:
+                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                 if outputs_dir is None:
                     raise ValueError(f'No outputs_dir in {task_cfg}')
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
                 final_res_list.extend(res_list)

-            elif eval_backend == EvalBackend.OPEN_COMPASS
+            elif eval_backend == EvalBackend.OPEN_COMPASS:
                 eval_config = Summarizer.parse_eval_config(candidate_task)

                 work_dir = eval_config.get('work_dir') or 'outputs/default'
@@ -93,25 +80,25 @@ class Summarizer:
                     raise ValueError(f'No summary files in {res_folder_path}')

                 summary_file_path = summary_files[0]
-                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}
+                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
                 summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
                 final_res_list.extend(summary_res)
-            elif eval_backend == EvalBackend.VLM_EVAL_KIT
+            elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)

                 work_dir = eval_config.get('work_dir') or 'outputs'
                 if not os.path.exists(work_dir):
                     raise ValueError(f'work_dir {work_dir} does not exist.')
-
+
                 for model in eval_config['model']:
                     if model['name'] == 'CustomAPIModel':
                         model_name = model['type']
                     else:
                         model_name = model['name']
-
+
                 csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                 json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
-
+
                 summary_files = csv_files + json_files
                 for summary_file_path in summary_files:
                     if summary_file_path.endswith('csv'):
@@ -120,17 +107,17 @@
                         summary_res: dict = json_to_dict(summary_file_path)
                     file_name = os.path.basename(summary_file_path).split('.')[0]
                     final_res_list.append({file_name: summary_res})
-
-            elif eval_backend == EvalBackend.THIRD_PARTY
-                raise ValueError(
+
+            elif eval_backend == EvalBackend.THIRD_PARTY:
+                raise ValueError('*** The summarizer for Third party evaluation backend is not supported yet ***')
             else:
                 raise ValueError(f'Invalid eval_backend: {eval_backend}')

         return final_res_list

     @staticmethod
-    def parse_eval_config(candidate_task):
-        eval_config: Union[str, dict] = candidate_task.
+    def parse_eval_config(candidate_task: TaskConfig):
+        eval_config: Union[str, dict] = candidate_task.eval_config
         assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'

         if isinstance(eval_config, str):
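The summarizer now resolves report locations through `OutputsStructure` and parses task configs with `parse_task_config`. A minimal usage sketch of the static `Summarizer.get_report` entry point shown above; `./outputs/my_eval` is a placeholder for an existing native-backend output directory produced by a prior evaluation run:

```python
# Minimal sketch (assumption: './outputs/my_eval' points at an existing evalscope output dir;
# the reports themselves come from an earlier evaluation run, not from this snippet).
from evalscope.summarizer import Summarizer

reports = Summarizer.get_report(outputs_dir='./outputs/my_eval')
for report in reports:   # each item is one benchmark report dict loaded from reports_dir/**/*.json
    print(sorted(report.keys()))
```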
evalscope/third_party/longbench_write/README.md
CHANGED
@@ -3,14 +3,18 @@
 The LongWriter supports 10,000+ Word Generation From Long Context LLMs.
 We can use the benchmark LongBench-Write focuses more on measuring the long output quality as well as the output length.

-
+GitHub: [LongWriter](https://github.com/THUDM/LongWriter)
+
+Technical Report: [Minimum Tuning to Unlock Long Output from LLMs with High Quality Data as the Key](https://arxiv.org/abs/2410.10210)
+

 ## Usage

 ### Installation

 ```bash
-pip install evalscope[framework]
+pip install evalscope[framework] -U
+pip install vllm -U
 ```

 ### Task configuration
@@ -24,53 +28,79 @@ task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
                 model='ZhipuAI/LongWriter-glm4-9b',
                 input_data_path=None,
                 output_dir='./outputs',
-
-
-
-'
-'
-
-
-
-
-'
+                infer_config={
+                    'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
+                    'is_chat': True,
+                    'verbose': False,
+                    'generation_kwargs': {
+                        'max_new_tokens': 32768,
+                        'temperature': 0.5,
+                        'repetition_penalty': 1.0
+                    },
+                    'proc_num': 16,
                 },
-
+                eval_config={
+                    # No need to set OpenAI info if skipping the stage `eval_q`
+                    'openai_api_key': None,
+                    'openai_api_base': 'https://api.openai.com/v1/chat/completions',
+                    'openai_gpt_model': 'gpt-4o-2024-05-13',
+                    'generation_kwargs': {
+                        'max_new_tokens': 1024,
+                        'temperature': 0.5,
+                        'stop': None
+                    },
+                    'proc_num': 8
+                }
+                )

 ```
 - Arguments:
-  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process.
-  - `model`: model id on the ModelScope hub, or local model dir.
+  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process with the model-as-judge.
+  - `model`: model id on the ModelScope hub, or local model dir. Refer to [LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary) for more details.
   - `input_data_path`: input data path, default to `None`, it means to use [longbench_write](resources/longbench_write.jsonl)
   - `output_dir`: output root directory.
   - `openai_api_key`: openai_api_key when enabling the stage `eval_q` to use `Model-as-Judge`. Default to None if not needed.
   - `openai_gpt_model`: Judge model name from OpenAI. Default to `gpt-4o-2024-05-13`
-  - `
-  - `
-  - `proc_num`: proc num.
+  - `generation_kwargs`: The generation configs.
+  - `proc_num`: process number for inference and evaluation.


 2. Configuration with json (Optional):

 ```json
 {
-  "stage": [
+  "stage": [
+    "infer",
+    "eval_l",
+    "eval_q"
+  ],
   "model": "ZhipuAI/LongWriter-glm4-9b",
   "input_data_path": null,
   "output_dir": "./outputs",
-  "
-
-
-  "
-  "
+  "infer_config": {
+    "openai_api_base": "http://127.0.0.1:8000/v1/chat/completions",
+    "is_chat": true,
+    "verbose": false,
+    "generation_kwargs": {
+      "max_new_tokens": 32768,
+      "temperature": 0.5,
+      "repetition_penalty": 1.0
+    },
+    "proc_num": 16
   },
-  "
-  "
-  "
-  "
-
-
+  "eval_config": {
+    "openai_api_key": null,
+    "openai_api_base": "https://api.openai.com/v1/chat/completions",
+    "openai_gpt_model": "gpt-4o-2024-05-13",
+    "generation_kwargs": {
+      "max_new_tokens": 1024,
+      "temperature": 0.5,
+      "stop": null
+    },
+    "proc_num": 8
+  }
 }
+
 ```
 Refer to [default_task.json](default_task.json) for more details.

@@ -82,24 +112,51 @@ stage:
 - infer
 - eval_l
 - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir: ./outputs
-
-
-
-
-
-
-
-
-
-
+output_dir: "./outputs"
+infer_config:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
+  is_chat: true
+  verbose: false
+  generation_kwargs:
+    max_new_tokens: 32768
+    temperature: 0.5
+    repetition_penalty: 1.0
+  proc_num: 16
+eval_config:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
+  generation_kwargs:
+    max_new_tokens: 1024
+    temperature: 0.5
+    stop: null
+  proc_num: 8

 ```
 Refer to [default_task.yaml](default_task.yaml) for more details.


+### Run Model Inference
+We recommend to use the [vLLM](https://github.com/vllm-project/vllm) to deploy the model.
+
+Environment:
+* A100(80G) x 1
+
+
+To start vLLM server, run the following command:
+```shell
+CUDA_VISIBLE_DEVICES=0 VLLM_USE_MODELSCOPE=True vllm serve --max-model-len=65536 --gpu_memory_utilization=0.95 --trust-remote-code ZhipuAI/LongWriter-glm4-9b
+```
+- Arguments:
+  - `max-model-len`: The maximum length of the model input.
+  - `gpu_memory_utilization`: The GPU memory utilization.
+  - `trust-remote-code`: Whether to trust the remote code.
+  - `model`: Could be a model id on the ModelScope/HuggingFace hub, or a local model dir.
+
+* Note: You can use multiple GPUs by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` alternatively.
+

 ### Run the task

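Before running the `infer` stage against the `openai_api_base` configured above, it can help to confirm the vLLM server is actually serving the model. A small sketch, assuming the server was started with the `vllm serve` command from the README on the default port 8000 and exposes the usual OpenAI-compatible routes; the prompt text is illustrative:

```python
# Sanity check for the locally deployed vLLM server (assumptions: default port 8000 and
# OpenAI-compatible routes as exposed by `vllm serve`; adjust host/port/model to your setup).
import requests

base = 'http://127.0.0.1:8000/v1'
models = requests.get(f'{base}/models', timeout=10).json()   # list the served models
print([m['id'] for m in models.get('data', [])])             # expect the LongWriter model id

resp = requests.post(
    f'{base}/chat/completions',
    json={'model': 'ZhipuAI/LongWriter-glm4-9b',
          'messages': [{'role': 'user', 'content': 'Write one sentence about evaluation.'}],
          'max_tokens': 64},
    timeout=60,
)
print(resp.json()['choices'][0]['message']['content'])
```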
evalscope/third_party/longbench_write/default_task.yaml
CHANGED
@@ -2,23 +2,24 @@ stage:
 - infer
 - eval_l
 - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir:
+output_dir: "./outputs"
 infer_config:
-  openai_api_base:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
   is_chat: true
   verbose: false
   generation_kwargs:
     max_new_tokens: 32768
     temperature: 0.5
     repetition_penalty: 1.0
+  proc_num: 16
 eval_config:
-  openai_api_key:
-  openai_api_base:
-  openai_gpt_model:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
   generation_kwargs:
     max_new_tokens: 1024
     temperature: 0.5
     stop: null
-  proc_num:
+  proc_num: 8
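For reference, a small sketch that loads the updated YAML and prints the new `infer_config`/`eval_config` blocks; the path is an assumption and should point at wherever evalscope 0.8.1 is installed or checked out, and PyYAML must be available:

```python
# Minimal sketch (assumption: path adjusted to your local copy of the 0.8.1 package).
import yaml

with open('evalscope/third_party/longbench_write/default_task.yaml') as f:
    task_cfg = yaml.safe_load(f)

print(task_cfg['stage'])                            # ['infer', 'eval_l', 'eval_q']
print(task_cfg['infer_config']['proc_num'])         # 16 (new in 0.8.1)
print(task_cfg['eval_config']['openai_gpt_model'])  # gpt-4o-2024-05-13
```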
evalscope/third_party/longbench_write/eval.py
CHANGED
@@ -1,19 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) ZhipuAI, Inc. and its affiliates.
-import multiprocessing
-import os
 import json
-import random
-import re
-from concurrent.futures import ThreadPoolExecutor
-
 import matplotlib.pyplot as plt
 import numpy as np
+import os
+import random
+import re
 import requests
+from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm

-from evalscope.utils import jsonl_to_list
 from evalscope.utils import get_logger
+from evalscope.utils.io_utils import jsonl_to_list

 logger = get_logger()

@@ -52,14 +50,16 @@ class EvalLength:
         return 100 * max(0, 1. - (x / y - 1) / 2)

     def eval(self, dump_res: bool = True):
-        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        #            "type": "Community Forum", "length": 100, "response_length": 103,
+        #            "response": "I. Introduction A. xxx"}
         predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
         x, y, scores = [], [], []

-        for pred in tqdm(predictions, total=len(predictions), desc=
-            x.append(pred[
-            y.append(pred[
-            scores.append(self.score(pred[
+        for pred in tqdm(predictions, total=len(predictions), desc='[Processing eval_l]'):
+            x.append(pred['length'])
+            y.append(pred['response_length'])
+            scores.append(self.score(pred['length'], pred['response_length']))

         avg_score_l = np.mean(scores)
         logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
@@ -105,7 +105,7 @@ class EvalQuality:

     EVAL_Q = 'eval_quality'
     OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
-    DIMS = [
+    DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']

     def __init__(self,
                  model: str,
@@ -144,7 +144,8 @@ class EvalQuality:

         self.openai_api_key: str = openai_api_key
         self.openai_gpt_model = openai_gpt_model
-
+        if not self.openai_api_key:
+            logger.error('Please set `OPENAI_API_KEY` in the envs when stage `eval_q` is activated!')

     def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=None):
         tries = 0
@@ -152,17 +153,17 @@ class EvalQuality:
             tries += 1
             try:
                 headers = {
-                    'Authorization':
+                    'Authorization': 'Bearer {}'.format(self.openai_api_key),
                 }
                 messages = [
                     {'role': 'user', 'content': prompt},
                 ]
                 resp = requests.post(self.openai_api_base, json={
-
-
-
-
-
+                    'model': self.openai_gpt_model,
+                    'messages': messages,
+                    'temperature': temperature,
+                    'max_tokens': max_new_tokens,
+                    'stop': stop,
                 }, headers=headers, timeout=600)
                 if resp.status_code != 200:
                     raise Exception(resp.text)
@@ -172,16 +173,16 @@ class EvalQuality:
             except KeyboardInterrupt as e:
                 raise e
             except Exception as e:
-                if
+                if 'maximum context length' in str(e):
                     raise e
-                elif
+                elif 'triggering' in str(e):
                     return 'Trigger OpenAI\'s content management policy'
                 logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
         else:
-            logger.error(
-            return
+            logger.error('Max tries. Failed.')
+            return 'Max tries. Failed.'
         try:
-            return resp[
+            return resp['choices'][0]['message']['content']
         except:
             return ''

@@ -195,7 +196,7 @@ class EvalQuality:

     def process_data(self, item):
         # for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
-        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item[
+        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item['response'])
         scores = None
         output = self.get_response_gpt4(prompt, **self.generation_kwargs)
         try:
@@ -235,7 +236,8 @@ class EvalQuality:
         total_score = dict()
         for dim in self.DIMS:
             # scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
-            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+                      else 3 for item in self.eval_scores]
             total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
         total_score['total'] = sum(total_score.values()) / len(total_score)
         logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
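The reconstructed `get_response_gpt4` above posts the judge prompt straight to the OpenAI chat completions endpoint. A standalone sketch of that request pattern, assuming `OPENAI_API_KEY` is exported in the environment; the model name and prompt are illustrative, and the payload keys mirror the ones added in this hunk:

```python
# Standalone sketch of the request pattern used by EvalQuality.get_response_gpt4 in 0.8.1
# (assumptions: OPENAI_API_KEY is set; model and prompt are illustrative examples).
import os
import requests

resp = requests.post(
    'https://api.openai.com/v1/chat/completions',
    headers={'Authorization': 'Bearer {}'.format(os.environ['OPENAI_API_KEY'])},
    json={
        'model': 'gpt-4o-2024-05-13',
        'messages': [{'role': 'user', 'content': 'Score this answer on Relevance from 1 to 5: ...'}],
        'temperature': 0.5,
        'max_tokens': 1024,
        'stop': None,
    },
    timeout=600,
)
resp.raise_for_status()
print(resp.json()['choices'][0]['message']['content'])
```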