evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
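Most hunks below are mechanical re-formatting (wrapped calls and literals collapsed onto single lines), but one recurring substantive change is the import path of the JSONL helpers: `jsonl_to_list` and `dump_jsonl_data` now come from the new `evalscope/utils/io_utils.py` module instead of `evalscope.utils`. The following is a minimal, illustrative sketch of how caller code might absorb that move; the try/except fallback is an assumption for keeping 0.7.x compatibility, not something this diff adds, and the sample path is copied from the `auto_reviewer.py` hunk below.

```python
# Illustrative sketch only: mirrors the import move visible in the hunks below
# (evalscope.utils -> evalscope.utils.io_utils). The fallback branch is an assumption.
try:
    # evalscope 0.8.x: JSONL helpers live in the dedicated io_utils module
    from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
except ImportError:
    # evalscope 0.7.x exposed the same helpers from evalscope.utils
    from evalscope.utils import dump_jsonl_data, jsonl_to_list

# Sample path taken from the auto_reviewer.py __main__ block in this diff.
answers = jsonl_to_list('outputs/arena/default/answers/answer_chatglm2-6b.jsonl')
print(len(answers))
```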
evalscope/evaluator/rating_eval.py
CHANGED

@@ -1,24 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from typing import List, Union
-
 import pandas as pd
 import pyarrow as pa
+from typing import List, Union

 from evalscope.constants import MetricMembers
 from evalscope.utils.arena_utils import compute_elo
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from evalscope.utils import jsonl_to_list

 logger = get_logger()

-DEFAULT_COLUMNS_MAPPING = {
-    'model_a': 'model_a',
-    'model_b': 'model_b',
-    'win': 'win',
-    'tstamp': 'ts',
-    'language': 'lang'
-}
+DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}


 class RatingEvaluate(object):
@@ -41,10 +34,9 @@ class RatingEvaluate(object):
         elo_ratings = compute_elo(battles)
         col_model = 'Model'
         col_elo_rating = 'Elo_Rating'
-        elo_ratings_res = pd.DataFrame(
-            [[n, elo_ratings[n]] for n in elo_ratings.keys()],
-            columns=[col_model, col_elo_rating]).sort_values(
-                col_elo_rating, ascending=False).reset_index(drop=True)
+        elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
+                                       columns=[col_model, col_elo_rating]).sort_values(
+                                           col_elo_rating, ascending=False).reset_index(drop=True)
         elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
         return elo_ratings_res

@@ -89,23 +81,11 @@ class RatingEvaluate(object):
                 'tie': 1
             }]
         else:
-            return [{
-                'model': winner,
-                'win': 1,
-                'loss': 0,
-                'tie': 0
-            }, {
-                'model': loser,
-                'win': 0,
-                'loss': 1,
-                'tie': 0
-            }]
+            return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]

     def compute_pairwise_rating(self, raw_data):
         df_all = self.preprocess(raw_data_df=raw_data)
-        model_list = (
-            df_all['model_a'].unique().tolist()
-            + df_all['model_b'].unique().tolist())
+        model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
         model_list = list(set(model_list))

         list_res = []
@@ -114,8 +94,7 @@ class RatingEvaluate(object):
             if self.baseline_model is not None:
                 if self.baseline_model not in [row['model_a'], row['model_b']]:
                     logger.warning(
-                        f'One of the models in the battle should be the baseline model: {self.baseline_model}'
-                    )
+                        f'One of the models in the battle should be the baseline model: {self.baseline_model}')
                     continue
             rating = self.get_single_pairwise_rating(row)
             list_res = list_res + rating
@@ -149,15 +128,15 @@ class RatingEvaluate(object):

         for metric in self.metrics:

-            if metric == MetricMembers.ELO
+            if metric == MetricMembers.ELO:
                 res = self.compute_elo_rating(raw_data)
                 res_all.append(res)

-            elif metric == MetricMembers.PAIRWISE
+            elif metric == MetricMembers.PAIRWISE:
                 res = self.compute_pairwise_rating(raw_data)
                 res_all.append(res)

-            elif metric == MetricMembers.SCORE
+            elif metric == MetricMembers.SCORE:
                 res = self.compute_score_rating(raw_data)
                 res_all.append(res)


evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED

@@ -2,6 +2,7 @@
 # flake8: noqa

 import os
+import pandas as pd
 import random
 import sys
 import time
@@ -9,15 +10,11 @@ from abc import ABC, abstractmethod
 from functools import partial
 from typing import Any, List

-import pandas as pd
-
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers
-from evalscope.utils.arena_utils import (get_battle_pairs,
-                                         merge_ques_ans,
-                                         shuffle_pairwise_preferences)
-from evalscope.utils import dump_jsonl_data, jsonl_to_list, random_seeded_choice
+from evalscope.utils import completion_parsers, random_seeded_choice
+from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -33,8 +30,7 @@ class BaseReviewer(ABC):
         """
         Run pairwise battles with given models.
         """
-        raise NotImplementedError(
-            'run() method must be implemented in your subclass.')
+        raise NotImplementedError('run() method must be implemented in your subclass.')


 class AutoReviewerGpt4(BaseReviewer):
@@ -71,13 +67,9 @@ class AutoReviewerGpt4(BaseReviewer):

         self.review_result_file = review_result_file
         self.prompt_list = jsonl_to_list(prompt_file)
-        self.answer_list = [
-            jsonl_to_list(answer_file) for answer_file in answer_file_list
-        ]
-        self.reference_list = jsonl_to_list(
-            reference_file) if reference_file else []
-        self.cache_list = jsonl_to_list(
-            cache_file) if cache_file and os.path.isfile(cache_file) else []
+        self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
+        self.reference_list = jsonl_to_list(reference_file) if reference_file else []
+        self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []

         self.reviewer_args = reviewer_args if reviewer_args \
             else self._get_default_args()
@@ -88,24 +80,18 @@ class AutoReviewerGpt4(BaseReviewer):
             self.answer_list.append(jsonl_to_list(baseline_file))
             self.baseline_idx = len(self.answer_list) - 1

-        self.position_bias_mitigation = self.reviewer_args.pop(
-            EvalConfigKeys.POSITION_BIAS_MITIGATION,
-            PositionBiasMitigation.NONE)
+        self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
+                                                               PositionBiasMitigation.NONE)
         if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-            self.random_seed = self.reviewer_args.pop(
-                EvalConfigKeys.RANDOM_SEED, 123)
-
-        fn_completion_parser = self.reviewer_args.pop(
-            EvalConfigKeys.FN_COMPLETION_PARSER,
-            FnCompletionParser.LMSYS_PARSER)
-        completion_parser_kwargs = self.reviewer_args.pop(
-            EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
+            self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
+
+        fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
+                                                      FnCompletionParser.LMSYS_PARSER)
+        completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
         if isinstance(fn_completion_parser, str):
-            fn_completion_parser = getattr(completion_parsers,
-                                           fn_completion_parser)
+            fn_completion_parser = getattr(completion_parsers, fn_completion_parser)

-        self.fn_completion_parser = partial(fn_completion_parser,
-                                            **completion_parser_kwargs)
+        self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
         self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)

     @staticmethod
@@ -133,45 +119,35 @@ class AutoReviewerGpt4(BaseReviewer):
         # Default to general category (idx 0)
         target_prompt_dict = prompts_list[0]
         for item in prompts_list:
-            is_category_match = category in item['category'] if isinstance(
-                item['category'], list) else item['category'] == category
+            is_category_match = category in item['category'] if isinstance(item['category'],
+                                                                           list) else item['category'] == category
             is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
             if is_category_match and is_type_match:
                 target_prompt_dict = item
                 break
-            elif is_type_match and target_prompt_dict.get('type',
-                                                          ArenaMode.PAIRWISE) != type:
+            elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
                 target_prompt_dict = item  # fallback to type match

         sys_prompt = target_prompt_dict['system_prompt']
         prompt_template = target_prompt_dict['prompt_template']
         defaults = target_prompt_dict.get('defaults', dict({}))
-        output_format = target_prompt_dict.get('output_format',
-                                               '[[rating_a,rating_b]]')
+        output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')

         if type == ArenaMode.SINGLE:
-            user_prompt = prompt_template.format(
-                question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
+            user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
         else:
             user_prompt = prompt_template.format(
-                question=ques,
-                answer_a=ans1,
-                answer_b=ans2,
-                ref_answer_1=ans_ref,
-                **defaults)
+                question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)

         return sys_prompt, user_prompt, output_format

     def get_review_cache(self, model_a, model_b, question) -> list:
         if model_b:
-            cache_hit = next(
-                (r for r in self.cache_list if r['model_a'] == model_a
-                 and r['model_b'] == model_b and r['question'] == question),
-                None)
+            cache_hit = next((r for r in self.cache_list
+                              if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
+                             None)
         else:
-            cache_hit = next(
-                (r for r in self.cache_list
-                 if r['model'] == model_a and r['question'] == question), None)
+            cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
         return cache_hit

     def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
@@ -265,12 +241,10 @@ class AutoReviewerGpt4(BaseReviewer):
         return review_result

     def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
-        input_msg = dict(
-            ques=question, category=category, ans1=ans1, ans2=ans2)
+        input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)

         if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list
-                            if ref.get('text') == question), None)
+            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
             assert ans_ref['answer']
             input_msg['ans_ref'] = ans_ref['answer']

@@ -284,8 +258,7 @@ class AutoReviewerGpt4(BaseReviewer):
         else:
             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)

-        result = self.fn_completion_parser(
-            review_text, output_format=output_format)
+        result = self.fn_completion_parser(review_text, output_format=output_format)
         if not isinstance(result, tuple):
             result = (result, None)
         return review_text, *result
@@ -294,8 +267,7 @@ class AutoReviewerGpt4(BaseReviewer):
         input_msg = dict(ques=question, category=category, ans1=answer)

         if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list
-                            if ref.get('text') == question), None)
+            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
             assert ans_ref['answer']
             input_msg['ans_ref'] = ans_ref['answer']

@@ -312,8 +284,7 @@ class AutoReviewerGpt4(BaseReviewer):
         score = self.fn_completion_parser(review_text, output_format)
         return review_text, score

-    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str,
-                                       output_format) -> str:
+    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
         logger.info('Get dummy scores for input prompt ...')
         if output_format == '[[rating]]':
             return f'[[{round(random.random(), 2)}]]'
@@ -359,8 +330,7 @@ class AutoReviewerGpt4(BaseReviewer):
         if self.review_mode == ArenaMode.PAIRWISE:
             battle_pairs = get_battle_pairs(merged_ans_df.columns)
         elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-            battle_pairs = get_battle_pairs(merged_ans_df.columns,
-                                            self.baseline_idx)
+            battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
         elif self.review_mode == ArenaMode.SINGLE:
             battle_pairs = [(col, ) for col in merged_ans_df.columns]
         else:
@@ -373,14 +343,12 @@ class AutoReviewerGpt4(BaseReviewer):
             pair_df.columns = ['output_1', 'output_2']
             pair_df['is_switched_outputs'] = pair_df.apply(
                 lambda x: random_seeded_choice(
-                    seed='is_switched_outputs' + x[0]['text'] + str(
-                        self.random_seed),
+                    seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
                     choices=[False, True],
                 ),
                 axis=1,
             )
-            pair_df = shuffle_pairwise_preferences(
-                pair_df, pair_df['is_switched_outputs'])
+            pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])

         for index, row in pair_df.iterrows():
             row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
@@ -395,17 +363,21 @@ if __name__ == '__main__':

     work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
     prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
-    answer_file_list = [
-
+    answer_file_list = [
+        os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
+        os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
+    ]
     review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')

-    input_kwargs = dict(
-
-
-
-
-
-
+    input_kwargs = dict(
+        prompt_file=prompt_template_path,
+        answer_file_list=answer_file_list,
+        review_result_file=review_result_file_path,
+        reviewer_args={},
+        baseline_file='',
+        reference_file='',
+        cache_file='',
+    )

     auto_reviewer = AutoReviewerGpt4(**input_kwargs)
     auto_reviewer.run(dry_run=True)

evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED

@@ -29,16 +29,17 @@ In these examples settings.xml lists input files and formats.
 """

 from __future__ import absolute_import, division, print_function
-import collections
-import re
-import os

+import collections
 import nltk
 import numpy as np
+import os
+import re
 import six
 from absl import logging
 from rouge_score import scoring, tokenizers
 from six.moves import map, range
+
 from evalscope.utils import get_logger

 logger = get_logger()
@@ -81,11 +82,7 @@ class RougeScorer(scoring.BaseScorer):
       ...                       'The quick brown dog jumps on the log.')
   """

-  def __init__(self,
-               rouge_types,
-               use_stemmer=False,
-               split_summaries=False,
-               tokenizer=None):
+  def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):

     self.rouge_types = rouge_types
     if tokenizer:
@@ -160,21 +157,15 @@ class RougeScorer(scoring.BaseScorer):
          sents = [x for x in sents if len(x)]
          return sents

-        target_tokens_list = [
-            self._tokenizer.tokenize(s) for s in get_sents(target)
-        ]
-        prediction_tokens_list = [
-            self._tokenizer.tokenize(s) for s in get_sents(prediction)
-        ]
+        target_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(target)]
+        prediction_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(prediction)]

-        scores = _summary_level_lcs(target_tokens_list,
-                                    prediction_tokens_list)
+        scores = _summary_level_lcs(target_tokens_list, prediction_tokens_list)
      elif re.match(r'rouge[0-9]$', six.ensure_str(rouge_type)):
        # Rouge from n-grams.
        n = int(rouge_type[5:])
        if n <= 0:
-          raise ValueError('rougen requires positive n: %s'
-                           % rouge_type)
+          raise ValueError('rougen requires positive n: %s' % rouge_type)
        target_ngrams = _create_ngrams(target_tokens, n)
        prediction_ngrams = _create_ngrams(prediction_tokens, n)
        scores = _score_ngrams(target_ngrams, prediction_ngrams)
@@ -349,8 +340,7 @@ def _score_ngrams(target_ngrams, prediction_ngrams):

  intersection_ngrams_count = 0
  for ngram in six.iterkeys(target_ngrams):
-    intersection_ngrams_count += min(target_ngrams[ngram],
-                                     prediction_ngrams[ngram])
+    intersection_ngrams_count += min(target_ngrams[ngram], prediction_ngrams[ngram])
  target_ngrams_count = sum(target_ngrams.values())
  prediction_ngrams_count = sum(prediction_ngrams.values())

evalscope/metrics/code_metric.py
CHANGED

@@ -4,7 +4,6 @@ import inspect
 import re
 import signal
 from collections import defaultdict
-
 from tqdm import tqdm


@@ -20,8 +19,7 @@ def check_input(text, arg):
     code_block = code_block_pattern.search(text)
     code_string = code_block.group(1)

-    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(',
-                                       re.DOTALL)
+    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(', re.DOTALL)
     function_name_block = function_name_pattern.search(code_string)
     function_name = function_name_block.group(1)

@@ -52,9 +50,7 @@ def exec_func(func, arr):


 def compute_pass_k_one_sample(predict, func_args, func_outputs, k=4):
-    assert len(
-        predict
-    ) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
+    assert len(predict) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
     for predict_i in predict[:k]:
         try:
             for arg, gold in zip(func_args, func_outputs):
@@ -87,9 +83,7 @@ def compute_pass_k(predict_l, reference_l, func_args_l, k=4, lang='py'):
 def run_code_eval(data_l, k=4, md_level=2):
     print(f"{'#' * md_level} Code Eval(pass@{k})")
     for data in tqdm(data_l):
-        data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'],
-                                                      data['func_args'],
-                                                      data['func_outputs'], k)
+        data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'], data['func_args'], data['func_outputs'], k)
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:

evalscope/metrics/math_accuracy.py
CHANGED

@@ -2,7 +2,6 @@

 import re
 from collections import defaultdict
-
 from tqdm import tqdm

 from evalscope.constants import MetricsConstant
@@ -44,8 +43,7 @@ def compute_math_accuracy(predict_l, reference_l):
 def run_math_eval(data_l, md_level=2):
     print(f"{'#' * md_level} Math Eval(math accuracy)")
     for data in tqdm(data_l):
-        data['math_accuracy'] = compute_math_accuracy_one_sample(
-            data['gen'], data['target'])
+        data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
@@ -54,7 +52,6 @@ def run_math_eval(data_l, md_level=2):
     print(f'[total], count: {len(data_l)}, math accuracy: '
           f'{correct_cnt / len(data_l) * 100:0.2f}%')
     for task in task_data_d.keys():
-        correct_cnt = sum(
-            [data['math_accuracy'] for data in task_data_d[task]])
+        correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
         print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
-              f'{correct_cnt/len(task_data_d[task])*100:0.2f}%')
+              f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')

evalscope/metrics/metrics.py
CHANGED

@@ -2,18 +2,17 @@
 # Copyright (c) EleutherAI. and its affiliates.
 # Copyright (c) OpenAI. and its affiliates.
 import itertools
-import math
-from collections.abc import Iterable
-from collections import defaultdict
-from typing import Dict, List, Union
-from nltk.translate.bleu_score import sentence_bleu
-from nltk import word_tokenize
 import jieba
-
+import math
 import numpy as np
+import random
 import sacrebleu
 import sklearn.metrics
-import
+from collections import defaultdict
+from collections.abc import Iterable
+from nltk import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from typing import Dict, List, Union


 def mean(arr):
@@ -22,12 +21,12 @@ def mean(arr):

 def pop_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu)
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / len(arr))


 def sample_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu)
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / (len(arr) - 1))


 def mean_stderr(arr):
@@ -134,13 +133,14 @@ def bleu(items):
     refs, preds = _sacreformat(refs, preds)
     return sacrebleu.corpus_bleu(preds, refs).score

+
 def bleu_ngram_one_sample(predict, reference):
     """
     Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores

     Args:
         items: [(ref, pred)]
-
+
     Returns:
         {
             'bleu-1': 0.8,
@@ -150,6 +150,7 @@ def bleu_ngram_one_sample(predict, reference):
         }

     """
+
     def is_contains_chinese(strs):
         for _char in strs:
             if '\u4e00' <= _char <= '\u9fa5':
@@ -230,6 +231,7 @@ def _sacreformat(refs, preds):


 class _bootstrap_internal:
+
     def __init__(self, f, n):
         self.f = f
         self.n = n
@@ -260,11 +262,11 @@ def bootstrap_stderr(f, xs, iters):

     print('bootstrapping for stddev:', f.__name__)
     for bootstrap in tqdm(
-
-
-
-
-
+            pool.imap(
+                _bootstrap_internal(f, chunk_size),
+                [(i, xs) for i in range(iters // chunk_size)],
+            ),
+            total=iters // chunk_size,
     ):
         # sample w replacement
         res.extend(bootstrap)
@@ -372,11 +374,9 @@ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[s
     return score / len(question_answers)


-def calculate_pass_at_k(
-        num_samples: Union[int, List[int], np.ndarray],
-        num_correct: Union[List[int], np.ndarray],
-        k: int = 1
-) -> np.ndarray:
+def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
+                        num_correct: Union[List[int], np.ndarray],
+                        k: int = 1) -> np.ndarray:
     """
     Estimates pass@k of each problem and returns them in an array.
     Examples:

evalscope/metrics/rouge_metric.py
CHANGED

@@ -1,18 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import jieba
 import logging
 from collections import defaultdict
 from pathlib import Path
+from rouge_chinese import Rouge
 from statistics import mean
-
 from tqdm import tqdm

 from evalscope.constants import MetricsConstant
 from evalscope.metrics.bundled_rouge_score import rouge_scorer

-from rouge_chinese import Rouge
-import jieba
-


 class DummyTokenizer:
@@ -24,9 +22,7 @@ HERE = Path(__file__).absolute().parent

 logger = logging.getLogger(__name__)

-scorer = rouge_scorer.RougeScorer(
-    ['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer()
-)
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
 zh_scorer = Rouge()


@@ -52,11 +48,7 @@ def compute_rouge_score(predict_l, reference_l):
             result[rouge_key].append(one_sample[rouge_key])
     rlt = {}
     for rouge_key in MetricsConstant.ROUGE_KEYS:
-        rlt[rouge_key] = (
-            mean(result[rouge_key]) * 100
-            if rouge_key in result
-            else MetricsConstant.INVALID_VALUE
-        )
+        rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
     return rlt


@@ -111,9 +103,9 @@ def _to_table(final_result) -> str:
         if not task:
             continue
         elif task == 'total':
-            row.append(f'{final_result["total"]["rouge"][rouge_key]
+            row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
         else:
-            row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]
+            row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
         table.append('\t'.join(row))

     return '\n'.join(table)
@@ -122,23 +114,17 @@ def _to_table(final_result) -> str:
 def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
     print(f"{'#' * md_level} Rouge Eval")
     for data in tqdm(data_l):
-        data['rouge'] = compute_rouge_score_one_sample(
-            data['gen_tok_str'], data['reference_tok_str']
-        )
+        data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
             task_data_d[task].append(data)

     total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
-    print(
-        f'[total], count: {len(data_l)}, {report_metric_key}: '
-        f'{total_rouge * 100:0.2f}%'
-    )
+    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
+          f'{total_rouge * 100:0.2f}%')

     for task, task_data in task_data_d.items():
         task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
-        print(
-            f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
-            f'{task_rouge * 100:0.2f}%'
-        )
+        print(f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
+              f'{task_rouge * 100:0.2f}%')

evalscope/models/__init__.py
CHANGED
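For reference, the `evalscope/metrics/metrics.py` hunks earlier in this diff only reflow the signature of `calculate_pass_at_k(num_samples, num_correct, k=1) -> np.ndarray`, whose docstring says it estimates pass@k per problem. A hedged usage sketch against that signature follows; the import path and the sample counts are assumptions for illustration, not part of this diff.

```python
import numpy as np

# Assumed import path: the function is defined in evalscope/metrics/metrics.py.
from evalscope.metrics.metrics import calculate_pass_at_k

# Hypothetical data: 4 problems, 10 generations each, with 0/2/5/10 passing runs.
num_samples = [10, 10, 10, 10]
num_correct = [0, 2, 5, 10]

# Returns one pass@1 estimate per problem, per the docstring shown in the hunk.
pass_at_1 = calculate_pass_at_k(num_samples, num_correct, k=1)
print(np.round(pass_at_1, 3), pass_at_1.mean())
```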