evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/evaluator/humaneval_evaluator.py
ADDED
@@ -0,0 +1,158 @@
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List, Optional
+
+from evalscope.constants import OutputsStructure
+from evalscope.evaluator.evaluator import logger
+from evalscope.models.model_adapter import BaseModelAdapter
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+
+
+class HumanevalEvaluator(object):
+
+    def __init__(
+        self,
+        problem_file: str,
+        model_id: str,
+        model_revision: str,
+        model_adapter: BaseModelAdapter,
+        outputs: Optional[OutputsStructure] = None,
+        k: List[int] = [1, 10, 100],
+        n_workers: int = 4,
+        timeout: float = 3.0,
+    ):
+        try:
+            from human_eval.data import read_problems, write_jsonl
+            from human_eval.evaluation import evaluate_functional_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        self.problem_file = problem_file
+        self.k = k
+        self.num_workers = n_workers
+        self.timeout = timeout
+        self.model_adapter = model_adapter
+
+        self.read_problems_func = read_problems
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = evaluate_functional_correctness
+
+        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
+        self.problems = self.read_problems_func(self.problem_file)
+
+        # Deal with the output paths
+        self.outputs_structure = OutputsStructure(outputs)
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = 'Complete the following python code:\n'
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+            # pred_res: dict = self.model_adapter.predict(inputs)
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, results: dict) -> dict:
+        """
+        Generate report from evaluation results.
+
+        Returns:
+        {
+            "name":"ARC-Challenge",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"DEFAULT",
+                    "score":0.3389,
+                    "subset":[
+                        {
+                            "name":"ARC-Challenge",
+                            "score":0.3389
+                        },
+                    ]
+                }
+            ],
+            "total_num":100
+        }
+        """
+        results = {k: normalize_score(score=v) for k, v in results.items()}
+
+        category_d = dict(name='DEFAULT', score=results, subset=[])
+
+        res_map = dict(
+            name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
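The evaluator added above drives the whole HumanEval flow: it prompts the model to complete each problem, strips the reply down to a bare function body with _postprocess, writes a predictions JSONL, and scores it with human_eval's evaluate_functional_correctness to get pass@k. A minimal usage sketch follows; the stub adapter, identifiers and paths are illustrative assumptions rather than part of this release, and running it requires the human_eval package.

from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator


class StubAdapter:
    # Hypothetical adapter: anything exposing predict(inputs=..., infer_cfg=...) and
    # returning an OpenAI-style chat payload satisfies the contract used by get_answers().

    def predict(self, inputs: dict, infer_cfg: dict) -> dict:
        return {'choices': [{'message': {'content': '    return None\n'}}]}


evaluator = HumanevalEvaluator(
    problem_file='HumanEval.jsonl.gz',  # placeholder path to the human_eval problem set
    model_id='my-model',                # placeholder identifiers
    model_revision='v1.0.0',
    model_adapter=StubAdapter(),
    k=[1],                              # report pass@1 only
    n_workers=4,
    timeout=3.0,
)
evaluator.eval(infer_cfg={'max_new_tokens': 512, 'temperature': 0.0})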
evalscope/evaluator/rating_eval.py
CHANGED
@@ -1,24 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from typing import List, Union
-
 import pandas as pd
 import pyarrow as pa
+from typing import List, Union
 
 from evalscope.constants import MetricMembers
+from evalscope.utils import jsonl_to_list
 from evalscope.utils.arena_utils import compute_elo
 from evalscope.utils.logger import get_logger
-from evalscope.utils import jsonl_to_list
 
 logger = get_logger()
 
-DEFAULT_COLUMNS_MAPPING = {
-    'model_a': 'model_a',
-    'model_b': 'model_b',
-    'win': 'win',
-    'tstamp': 'ts',
-    'language': 'lang'
-}
+DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}
 
 
 class RatingEvaluate(object):
@@ -41,10 +34,9 @@ class RatingEvaluate(object):
         elo_ratings = compute_elo(battles)
         col_model = 'Model'
         col_elo_rating = 'Elo_Rating'
-        elo_ratings_res = pd.DataFrame(
-            [[n, elo_ratings[n]] for n in elo_ratings.keys()],
-            columns=[col_model, col_elo_rating]).sort_values(
-            col_elo_rating, ascending=False).reset_index(drop=True)
+        elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
+                                       columns=[col_model, col_elo_rating]).sort_values(
+                                           col_elo_rating, ascending=False).reset_index(drop=True)
         elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
         return elo_ratings_res
 
@@ -89,23 +81,11 @@ class RatingEvaluate(object):
                 'tie': 1
             }]
         else:
-            return [{
-                'model': winner,
-                'win': 1,
-                'loss': 0,
-                'tie': 0
-            }, {
-                'model': loser,
-                'win': 0,
-                'loss': 1,
-                'tie': 0
-            }]
+            return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]
 
     def compute_pairwise_rating(self, raw_data):
         df_all = self.preprocess(raw_data_df=raw_data)
-        model_list = (
-            df_all['model_a'].unique().tolist()
-            + df_all['model_b'].unique().tolist())
+        model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
         model_list = list(set(model_list))
 
         list_res = []
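For context, get_single_pairwise_rating above emits one win/loss/tie record per model per battle. The snippet below is an illustrative aggregation (not part of this diff) that turns such records into per-model win rates with pandas, counting a tie as half a win.

import pandas as pd

records = [
    {'model': 'model_a', 'win': 1, 'loss': 0, 'tie': 0},
    {'model': 'model_b', 'win': 0, 'loss': 1, 'tie': 0},
    {'model': 'model_a', 'win': 0, 'loss': 0, 'tie': 1},
    {'model': 'model_b', 'win': 0, 'loss': 0, 'tie': 1},
]

# Sum the per-battle counters for each model, then derive a win rate.
df = pd.DataFrame(records).groupby('model').sum()
df['win_rate'] = (df['win'] + 0.5 * df['tie']) / (df['win'] + df['loss'] + df['tie'])
print(df.sort_values('win_rate', ascending=False))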
@@ -114,8 +94,7 @@ class RatingEvaluate(object):
             if self.baseline_model is not None:
                 if self.baseline_model not in [row['model_a'], row['model_b']]:
                     logger.warning(
-                        f'One of the models in the battle should be the baseline model: {self.baseline_model}'
-                    )
+                        f'One of the models in the battle should be the baseline model: {self.baseline_model}')
                     continue
             rating = self.get_single_pairwise_rating(row)
             list_res = list_res + rating
@@ -149,15 +128,15 @@ class RatingEvaluate(object):
 
         for metric in self.metrics:
 
-            if metric == MetricMembers.ELO
+            if metric == MetricMembers.ELO:
                 res = self.compute_elo_rating(raw_data)
                 res_all.append(res)
 
-            elif metric == MetricMembers.PAIRWISE
+            elif metric == MetricMembers.PAIRWISE:
                 res = self.compute_pairwise_rating(raw_data)
                 res_all.append(res)
 
-            elif metric == MetricMembers.SCORE
+            elif metric == MetricMembers.SCORE:
                 res = self.compute_score_rating(raw_data)
                 res_all.append(res)
 
evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED
@@ -2,6 +2,7 @@
 # flake8: noqa
 
 import os
+import pandas as pd
 import random
 import sys
 import time
@@ -9,15 +10,10 @@ from abc import ABC, abstractmethod
 from functools import partial
 from typing import Any, List
 
-import pandas as pd
-
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers
-from evalscope.utils.arena_utils import (get_battle_pairs,
-                                         merge_ques_ans,
-                                         shuffle_pairwise_preferences)
-from evalscope.utils import dump_jsonl_data, jsonl_to_list, random_seeded_choice
+from evalscope.utils import completion_parsers, dump_jsonl_data, jsonl_to_list, random_seeded_choice
+from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -33,8 +29,7 @@ class BaseReviewer(ABC):
         """
         Run pairwise battles with given models.
         """
-        raise NotImplementedError(
-            'run() method must be implemented in your subclass.')
+        raise NotImplementedError('run() method must be implemented in your subclass.')
 
 
 class AutoReviewerGpt4(BaseReviewer):
@@ -71,13 +66,9 @@ class AutoReviewerGpt4(BaseReviewer):
 
         self.review_result_file = review_result_file
         self.prompt_list = jsonl_to_list(prompt_file)
-        self.answer_list = [
-            jsonl_to_list(answer_file) for answer_file in answer_file_list
-        ]
-        self.reference_list = jsonl_to_list(
-            reference_file) if reference_file else []
-        self.cache_list = jsonl_to_list(
-            cache_file) if cache_file and os.path.isfile(cache_file) else []
+        self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
+        self.reference_list = jsonl_to_list(reference_file) if reference_file else []
+        self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []
 
         self.reviewer_args = reviewer_args if reviewer_args \
             else self._get_default_args()
@@ -88,24 +79,18 @@ class AutoReviewerGpt4(BaseReviewer):
             self.answer_list.append(jsonl_to_list(baseline_file))
             self.baseline_idx = len(self.answer_list) - 1
 
-        self.position_bias_mitigation = self.reviewer_args.pop(
-            EvalConfigKeys.POSITION_BIAS_MITIGATION,
-            PositionBiasMitigation.NONE)
+        self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
+                                                               PositionBiasMitigation.NONE)
         if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-            self.random_seed = self.reviewer_args.pop(
-                EvalConfigKeys.RANDOM_SEED, 123)
-
-        fn_completion_parser = self.reviewer_args.pop(
-            EvalConfigKeys.FN_COMPLETION_PARSER,
-            FnCompletionParser.LMSYS_PARSER)
-        completion_parser_kwargs = self.reviewer_args.pop(
-            EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
+            self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
+
+        fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
+                                                      FnCompletionParser.LMSYS_PARSER)
+        completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
         if isinstance(fn_completion_parser, str):
-            fn_completion_parser = getattr(completion_parsers,
-                                           fn_completion_parser)
+            fn_completion_parser = getattr(completion_parsers, fn_completion_parser)
 
-        self.fn_completion_parser = partial(fn_completion_parser,
-                                            **completion_parser_kwargs)
+        self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
         self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)
 
     @staticmethod
@@ -133,45 +118,35 @@ class AutoReviewerGpt4(BaseReviewer):
         # Default to general category (idx 0)
         target_prompt_dict = prompts_list[0]
         for item in prompts_list:
-            is_category_match = category in item['category'] if isinstance(
-                item['category'], list) else item['category'] == category
+            is_category_match = category in item['category'] if isinstance(item['category'],
+                                                                           list) else item['category'] == category
             is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
             if is_category_match and is_type_match:
                 target_prompt_dict = item
                 break
-            elif is_type_match and target_prompt_dict.get('type',
-                                                          ArenaMode.PAIRWISE) != type:
+            elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
                 target_prompt_dict = item  # fallback to type match
 
         sys_prompt = target_prompt_dict['system_prompt']
         prompt_template = target_prompt_dict['prompt_template']
         defaults = target_prompt_dict.get('defaults', dict({}))
-        output_format = target_prompt_dict.get('output_format',
-                                               '[[rating_a,rating_b]]')
+        output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')
 
         if type == ArenaMode.SINGLE:
-            user_prompt = prompt_template.format(
-                question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
+            user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
         else:
             user_prompt = prompt_template.format(
-                question=ques,
-                answer_a=ans1,
-                answer_b=ans2,
-                ref_answer_1=ans_ref,
-                **defaults)
+                question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)
 
         return sys_prompt, user_prompt, output_format
 
     def get_review_cache(self, model_a, model_b, question) -> list:
         if model_b:
-            cache_hit = next(
-                (r for r in self.cache_list
-                 if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
-                None)
+            cache_hit = next((r for r in self.cache_list
+                              if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
+                             None)
         else:
-            cache_hit = next(
-                (r for r in self.cache_list
-                 if r['model'] == model_a and r['question'] == question), None)
+            cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
         return cache_hit
 
     def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
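The hunk above asks the judge for a verdict in the template's output_format (default '[[rating_a,rating_b]]'); the reply is later handed to the configured completion parser (FnCompletionParser.LMSYS_PARSER unless overridden). Below is a stand-in sketch of such a parser, for illustration only and not the evalscope implementation.

import re


def parse_pairwise_rating(review_text: str):
    # Extract two numeric ratings from a '[[rating_a,rating_b]]' style verdict.
    match = re.search(r'\[\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]\]', review_text)
    if match is None:
        return None, None
    return float(match.group(1)), float(match.group(2))


print(parse_pairwise_rating('Assistant A is more helpful. Verdict: [[8, 6]]'))  # -> (8.0, 6.0)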
@@ -265,12 +240,10 @@ class AutoReviewerGpt4(BaseReviewer):
         return review_result
 
     def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
-        input_msg = dict(
-            ques=question, category=category, ans1=ans1, ans2=ans2)
+        input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
         if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list
-                            if ref.get('text') == question), None)
+            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
             assert ans_ref['answer']
             input_msg['ans_ref'] = ans_ref['answer']
 
@@ -284,8 +257,7 @@ class AutoReviewerGpt4(BaseReviewer):
         else:
             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
 
-        result = self.fn_completion_parser(
-            review_text, output_format=output_format)
+        result = self.fn_completion_parser(review_text, output_format=output_format)
         if not isinstance(result, tuple):
             result = (result, None)
         return review_text, *result
@@ -294,8 +266,7 @@ class AutoReviewerGpt4(BaseReviewer):
         input_msg = dict(ques=question, category=category, ans1=answer)
 
         if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list
-                            if ref.get('text') == question), None)
+            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
             assert ans_ref['answer']
             input_msg['ans_ref'] = ans_ref['answer']
 
@@ -312,8 +283,7 @@ class AutoReviewerGpt4(BaseReviewer):
         score = self.fn_completion_parser(review_text, output_format)
         return review_text, score
 
-    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str,
-                                       output_format) -> str:
+    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
         logger.info('Get dummy scores for input prompt ...')
         if output_format == '[[rating]]':
             return f'[[{round(random.random(), 2)}]]'
@@ -359,8 +329,7 @@ class AutoReviewerGpt4(BaseReviewer):
         if self.review_mode == ArenaMode.PAIRWISE:
             battle_pairs = get_battle_pairs(merged_ans_df.columns)
         elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-            battle_pairs = get_battle_pairs(merged_ans_df.columns,
-                                            self.baseline_idx)
+            battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
         elif self.review_mode == ArenaMode.SINGLE:
             battle_pairs = [(col, ) for col in merged_ans_df.columns]
         else:
@@ -373,14 +342,12 @@ class AutoReviewerGpt4(BaseReviewer):
             pair_df.columns = ['output_1', 'output_2']
             pair_df['is_switched_outputs'] = pair_df.apply(
                 lambda x: random_seeded_choice(
-                    seed='is_switched_outputs' + x[0]['text'] + str(
-                        self.random_seed),
+                    seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
                     choices=[False, True],
                 ),
                 axis=1,
             )
-            pair_df = shuffle_pairwise_preferences(
-                pair_df, pair_df['is_switched_outputs'])
+            pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])
 
         for index, row in pair_df.iterrows():
             row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
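In the RANDOMIZE_ORDER branch above, whether a pair of answers is swapped is decided by a coin flip derived deterministically from the question text and the configured seed, so reruns shuffle identically while the judge still sees both orders across questions. The sketch below illustrates such a seeded flip; the hashing scheme is an assumption, not evalscope's random_seeded_choice.

import hashlib


def seeded_choice(seed: str, choices):
    # Pick an element of `choices` deterministically from a string seed.
    digest = int(hashlib.sha256(seed.encode('utf-8')).hexdigest(), 16)
    return choices[digest % len(choices)]


question = 'What is the capital of France?'
random_seed = 123
swap = seeded_choice('is_switched_outputs' + question + str(random_seed), [False, True])

answer_a, answer_b = 'Paris.', 'Lyon.'
if swap:  # present the answers to the judge in swapped order
    answer_a, answer_b = answer_b, answer_a
print(swap, answer_a, answer_b)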
@@ -395,17 +362,21 @@ if __name__ == '__main__':
 
     work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
     prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
-    answer_file_list = [
-
+    answer_file_list = [
+        os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
+        os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
+    ]
     review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')
 
-    input_kwargs = dict(
-
-
-
-
-
-
+    input_kwargs = dict(
+        prompt_file=prompt_template_path,
+        answer_file_list=answer_file_list,
+        review_result_file=review_result_file_path,
+        reviewer_args={},
+        baseline_file='',
+        reference_file='',
+        cache_file='',
+    )
 
     auto_reviewer = AutoReviewerGpt4(**input_kwargs)
     auto_reviewer.run(dry_run=True)
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED
@@ -29,16 +29,17 @@ In these examples settings.xml lists input files and formats.
 """
 
 from __future__ import absolute_import, division, print_function
-import collections
-import re
-import os
 
+import collections
 import nltk
 import numpy as np
+import os
+import re
 import six
 from absl import logging
 from rouge_score import scoring, tokenizers
 from six.moves import map, range
+
 from evalscope.utils import get_logger
 
 logger = get_logger()
@@ -81,11 +82,7 @@ class RougeScorer(scoring.BaseScorer):
      ...                       'The quick brown dog jumps on the log.')
   """
 
-  def __init__(self,
-               rouge_types,
-               use_stemmer=False,
-               split_summaries=False,
-               tokenizer=None):
+  def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
 
     self.rouge_types = rouge_types
     if tokenizer:
@@ -160,21 +157,15 @@ class RougeScorer(scoring.BaseScorer):
        sents = [x for x in sents if len(x)]
        return sents
 
-      target_tokens_list = [
-          self._tokenizer.tokenize(s) for s in get_sents(target)
-      ]
-      prediction_tokens_list = [
-          self._tokenizer.tokenize(s) for s in get_sents(prediction)
-      ]
+      target_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(target)]
+      prediction_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(prediction)]
 
-      scores = _summary_level_lcs(target_tokens_list,
-                                  prediction_tokens_list)
+      scores = _summary_level_lcs(target_tokens_list, prediction_tokens_list)
     elif re.match(r'rouge[0-9]$', six.ensure_str(rouge_type)):
       # Rouge from n-grams.
       n = int(rouge_type[5:])
       if n <= 0:
-        raise ValueError('rougen requires positive n: %s'
-                         % rouge_type)
+        raise ValueError('rougen requires positive n: %s' % rouge_type)
       target_ngrams = _create_ngrams(target_tokens, n)
       prediction_ngrams = _create_ngrams(prediction_tokens, n)
       scores = _score_ngrams(target_ngrams, prediction_ngrams)
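For the rouge[0-9] branch above, _score_ngrams (next hunk) counts clipped n-gram overlap; precision divides that overlap by the prediction's n-gram count, recall by the target's, and the reported score is their harmonic mean. A compact illustrative sketch of that arithmetic, not the bundled implementation:

import collections


def ngrams(tokens, n):
    # Multiset of n-grams as a Counter of token tuples.
    return collections.Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def rouge_n(target_tokens, prediction_tokens, n=2):
    target, prediction = ngrams(target_tokens, n), ngrams(prediction_tokens, n)
    overlap = sum(min(count, prediction[gram]) for gram, count in target.items())
    precision = overlap / max(sum(prediction.values()), 1)
    recall = overlap / max(sum(target.values()), 1)
    f1 = 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)
    return precision, recall, f1


print(rouge_n('the quick brown fox jumps'.split(), 'the quick brown dog jumps'.split(), n=2))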
@@ -349,8 +340,7 @@ def _score_ngrams(target_ngrams, prediction_ngrams):
 
   intersection_ngrams_count = 0
   for ngram in six.iterkeys(target_ngrams):
-    intersection_ngrams_count += min(target_ngrams[ngram],
-                                     prediction_ngrams[ngram])
+    intersection_ngrams_count += min(target_ngrams[ngram], prediction_ngrams[ngram])
   target_ngrams_count = sum(target_ngrams.values())
   prediction_ngrams_count = sum(prediction_ngrams.values())
 
evalscope/metrics/code_metric.py
CHANGED
@@ -4,7 +4,6 @@ import inspect
 import re
 import signal
 from collections import defaultdict
-
 from tqdm import tqdm
 
 
@@ -20,8 +19,7 @@ def check_input(text, arg):
     code_block = code_block_pattern.search(text)
     code_string = code_block.group(1)
 
-    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(',
-                                       re.DOTALL)
+    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(', re.DOTALL)
     function_name_block = function_name_pattern.search(code_string)
     function_name = function_name_block.group(1)
 
@@ -52,9 +50,7 @@ def exec_func(func, arr):
 
 
 def compute_pass_k_one_sample(predict, func_args, func_outputs, k=4):
-    assert len(
-        predict
-    ) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
+    assert len(predict) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
    for predict_i in predict[:k]:
        try:
            for arg, gold in zip(func_args, func_outputs):
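compute_pass_k_one_sample above counts a problem as solved at k when any of its first k generations passes every provided input/output pair, which is what the assert guards. A tiny sketch of that notion on made-up data:

def pass_at_k(per_candidate_results, k):
    # per_candidate_results: one boolean per generated candidate, True if it passed all tests.
    assert len(per_candidate_results) >= k, f'pass@{k} needs at least {k} candidates'
    return any(per_candidate_results[:k])


results_per_problem = [[True, False], [False, False], [False, True]]
k = 2
score = sum(pass_at_k(r, k) for r in results_per_problem) / len(results_per_problem)
print(f'pass@{k} = {score:.2f}')  # 2 of 3 problems solved -> 0.67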
@@ -87,9 +83,7 @@ def compute_pass_k(predict_l, reference_l, func_args_l, k=4, lang='py'):
 def run_code_eval(data_l, k=4, md_level=2):
     print(f"{'#' * md_level} Code Eval(pass@{k})")
     for data in tqdm(data_l):
-        data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'],
-                                                      data['func_args'],
-                                                      data['func_outputs'], k)
+        data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'], data['func_args'], data['func_outputs'], k)
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
evalscope/metrics/math_accuracy.py
CHANGED
@@ -2,7 +2,6 @@
 
 import re
 from collections import defaultdict
-
 from tqdm import tqdm
 
 from evalscope.constants import MetricsConstant
@@ -44,8 +43,7 @@ def compute_math_accuracy(predict_l, reference_l):
 def run_math_eval(data_l, md_level=2):
     print(f"{'#' * md_level} Math Eval(math accuracy)")
     for data in tqdm(data_l):
-        data['math_accuracy'] = compute_math_accuracy_one_sample(
-            data['gen'], data['target'])
+        data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
@@ -54,7 +52,6 @@ def run_math_eval(data_l, md_level=2):
     print(f'[total], count: {len(data_l)}, math accuracy: '
           f'{correct_cnt / len(data_l) * 100:0.2f}%')
     for task in task_data_d.keys():
-        correct_cnt = sum(
-            [data['math_accuracy'] for data in task_data_d[task]])
+        correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
         print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
-              f'{correct_cnt/len(task_data_d[task])*100:0.2f}%')
+              f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')
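run_math_eval above attaches a per-sample math_accuracy flag and then reports accuracy overall and per task tag. The snippet below mirrors that aggregation on made-up records; the exact-match check is a stand-in, since compute_math_accuracy_one_sample's matching rules are not shown in this hunk.

from collections import defaultdict

data_l = [
    {'gen': '42', 'target': '42', 'task_tags': ['algebra']},
    {'gen': '3.14', 'target': '2.72', 'task_tags': ['algebra', 'geometry']},
]

# Stand-in accuracy check: exact string match between generation and target.
for data in data_l:
    data['math_accuracy'] = int(data['gen'].strip() == data['target'].strip())

task_data_d = defaultdict(list)
for data in data_l:
    for task in data['task_tags']:
        task_data_d[task].append(data)

total = sum(d['math_accuracy'] for d in data_l)
print(f'[total], count: {len(data_l)}, math accuracy: {total / len(data_l) * 100:0.2f}%')
for task, items in task_data_d.items():
    correct = sum(d['math_accuracy'] for d in items)
    print(f'[{task}], count: {len(items)}, math accuracy: {correct / len(items) * 100:0.2f}%')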