evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/evaluator/humaneval_evaluator.py
ADDED
@@ -0,0 +1,158 @@
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List, Optional
+
+from evalscope.constants import OutputsStructure
+from evalscope.evaluator.evaluator import logger
+from evalscope.models.model_adapter import BaseModelAdapter
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+
+
+class HumanevalEvaluator(object):
+
+    def __init__(
+        self,
+        problem_file: str,
+        model_id: str,
+        model_revision: str,
+        model_adapter: BaseModelAdapter,
+        outputs: Optional[OutputsStructure] = None,
+        k: List[int] = [1, 10, 100],
+        n_workers: int = 4,
+        timeout: float = 3.0,
+    ):
+        try:
+            from human_eval.data import read_problems, write_jsonl
+            from human_eval.evaluation import evaluate_functional_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        self.problem_file = problem_file
+        self.k = k
+        self.num_workers = n_workers
+        self.timeout = timeout
+        self.model_adapter = model_adapter
+
+        self.read_problems_func = read_problems
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = evaluate_functional_correctness
+
+        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
+        self.problems = self.read_problems_func(self.problem_file)
+
+        # Deal with the output paths
+        self.outputs_structure = OutputsStructure(outputs)
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = 'Complete the following python code:\n'
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+            # pred_res: dict = self.model_adapter.predict(inputs)
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, results: dict) -> dict:
+        """
+        Generate report from evaluation results.
+
+        Returns:
+        {
+            "name":"ARC-Challenge",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"DEFAULT",
+                    "score":0.3389,
+                    "subset":[
+                        {
+                            "name":"ARC-Challenge",
+                            "score":0.3389
+                        },
+                    ]
+                }
+            ],
+            "total_num":100
+        }
+        """
+        results = {k: normalize_score(score=v) for k, v in results.items()}
+
+        category_d = dict(name='DEFAULT', score=results, subset=[])
+
+        res_map = dict(
+            name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
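The evaluator added above drives the whole HumanEval flow: it prompts the model to complete each problem, strips the reply down to a bare function body with _postprocess, writes a predictions JSONL, and scores it with human_eval's evaluate_functional_correctness to get pass@k. A minimal usage sketch follows; the stub adapter, identifiers and paths are illustrative assumptions rather than part of this release, and running it requires the human_eval package.

from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator


class StubAdapter:
    # Hypothetical adapter: anything exposing predict(inputs=..., infer_cfg=...) and
    # returning an OpenAI-style chat payload satisfies the contract used by get_answers().

    def predict(self, inputs: dict, infer_cfg: dict) -> dict:
        return {'choices': [{'message': {'content': '    return None\n'}}]}


evaluator = HumanevalEvaluator(
    problem_file='HumanEval.jsonl.gz',  # placeholder path to the human_eval problem set
    model_id='my-model',                # placeholder identifiers
    model_revision='v1.0.0',
    model_adapter=StubAdapter(),
    k=[1],                              # report pass@1 only
    n_workers=4,
    timeout=3.0,
)
evaluator.eval(infer_cfg={'max_new_tokens': 512, 'temperature': 0.0})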
evalscope/evaluator/rating_eval.py
CHANGED
@@ -1,24 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from typing import List, Union
-
 import pandas as pd
 import pyarrow as pa
+from typing import List, Union
 
 from evalscope.constants import MetricMembers
+from evalscope.utils import jsonl_to_list
 from evalscope.utils.arena_utils import compute_elo
 from evalscope.utils.logger import get_logger
-from evalscope.utils import jsonl_to_list
 
 logger = get_logger()
 
-DEFAULT_COLUMNS_MAPPING = {
-    'model_a': 'model_a',
-    'model_b': 'model_b',
-    'win': 'win',
-    'tstamp': 'ts',
-    'language': 'lang'
-}
+DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}
 
 
 class RatingEvaluate(object):
@@ -41,10 +34,9 @@ class RatingEvaluate(object):
         elo_ratings = compute_elo(battles)
         col_model = 'Model'
         col_elo_rating = 'Elo_Rating'
-        elo_ratings_res = pd.DataFrame(
-            [[n, elo_ratings[n]] for n in elo_ratings.keys()],
-            columns=[col_model, col_elo_rating]).sort_values(
-            col_elo_rating, ascending=False).reset_index(drop=True)
+        elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
+                                       columns=[col_model, col_elo_rating]).sort_values(
+                                           col_elo_rating, ascending=False).reset_index(drop=True)
         elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
         return elo_ratings_res
 
@@ -89,23 +81,11 @@ class RatingEvaluate(object):
                 'tie': 1
             }]
         else:
-            return [{
-                'model': winner,
-                'win': 1,
-                'loss': 0,
-                'tie': 0
-            }, {
-                'model': loser,
-                'win': 0,
-                'loss': 1,
-                'tie': 0
-            }]
+            return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]
 
     def compute_pairwise_rating(self, raw_data):
         df_all = self.preprocess(raw_data_df=raw_data)
-        model_list = (
-            df_all['model_a'].unique().tolist()
-            + df_all['model_b'].unique().tolist())
+        model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
         model_list = list(set(model_list))
 
         list_res = []
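For context, get_single_pairwise_rating above emits one win/loss/tie record per model per battle. The snippet below is an illustrative aggregation (not part of this diff) that turns such records into per-model win rates with pandas, counting a tie as half a win.

import pandas as pd

records = [
    {'model': 'model_a', 'win': 1, 'loss': 0, 'tie': 0},
    {'model': 'model_b', 'win': 0, 'loss': 1, 'tie': 0},
    {'model': 'model_a', 'win': 0, 'loss': 0, 'tie': 1},
    {'model': 'model_b', 'win': 0, 'loss': 0, 'tie': 1},
]

# Sum the per-battle counters for each model, then derive a win rate.
df = pd.DataFrame(records).groupby('model').sum()
df['win_rate'] = (df['win'] + 0.5 * df['tie']) / (df['win'] + df['loss'] + df['tie'])
print(df.sort_values('win_rate', ascending=False))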
@@ -114,8 +94,7 @@ class RatingEvaluate(object):
             if self.baseline_model is not None:
                 if self.baseline_model not in [row['model_a'], row['model_b']]:
                     logger.warning(
-                        f'One of the models in the battle should be the baseline model: {self.baseline_model}'
-                    )
+                        f'One of the models in the battle should be the baseline model: {self.baseline_model}')
                     continue
             rating = self.get_single_pairwise_rating(row)
             list_res = list_res + rating
@@ -149,15 +128,15 @@ class RatingEvaluate(object):
 
         for metric in self.metrics:
 
-            if metric == MetricMembers.ELO
+            if metric == MetricMembers.ELO:
                 res = self.compute_elo_rating(raw_data)
                 res_all.append(res)
 
-            elif metric == MetricMembers.PAIRWISE
+            elif metric == MetricMembers.PAIRWISE:
                 res = self.compute_pairwise_rating(raw_data)
                 res_all.append(res)
 
-            elif metric == MetricMembers.SCORE
+            elif metric == MetricMembers.SCORE:
                 res = self.compute_score_rating(raw_data)
                 res_all.append(res)
 
evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED
@@ -2,6 +2,7 @@
 # flake8: noqa
 
 import os
+import pandas as pd
 import random
 import sys
 import time
@@ -9,15 +10,10 @@ from abc import ABC, abstractmethod
 from functools import partial
 from typing import Any, List
 
-import pandas as pd
-
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers
-from evalscope.utils.arena_utils import (get_battle_pairs,
-                                         merge_ques_ans,
-                                         shuffle_pairwise_preferences)
-from evalscope.utils import dump_jsonl_data, jsonl_to_list, random_seeded_choice
+from evalscope.utils import completion_parsers, dump_jsonl_data, jsonl_to_list, random_seeded_choice
+from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -33,8 +29,7 @@ class BaseReviewer(ABC):
         """
         Run pairwise battles with given models.
         """
-        raise NotImplementedError(
-            'run() method must be implemented in your subclass.')
+        raise NotImplementedError('run() method must be implemented in your subclass.')
 
 
 class AutoReviewerGpt4(BaseReviewer):
@@ -71,13 +66,9 @@ class AutoReviewerGpt4(BaseReviewer):
 
         self.review_result_file = review_result_file
         self.prompt_list = jsonl_to_list(prompt_file)
-        self.answer_list = [
-            jsonl_to_list(answer_file) for answer_file in answer_file_list
-        ]
-        self.reference_list = jsonl_to_list(
-            reference_file) if reference_file else []
-        self.cache_list = jsonl_to_list(
-            cache_file) if cache_file and os.path.isfile(cache_file) else []
+        self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
+        self.reference_list = jsonl_to_list(reference_file) if reference_file else []
+        self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []
 
         self.reviewer_args = reviewer_args if reviewer_args \
             else self._get_default_args()
@@ -88,24 +79,18 @@ class AutoReviewerGpt4(BaseReviewer):
             self.answer_list.append(jsonl_to_list(baseline_file))
             self.baseline_idx = len(self.answer_list) - 1
 
-        self.position_bias_mitigation = self.reviewer_args.pop(
-            EvalConfigKeys.POSITION_BIAS_MITIGATION,
-            PositionBiasMitigation.NONE)
+        self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
+                                                               PositionBiasMitigation.NONE)
         if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-            self.random_seed = self.reviewer_args.pop(
-                EvalConfigKeys.RANDOM_SEED, 123)
-
-        fn_completion_parser = self.reviewer_args.pop(
-            EvalConfigKeys.FN_COMPLETION_PARSER,
-            FnCompletionParser.LMSYS_PARSER)
-        completion_parser_kwargs = self.reviewer_args.pop(
-            EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
+            self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
+
+        fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
+                                                      FnCompletionParser.LMSYS_PARSER)
+        completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
         if isinstance(fn_completion_parser, str):
-            fn_completion_parser = getattr(completion_parsers,
-                                           fn_completion_parser)
+            fn_completion_parser = getattr(completion_parsers, fn_completion_parser)
 
-        self.fn_completion_parser = partial(fn_completion_parser,
-                                            **completion_parser_kwargs)
+        self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
         self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)
 
     @staticmethod
@@ -133,45 +118,35 @@ class AutoReviewerGpt4(BaseReviewer):
         # Default to general category (idx 0)
         target_prompt_dict = prompts_list[0]
         for item in prompts_list:
-            is_category_match = category in item['category'] if isinstance(
-                item['category'], list) else item['category'] == category
+            is_category_match = category in item['category'] if isinstance(item['category'],
+                                                                           list) else item['category'] == category
             is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
             if is_category_match and is_type_match:
                 target_prompt_dict = item
                 break
-            elif is_type_match and target_prompt_dict.get('type',
-                                                          ArenaMode.PAIRWISE) != type:
+            elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
                 target_prompt_dict = item  # fallback to type match
 
         sys_prompt = target_prompt_dict['system_prompt']
         prompt_template = target_prompt_dict['prompt_template']
         defaults = target_prompt_dict.get('defaults', dict({}))
-        output_format = target_prompt_dict.get('output_format',
-                                               '[[rating_a,rating_b]]')
+        output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')
 
         if type == ArenaMode.SINGLE:
-            user_prompt = prompt_template.format(
-                question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
+            user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
         else:
             user_prompt = prompt_template.format(
-                question=ques,
-                answer_a=ans1,
-                answer_b=ans2,
-                ref_answer_1=ans_ref,
-                **defaults)
+                question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)
 
         return sys_prompt, user_prompt, output_format
 
     def get_review_cache(self, model_a, model_b, question) -> list:
         if model_b:
-            cache_hit = next(
-                (r for r in self.cache_list
-                 if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
-                None)
+            cache_hit = next((r for r in self.cache_list
+                              if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
+                             None)
         else:
-            cache_hit = next(
-                (r for r in self.cache_list
-                 if r['model'] == model_a and r['question'] == question), None)
+            cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
         return cache_hit
 
     def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
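The hunk above asks the judge for a verdict in the template's output_format (default '[[rating_a,rating_b]]'); the reply is later handed to the configured completion parser (FnCompletionParser.LMSYS_PARSER unless overridden). Below is a stand-in sketch of such a parser, for illustration only and not the evalscope implementation.

import re


def parse_pairwise_rating(review_text: str):
    # Extract two numeric ratings from a '[[rating_a,rating_b]]' style verdict.
    match = re.search(r'\[\[(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\]\]', review_text)
    if match is None:
        return None, None
    return float(match.group(1)), float(match.group(2))


print(parse_pairwise_rating('Assistant A is more helpful. Verdict: [[8, 6]]'))  # -> (8.0, 6.0)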
@@ -265,12 +240,10 @@ class AutoReviewerGpt4(BaseReviewer):
         return review_result
 
     def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
-        input_msg = dict(
-            ques=question, category=category, ans1=ans1, ans2=ans2)
+        input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
         if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list
-                            if ref.get('text') == question), None)
+            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
             assert ans_ref['answer']
             input_msg['ans_ref'] = ans_ref['answer']
 
@@ -284,8 +257,7 @@ class AutoReviewerGpt4(BaseReviewer):
         else:
             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
 
-        result = self.fn_completion_parser(
-            review_text, output_format=output_format)
+        result = self.fn_completion_parser(review_text, output_format=output_format)
         if not isinstance(result, tuple):
             result = (result, None)
         return review_text, *result
@@ -294,8 +266,7 @@ class AutoReviewerGpt4(BaseReviewer):
         input_msg = dict(ques=question, category=category, ans1=answer)
 
         if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list
-                            if ref.get('text') == question), None)
+            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
             assert ans_ref['answer']
             input_msg['ans_ref'] = ans_ref['answer']
 
@@ -312,8 +283,7 @@ class AutoReviewerGpt4(BaseReviewer):
         score = self.fn_completion_parser(review_text, output_format)
         return review_text, score
 
-    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str,
-                                       output_format) -> str:
+    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
         logger.info('Get dummy scores for input prompt ...')
         if output_format == '[[rating]]':
             return f'[[{round(random.random(), 2)}]]'
@@ -359,8 +329,7 @@ class AutoReviewerGpt4(BaseReviewer):
         if self.review_mode == ArenaMode.PAIRWISE:
             battle_pairs = get_battle_pairs(merged_ans_df.columns)
         elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-            battle_pairs = get_battle_pairs(merged_ans_df.columns,
-                                            self.baseline_idx)
+            battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
         elif self.review_mode == ArenaMode.SINGLE:
             battle_pairs = [(col, ) for col in merged_ans_df.columns]
         else:
@@ -373,14 +342,12 @@ class AutoReviewerGpt4(BaseReviewer):
             pair_df.columns = ['output_1', 'output_2']
             pair_df['is_switched_outputs'] = pair_df.apply(
                 lambda x: random_seeded_choice(
-                    seed='is_switched_outputs' + x[0]['text'] + str(
-                        self.random_seed),
+                    seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
                     choices=[False, True],
                 ),
                 axis=1,
             )
-            pair_df = shuffle_pairwise_preferences(
-                pair_df, pair_df['is_switched_outputs'])
+            pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])
 
         for index, row in pair_df.iterrows():
             row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
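In the RANDOMIZE_ORDER branch above, whether a pair of answers is swapped is decided by a coin flip derived deterministically from the question text and the configured seed, so reruns shuffle identically while the judge still sees both orders across questions. The sketch below illustrates such a seeded flip; the hashing scheme is an assumption, not evalscope's random_seeded_choice.

import hashlib


def seeded_choice(seed: str, choices):
    # Pick an element of `choices` deterministically from a string seed.
    digest = int(hashlib.sha256(seed.encode('utf-8')).hexdigest(), 16)
    return choices[digest % len(choices)]


question = 'What is the capital of France?'
random_seed = 123
swap = seeded_choice('is_switched_outputs' + question + str(random_seed), [False, True])

answer_a, answer_b = 'Paris.', 'Lyon.'
if swap:  # present the answers to the judge in swapped order
    answer_a, answer_b = answer_b, answer_a
print(swap, answer_a, answer_b)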
@@ -395,17 +362,21 @@ if __name__ == '__main__':
 
     work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
     prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
-    answer_file_list = [
-
+    answer_file_list = [
+        os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
+        os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
+    ]
     review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')
 
-    input_kwargs = dict(
-
-
-
-
-
-
+    input_kwargs = dict(
+        prompt_file=prompt_template_path,
+        answer_file_list=answer_file_list,
+        review_result_file=review_result_file_path,
+        reviewer_args={},
+        baseline_file='',
+        reference_file='',
+        cache_file='',
+    )
 
     auto_reviewer = AutoReviewerGpt4(**input_kwargs)
     auto_reviewer.run(dry_run=True)
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED
@@ -29,16 +29,17 @@ In these examples settings.xml lists input files and formats.
 """
 
 from __future__ import absolute_import, division, print_function
-import collections
-import re
-import os
 
+import collections
 import nltk
 import numpy as np
+import os
+import re
 import six
 from absl import logging
 from rouge_score import scoring, tokenizers
 from six.moves import map, range
+
 from evalscope.utils import get_logger
 
 logger = get_logger()
@@ -81,11 +82,7 @@ class RougeScorer(scoring.BaseScorer):
      ...                       'The quick brown dog jumps on the log.')
   """
 
-  def __init__(self,
-               rouge_types,
-               use_stemmer=False,
-               split_summaries=False,
-               tokenizer=None):
+  def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
 
     self.rouge_types = rouge_types
     if tokenizer:
@@ -160,21 +157,15 @@ class RougeScorer(scoring.BaseScorer):
        sents = [x for x in sents if len(x)]
        return sents
 
-      target_tokens_list = [
-          self._tokenizer.tokenize(s) for s in get_sents(target)
-      ]
-      prediction_tokens_list = [
-          self._tokenizer.tokenize(s) for s in get_sents(prediction)
-      ]
+      target_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(target)]
+      prediction_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(prediction)]
 
-      scores = _summary_level_lcs(target_tokens_list,
-                                  prediction_tokens_list)
+      scores = _summary_level_lcs(target_tokens_list, prediction_tokens_list)
     elif re.match(r'rouge[0-9]$', six.ensure_str(rouge_type)):
       # Rouge from n-grams.
       n = int(rouge_type[5:])
       if n <= 0:
-        raise ValueError('rougen requires positive n: %s'
-                         % rouge_type)
+        raise ValueError('rougen requires positive n: %s' % rouge_type)
       target_ngrams = _create_ngrams(target_tokens, n)
       prediction_ngrams = _create_ngrams(prediction_tokens, n)
       scores = _score_ngrams(target_ngrams, prediction_ngrams)
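For the rouge[0-9] branch above, _score_ngrams (next hunk) counts clipped n-gram overlap; precision divides that overlap by the prediction's n-gram count, recall by the target's, and the reported score is their harmonic mean. A compact illustrative sketch of that arithmetic, not the bundled implementation:

import collections


def ngrams(tokens, n):
    # Multiset of n-grams as a Counter of token tuples.
    return collections.Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def rouge_n(target_tokens, prediction_tokens, n=2):
    target, prediction = ngrams(target_tokens, n), ngrams(prediction_tokens, n)
    overlap = sum(min(count, prediction[gram]) for gram, count in target.items())
    precision = overlap / max(sum(prediction.values()), 1)
    recall = overlap / max(sum(target.values()), 1)
    f1 = 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)
    return precision, recall, f1


print(rouge_n('the quick brown fox jumps'.split(), 'the quick brown dog jumps'.split(), n=2))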
@@ -349,8 +340,7 @@ def _score_ngrams(target_ngrams, prediction_ngrams):
 
   intersection_ngrams_count = 0
   for ngram in six.iterkeys(target_ngrams):
-    intersection_ngrams_count += min(target_ngrams[ngram],
-                                     prediction_ngrams[ngram])
+    intersection_ngrams_count += min(target_ngrams[ngram], prediction_ngrams[ngram])
   target_ngrams_count = sum(target_ngrams.values())
   prediction_ngrams_count = sum(prediction_ngrams.values())
 
evalscope/metrics/code_metric.py
CHANGED
@@ -4,7 +4,6 @@ import inspect
 import re
 import signal
 from collections import defaultdict
-
 from tqdm import tqdm
 
 
@@ -20,8 +19,7 @@ def check_input(text, arg):
     code_block = code_block_pattern.search(text)
     code_string = code_block.group(1)
 
-    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(',
-                                       re.DOTALL)
+    function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(', re.DOTALL)
     function_name_block = function_name_pattern.search(code_string)
     function_name = function_name_block.group(1)
 
@@ -52,9 +50,7 @@ def exec_func(func, arr):
 
 
 def compute_pass_k_one_sample(predict, func_args, func_outputs, k=4):
-    assert len(
-        predict
-    ) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
+    assert len(predict) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
    for predict_i in predict[:k]:
        try:
            for arg, gold in zip(func_args, func_outputs):
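compute_pass_k_one_sample above counts a problem as solved at k when any of its first k generations passes every provided input/output pair, which is what the assert guards. A tiny sketch of that notion on made-up data:

def pass_at_k(per_candidate_results, k):
    # per_candidate_results: one boolean per generated candidate, True if it passed all tests.
    assert len(per_candidate_results) >= k, f'pass@{k} needs at least {k} candidates'
    return any(per_candidate_results[:k])


results_per_problem = [[True, False], [False, False], [False, True]]
k = 2
score = sum(pass_at_k(r, k) for r in results_per_problem) / len(results_per_problem)
print(f'pass@{k} = {score:.2f}')  # 2 of 3 problems solved -> 0.67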
@@ -87,9 +83,7 @@ def compute_pass_k(predict_l, reference_l, func_args_l, k=4, lang='py'):
 def run_code_eval(data_l, k=4, md_level=2):
     print(f"{'#' * md_level} Code Eval(pass@{k})")
     for data in tqdm(data_l):
-        data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'],
-                                                      data['func_args'],
-                                                      data['func_outputs'], k)
+        data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'], data['func_args'], data['func_outputs'], k)
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
evalscope/metrics/math_accuracy.py
CHANGED
@@ -2,7 +2,6 @@
 
 import re
 from collections import defaultdict
-
 from tqdm import tqdm
 
 from evalscope.constants import MetricsConstant
@@ -44,8 +43,7 @@ def compute_math_accuracy(predict_l, reference_l):
 def run_math_eval(data_l, md_level=2):
     print(f"{'#' * md_level} Math Eval(math accuracy)")
     for data in tqdm(data_l):
-        data['math_accuracy'] = compute_math_accuracy_one_sample(
-            data['gen'], data['target'])
+        data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
@@ -54,7 +52,6 @@ def run_math_eval(data_l, md_level=2):
     print(f'[total], count: {len(data_l)}, math accuracy: '
           f'{correct_cnt / len(data_l) * 100:0.2f}%')
     for task in task_data_d.keys():
-        correct_cnt = sum(
-            [data['math_accuracy'] for data in task_data_d[task]])
+        correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
         print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
-              f'{correct_cnt/len(task_data_d[task])*100:0.2f}%')
+              f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')
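run_math_eval above attaches a per-sample math_accuracy flag and then reports accuracy overall and per task tag. The snippet below mirrors that aggregation on made-up records; the exact-match check is a stand-in, since compute_math_accuracy_one_sample's matching rules are not shown in this hunk.

from collections import defaultdict

data_l = [
    {'gen': '42', 'target': '42', 'task_tags': ['algebra']},
    {'gen': '3.14', 'target': '2.72', 'task_tags': ['algebra', 'geometry']},
]

# Stand-in accuracy check: exact string match between generation and target.
for data in data_l:
    data['math_accuracy'] = int(data['gen'].strip() == data['target'].strip())

task_data_d = defaultdict(list)
for data in data_l:
    for task in data['task_tags']:
        task_data_d[task].append(data)

total = sum(d['math_accuracy'] for d in data_l)
print(f'[total], count: {len(data_l)}, math accuracy: {total / len(data_l) * 100:0.2f}%')
for task, items in task_data_d.items():
    correct = sum(d['math_accuracy'] for d in items)
    print(f'[{task}], count: {len(items)}, math accuracy: {correct / len(items) * 100:0.2f}%')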