evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Note: this version of evalscope has been flagged as a potentially problematic release.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py:

@@ -1,21 +1,206 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List

-
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import weighted_mean
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger

+logger = get_logger()

 DATASET_ID = 'modelscope/humaneval'
 SUBSET_LIST = ['openai_humaneval']

-# Note: ONLY FOR CLASS IMPORT, No implementation here.
-
 # Example:
-# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}  # noqa


-class HumanevalAdapter:
+class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """

-    def __init__(self
-
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = None,
+                 eval_split: str = 'test',
+                 prompt_template: str = 'Complete the following python code:\n',
+                 **kwargs):
+        try:
+            from human_eval.data import stream_jsonl, write_jsonl
+            from human_eval.evaluation import check_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        self.k = [1]
+        self.num_workers = 4
+        self.timeout = 4.0
+        self.outputs = kwargs.get('outputs', None)
+
+        self.read_problems_func = stream_jsonl
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = check_correctness
+
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate prompt for the model.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the Humaneval:
+            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+        """
+        full_prompt = input_d['prompt']
+        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+        return {'data': [full_prompt]}
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = ''
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'HumanEval',
+            metric='pass@1',
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        return self._postprocess(result)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def match(self, gold: str, pred: str) -> float:
+        res = self.eval_func(gold, pred, self.timeout)
+        return float(res['passed'])
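The HumanevalAdapter rewrite above moves prediction, post-processing, and scoring behind the DataAdapter interface. Its `_postprocess` step strips markdown fences and redefined function signatures from a model completion before the human_eval checker runs it; below is a minimal standalone sketch of that fence-stripping behaviour (the helper name and sample completion are illustrative, not part of the package):

import re


def strip_completion(text: str) -> str:
    # Rough sketch of HumanevalAdapter._postprocess: keep only the first fenced
    # code block and drop a redefined 'def' line, so the remainder can be
    # appended to the original HumanEval prompt.
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        text = blocks[0] if blocks else text.split('```')[1]
        if not text.startswith('\n'):  # drop a leading "python" language tag
            text = text[text.find('\n') + 1:]
    text = text.split('\n\n')[0]
    if text.strip().startswith('def'):  # keep only the indented body
        text = '\n'.join(text.split('\n')[1:])
    return text


raw = "```python\ndef has_close_elements(numbers, threshold):\n    return False\n```"
print(strip_completion(raw))  # -> "    return False"

In the adapter itself the equivalent cleanup is shared by get_answers and parse_pred_result, so both execution paths score the same cleaned completion.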
evalscope/benchmarks/mmlu/__init__.py:

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter
 from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/mmlu/mmlu.py:

@@ -1,3 +1,4 @@
+# isort: skip_file
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
 #
@@ -14,14 +15,11 @@
 # limitations under the License.
 # flake8: noqa

-import os
-
 import datasets
+import os
 import pandas as pd
-
 """The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""

-
 _CITATION = """\
 @article{hendryckstest2021,
     title={Measuring Massive Multitask Language Understanding},
@@ -105,29 +103,23 @@ task_list = [


 class MMLUConfig(datasets.BuilderConfig):
+
     def __init__(self, **kwargs):
         super().__init__(version=datasets.Version('1.0.0'), **kwargs)


 class MMLU(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [
-        MMLUConfig(
-            name=task_name,
-        )
-        for task_name in task_list
-    ]
+    BUILDER_CONFIGS = [MMLUConfig(name=task_name, ) for task_name in task_list]

     def _info(self):
-        features = datasets.Features(
-
-
-
-
-
-
-
-            }
-        )
+        features = datasets.Features({
+            'input': datasets.Value('string'),
+            'A': datasets.Value('string'),
+            'B': datasets.Value('string'),
+            'C': datasets.Value('string'),
+            'D': datasets.Value('string'),
+            'target': datasets.Value('string'),
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
@@ -143,25 +135,19 @@ class MMLU(datasets.GeneratorBasedBuilder):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    'filepath': os.path.join(
-                        data_dir, 'data', 'test', f'{task_name}_test.csv'
-                    ),
+                    'filepath': os.path.join(data_dir, 'data', 'test', f'{task_name}_test.csv'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    'filepath': os.path.join(
-                        data_dir, 'data', 'val', f'{task_name}_val.csv'
-                    ),
+                    'filepath': os.path.join(data_dir, 'data', 'val', f'{task_name}_val.csv'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    'filepath': os.path.join(
-                        data_dir, 'data', 'dev', f'{task_name}_dev.csv'
-                    ),
+                    'filepath': os.path.join(data_dir, 'data', 'dev', f'{task_name}_dev.csv'),
                 },
             ),
         ]
evalscope/benchmarks/mmlu/mmlu_adapter.py:

@@ -4,8 +4,9 @@ import os

 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa

 logger = get_logger()
@@ -72,65 +73,65 @@ SUBSET_LIST = [
     'college_biology',
 ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}


 class MMLUAdapter(DataAdapter):
@@ -160,12 +161,13 @@ class MMLUAdapter(DataAdapter):
             logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
             few_shot_num = 5

-        super().__init__(
-
-
-
-
-
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -227,8 +229,7 @@ class MMLUAdapter(DataAdapter):

         """
         prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name)
-        )
+            self._format_subject(subset_name))
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]

         context: str = '\n'.join(few_shot_prompts) + '\n'
@@ -335,19 +336,26 @@ class MMLUAdapter(DataAdapter):
             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
                 sum([num for _, _, num in domain_res_list])
             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-
-
-
+            category_list.append({
+                'name':
+                domain_name,
+                'score':
+                domain_weighted_avg_acc,
+                'subset': [{
+                    'name': subset_name,
+                    'score': normalize_score(score=subset_score)
+                } for subset_name, subset_score, _ in domain_res_list]
+            })

         category_list = sorted(category_list, key=lambda x: x['name'])

         # Get final dict of report
-        res_map = dict(
-
-
-
-
+        res_map = dict(
+            name=report_name or 'mmlu',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=category_list,
+            total_num=total_num)

         return res_map

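The gen_report reshaping in the last two hunks keeps the same output structure as before: per-subset scores are grouped into SUBJECT_MAPPING domains and rolled up into a weighted average. A rough sketch of the resulting shape for an invented two-subset score map (the numbers, the rounding, and the metric name are illustrative; the real code uses normalize_score and self.metric_list):

# subset_score_map maps subset name -> (score, num_samples), as passed to gen_report.
subset_score_map = {'abstract_algebra': (0.31, 100), 'anatomy': (0.48, 135)}

total_num = sum(num for _, num in subset_score_map.values())
weighted_avg_acc = sum(score * num for score, num in subset_score_map.values()) / total_num

report = {
    'name': 'mmlu',
    'metric': 'WeightedAverageAccuracy',  # assumed default metric name
    'score': round(weighted_avg_acc, 4),  # stand-in for normalize_score
    'category': [
        # per SUBJECT_MAPPING, anatomy falls under 'Other' and abstract_algebra under 'STEM';
        # category_list is sorted by name, so 'Other' precedes 'STEM'
        {'name': 'Other', 'score': 0.48, 'subset': [{'name': 'anatomy', 'score': 0.48}]},
        {'name': 'STEM', 'score': 0.31, 'subset': [{'name': 'abstract_algebra', 'score': 0.31}]},
    ],
    'total_num': total_num,
}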
evalscope/benchmarks/race/__init__.py:

@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
+from evalscope.benchmarks.race.race_adapter import RACEAdapter
 from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/race/race.py:

@@ -11,12 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-
 import datasets
+import os
 import pandas as pd

-
 _CITATION = """\
 @inproceedings{lai-etal-2017-race,
     title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
@@ -40,39 +38,33 @@ _DESCRIPTION = """\
 RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
 """

-_HOMEPAGE =
+_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/race/summary'

-_URL =
+_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip'

 task_list = [
-
-
+    'high',
+    'middle',
 ]


 class RACEConfig(datasets.BuilderConfig):
+
     def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version(
+        super().__init__(version=datasets.Version('1.0.0'), **kwargs)


 class RACE(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [
-        RACEConfig(
-            name=task_name,
-        )
-        for task_name in task_list
-    ]
+    BUILDER_CONFIGS = [RACEConfig(name=task_name, ) for task_name in task_list]

     def _info(self):
-        features = datasets.Features(
-
-
-
-
-
-
-            }
-        )
+        features = datasets.Features({
+            'example_id': datasets.Value('string'),
+            'article': datasets.Value('string'),
+            'answer': datasets.Value('string'),
+            'question': datasets.Value('string'),
+            'options': [datasets.Value('string')],
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
@@ -87,32 +79,26 @@ class RACE(datasets.GeneratorBasedBuilder):
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-
-                        data_dir, f"race/test/{task_name}-00000-of-00001.parquet"
-                    ),
+                    'filepath': os.path.join(data_dir, f'race/test/{task_name}-00000-of-00001.parquet'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-
-                        data_dir, f"race/val/{task_name}-00000-of-00001.parquet"
-                    ),
+                    'filepath': os.path.join(data_dir, f'race/val/{task_name}-00000-of-00001.parquet'),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-
-                        data_dir, f"race/train/{task_name}-00000-of-00001.parquet"
-                    ),
+                    'filepath': os.path.join(data_dir, f'race/train/{task_name}-00000-of-00001.parquet'),
                 },
            ),
         ]

     def _generate_examples(self, filepath):
         df = pd.read_parquet(filepath)
-        df.columns = [
+        df.columns = ['example_id', 'article', 'answer', 'question', 'options']

-        for i, instance in enumerate(df.to_dict(orient=
-            yield i, instance
+        for i, instance in enumerate(df.to_dict(orient='records')):
+            yield i, instance
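The rewritten RACE._generate_examples above simply renames the parquet columns and yields one dict per row. The same pattern outside the datasets builder, as a self-contained sketch (the file path is illustrative and the parquet file is assumed to exist locally):

import pandas as pd


def iter_race_examples(filepath: str):
    # Mirror of the rewritten _generate_examples: read one split's parquet file,
    # normalize the column names, and yield (index, example_dict) pairs.
    df = pd.read_parquet(filepath)
    df.columns = ['example_id', 'article', 'answer', 'question', 'options']
    for i, instance in enumerate(df.to_dict(orient='records')):
        yield i, instance


# for idx, example in iter_race_examples('race/test/high-00000-of-00001.parquet'):
#     print(idx, example['answer'], example['question'])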