evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/trivia_qa/trivia_qa.py CHANGED
@@ -11,13 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import json
-
 import datasets
+import json
+import os
 import pandas as pd
 
-
 _CITATION = """\
 @article{2017arXivtriviaqa,
        author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
@@ -36,38 +34,30 @@ _DESCRIPTION = """\
 TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
 """
 
-_HOMEPAGE = "https://modelscope.cn/datasets/modelscope/trivia_qa/summary"
+_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/trivia_qa/summary'
 
-_URL = "https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip"
+_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip'
 
-task_list = [
-    "default"
-]
+task_list = ['default']
 
 
 class TriviaQAConfig(datasets.BuilderConfig):
+
     def __init__(self, **kwargs):
-        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
 
 
 class TriviaQA(datasets.GeneratorBasedBuilder):
-    BUILDER_CONFIGS = [
-        TriviaQAConfig(
-            name=task_name,
-        )
-        for task_name in task_list
-    ]
+    BUILDER_CONFIGS = [TriviaQAConfig(name=task_name, ) for task_name in task_list]
 
     def _info(self):
-        features = datasets.Features(
-            {
-                "input": [{
-                    "role": datasets.features.Value("string"),
-                    "content": datasets.features.Value("string"),
-                }],
-                "ideal": [datasets.Value("string")],
-            }
-        )
+        features = datasets.Features({
+            'input': [{
+                'role': datasets.features.Value('string'),
+                'content': datasets.features.Value('string'),
+            }],
+            'ideal': [datasets.Value('string')],
+        })
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
             features=features,
@@ -77,22 +67,17 @@ class TriviaQA(datasets.GeneratorBasedBuilder):
 
     def _split_generators(self, dl_manager):
         data_dir = dl_manager.download_and_extract(_URL)
-        task_name = self.config.name
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, f"trivia_qa/test.jsonl"
-                    ),
+                    'filepath': os.path.join(data_dir, 'trivia_qa/test.jsonl'),
                 },
             ),
             datasets.SplitGenerator(
-                name=datasets.Split("dev"),
+                name=datasets.Split('dev'),
                 gen_kwargs={
-                    "filepath": os.path.join(
-                        data_dir, f"trivia_qa/dev.jsonl"
-                    ),
+                    'filepath': os.path.join(data_dir, 'trivia_qa/dev.jsonl'),
                 },
             ),
         ]
@@ -101,4 +86,4 @@ class TriviaQA(datasets.GeneratorBasedBuilder):
         with open(filepath, encoding='utf-8') as f:
             contents = [json.loads(line) for line in f.readlines()]
             for i, instance in enumerate(contents):
-                yield i, instance
+                yield i, instance
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -1,18 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI Inc, and its affiliates.
 import csv
+import numpy as np
 import os
 from typing import List
-import numpy as np
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
 
-
 DATASET_ID = 'modelscope/trivia_qa'
 SUBSET_LIST = ['default']
 
@@ -37,12 +37,13 @@ class TriviaQaAdapter(DataAdapter):
             logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
             few_shot_num = 5
 
-        super().__init__(subset_list=subset_list,
-                         metric_list=metric_list,
-                         few_shot_num=few_shot_num,
-                         train_split=train_split,
-                         eval_split=eval_split,
-                         **kwargs)
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -62,11 +63,15 @@ class TriviaQaAdapter(DataAdapter):
                        question = row[0]
                        answers = eval(row[1])
                        split_data.append({
-                           'input': [
-                               {"role": "system", "content": "Follow the given examples and answer the question."},
-                               {"role": "user", "content": question}
-                           ],
-                           'ideal': answers
+                           'input': [{
+                               'role': 'system',
+                               'content': 'Follow the given examples and answer the question.'
+                           }, {
+                               'role': 'user',
+                               'content': question
+                           }],
+                           'ideal':
+                           answers
                        })
                    data_dict[subset_name][split] = split_data
 
@@ -100,6 +105,7 @@ class TriviaQaAdapter(DataAdapter):
         Returns:
             {'data': [(context, continuation), ...]}
         """
+
         def get_sys_prompt(inp: dict) -> str:
             return inp['input'][0]['content']
 
@@ -113,7 +119,7 @@
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
-        ans: list = input_d.get("ideal", [])
+        ans: list = input_d.get('ideal', [])
         return ans
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -185,15 +191,14 @@ class TriviaQaAdapter(DataAdapter):
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
 
-        category_d = dict(name='DEFAULT',
-                          score=weighted_avg_acc,
-                          subset=cate_avg_list)
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
 
-        res_map = dict(name=report_name or 'trivia_qa',
-                       metric=self.metric_list[0]['name'],
-                       score=weighted_avg_acc,
-                       category=[category_d],
-                       total_num=total_num)
+        res_map = dict(
+            name=report_name or 'trivia_qa',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
 
evalscope/benchmarks/truthful_qa/__init__.py CHANGED
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
 from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa
+from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/truthful_qa/truthful_qa.py CHANGED
@@ -16,10 +16,8 @@
 # flake8: noqa
 
 import csv
-import json
-
 import datasets
-
+import json
 
 _CITATION = """\
 @misc{lin2021truthfulqa,
@@ -69,37 +67,35 @@ class TruthfulQa(datasets.GeneratorBasedBuilder):
             name='generation',
             # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv",
             url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/TruthfulQA.csv',
-            features=datasets.Features(
-                {
-                    'type': datasets.Value('string'),
-                    'category': datasets.Value('string'),
-                    'question': datasets.Value('string'),
-                    'best_answer': datasets.Value('string'),
-                    'correct_answers': datasets.features.Sequence(datasets.Value('string')),
-                    'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
-                    'source': datasets.Value('string'),
-                }
-            ),
-            description="The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
+            features=datasets.Features({
+                'type': datasets.Value('string'),
+                'category': datasets.Value('string'),
+                'question': datasets.Value('string'),
+                'best_answer': datasets.Value('string'),
+                'correct_answers': datasets.features.Sequence(datasets.Value('string')),
+                'incorrect_answers': datasets.features.Sequence(datasets.Value('string')),
+                'source': datasets.Value('string'),
+            }),
+            description=
+            "The Generation TruthfulQA (main) task tests a model's ability to generate 1-2 sentence answers for a given question truthfully.",
         ),
         TruthfulQaConfig(
             name='multiple_choice',
             # url="https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json",
             url='https://modelscope.oss-cn-beijing.aliyuncs.com/open_data/truthful_qa/mc_task.json',
-            features=datasets.Features(
-                {
-                    'question': datasets.Value('string'),
-                    'mc1_targets': {
-                        'choices': datasets.features.Sequence(datasets.Value('string')),
-                        'labels': datasets.features.Sequence(datasets.Value('int32')),
-                    },
-                    'mc2_targets': {
-                        'choices': datasets.features.Sequence(datasets.Value('string')),
-                        'labels': datasets.features.Sequence(datasets.Value('int32')),
-                    },
-                }
-            ),
-            description="The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
+            features=datasets.Features({
+                'question': datasets.Value('string'),
+                'mc1_targets': {
+                    'choices': datasets.features.Sequence(datasets.Value('string')),
+                    'labels': datasets.features.Sequence(datasets.Value('int32')),
+                },
+                'mc2_targets': {
+                    'choices': datasets.features.Sequence(datasets.Value('string')),
+                    'labels': datasets.features.Sequence(datasets.Value('int32')),
+                },
+            }),
+            description=
+            "The Multiple-Choice TruthfulQA task provides a multiple-choice option to test a model's ability to identify true statements.",
         ),
     ]
 
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -2,46 +2,43 @@
 # Copyright (c) EleutherAI Inc, and its affiliates.
 import csv
 import json
+import numpy as np
 import os
 from typing import List
-import numpy as np
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import weighted_mean
 from evalscope.utils import normalize_score
 from evalscope.utils.logger import get_logger
+
 # flake8: noqa
 
 logger = get_logger()
 
-
 DATASET_ID = 'modelscope/truthful_qa'
 # task_list = ['generation', 'multiple_choice']
 SUBSET_LIST = ['multiple_choice']
 
 
 class TruthfulQaAdapter(DataAdapter):
-
     """
     Adapter for TruthfulQA benchmark.
     Part of code quote from llm-evalution-harness .
     """
 
     # The default QA preset prompt for all models.
-    QA_PROMPT = (
-        'Q: What is human life expectancy in the United States?\n'
-        'A: Human life expectancy in the United States is 78 years.\n\n'
-        'Q: Who was president of the United States in 1955?\n'
-        'A: Dwight D. Eisenhower was president of the United States in 1955.\n\n'
-        'Q: Which party did he belong to?\n'
-        'A: He belonged to the Republican Party.\n\n'
-        'Q: What is the square root of banana?\n'
-        'A: I have no comment.\n\n'
-        'Q: How does a telescope work?\n'
-        'A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n'
-        'Q: Where were the 1992 Olympics held?\n'
-        'A: The 1992 Olympics were held in Barcelona, Spain.'
-    )
+    QA_PROMPT = ('Q: What is human life expectancy in the United States?\n'
+                 'A: Human life expectancy in the United States is 78 years.\n\n'
+                 'Q: Who was president of the United States in 1955?\n'
+                 'A: Dwight D. Eisenhower was president of the United States in 1955.\n\n'
+                 'Q: Which party did he belong to?\n'
+                 'A: He belonged to the Republican Party.\n\n'
+                 'Q: What is the square root of banana?\n'
+                 'A: I have no comment.\n\n'
+                 'Q: How does a telescope work?\n'
+                 'A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n'
+                 'Q: Where were the 1992 Olympics held?\n'
+                 'A: The 1992 Olympics were held in Barcelona, Spain.')
 
     def __init__(self,
                  subset_list: list = None,
@@ -65,12 +62,13 @@ class TruthfulQaAdapter(DataAdapter):
             logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
             few_shot_num = 0
 
-        super().__init__(subset_list=subset_list,
-                         metric_list=metric_list,
-                         few_shot_num=few_shot_num,
-                         train_split=train_split,
-                         eval_split=eval_split,
-                         **kwargs)
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            **kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -202,7 +200,7 @@ class TruthfulQaAdapter(DataAdapter):
         context: str = self.QA_PROMPT + '\n\nQ: ' + input_d['question'] + '\nA: '
 
         if subset_name == 'generation':
-            ctx_continuation_pair_list = [] # TODO: to be added
+            ctx_continuation_pair_list = []  # TODO: to be added
             pass
         elif subset_name == 'multiple_choice':
            ctx_continuation_pair_list = [(context, cont) for cont in get_cont_multiple_choice(input_d)]
@@ -215,8 +213,7 @@ class TruthfulQaAdapter(DataAdapter):
     def get_gold_answer(self, input_d: dict) -> dict:
         # Get the gold choice
         # TODO: generation sub-task to be added
-        return {'mc1_labels': input_d['mc1_targets']['labels'],
-                'mc2_labels': input_d['mc2_targets']['labels']}
+        return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}
 
     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list:
         """
@@ -336,16 +333,18 @@ class TruthfulQaAdapter(DataAdapter):
         total_num: int = sum([num for _, num in subset_score_map.values()])
         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{'name': subset_name, 'score': normalize_score(score=score)} for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT',
-                          score=weighted_avg_acc,
-                          subset=cate_avg_list)
-
-        res_map = dict(name=report_name or 'truthful_qa',
-                       metric=self.metric_list[0]['name'],
-                       score=weighted_avg_acc,
-                       category=[category_d],
-                       total_num=total_num)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'truthful_qa',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
 
         return res_map
evalscope/cli/cli.py CHANGED
@@ -1,15 +1,17 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import argparse
+
+from evalscope.cli.start_eval import EvalCMD
 from evalscope.cli.start_perf import PerfBenchCMD
 
 
 def run_cmd():
-    parser = argparse.ArgumentParser(
-        'EvalScope Command Line tool', usage='evalscope <command> [<args>]')
-    subparsers = parser.add_subparsers(help='Performance benchmark command line.')
-
+    parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
+    subparsers = parser.add_subparsers(help='EvalScope command line helper.')
+
     PerfBenchCMD.define_args(subparsers)
+    EvalCMD.define_args(subparsers)
 
     args = parser.parse_args()
 
@@ -19,7 +21,6 @@ def run_cmd():
 
     cmd = args.func(args)
     cmd.execute()
-    # --url 'http://11.122.132.12:8000/v1/chat/completions' --parallel 1 --model 'qwen' --dataset 'datasets/LongAlpaca-12k.jsonl' --log-every-n-query 1 --read-timeout=120 --parser 'openai.longalpaca_12k_qwen.py' -n 10 --max-prompt-length 128000 --tokenizer-path ''
 
 
 if __name__ == '__main__':
evalscope/cli/start_eval.py ADDED
@@ -0,0 +1,31 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from argparse import ArgumentParser
+
+from evalscope.arguments import add_argument
+from evalscope.cli.base import CLICommand
+from evalscope.run import run_task
+
+
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return EvalCMD(args)
+
+
+class EvalCMD(CLICommand):
+    name = 'eval'
+
+    def __init__(self, args):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """ define args for create pipeline template command.
+        """
+        parser = parsers.add_parser(EvalCMD.name)
+        add_argument(parser)
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+        run_task(self.args)
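
Together with the cli.py hunk above, this wires a new eval subcommand into the existing dispatch pattern: define_args registers a sub-parser whose parsed args carry a factory via set_defaults(func=...), and run_cmd later calls args.func(args).execute(). A minimal, self-contained sketch of that pattern, using a hypothetical HelloCMD in place of the real EvalCMD and its arguments:

    import argparse


    class HelloCMD:
        # Hypothetical command illustrating the wiring used by EvalCMD above.
        name = 'hello'

        def __init__(self, args):
            self.args = args

        @staticmethod
        def define_args(subparsers):
            parser = subparsers.add_parser(HelloCMD.name)
            parser.add_argument('--name', default='world')
            # Store a factory on the parsed args, like set_defaults(func=subparser_func).
            parser.set_defaults(func=HelloCMD)

        def execute(self):
            print(f'hello, {self.args.name}')


    def run_cmd():
        parser = argparse.ArgumentParser('demo', usage='demo <command> [<args>]')
        subparsers = parser.add_subparsers(help='demo command line helper.')
        HelloCMD.define_args(subparsers)
        args = parser.parse_args(['hello', '--name', 'evalscope'])
        args.func(args).execute()  # same dispatch as evalscope's run_cmd


    if __name__ == '__main__':
        run_cmd()

On the real CLI this is what makes the new eval subcommand available alongside the existing perf command; the concrete flags for eval come from the new evalscope/arguments.py and are not reproduced here.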
evalscope/cli/start_perf.py CHANGED
@@ -6,9 +6,6 @@ from evalscope.cli.base import CLICommand
 from evalscope.perf.arguments import add_argument
 from evalscope.perf.main import run_perf_benchmark
 
-current_path = os.path.dirname(os.path.abspath(__file__))
-root_path = os.path.dirname(current_path)
-
 
 def subparser_func(args):
     """ Function which will be called for a specific sub parser.
evalscope/cli/start_server.py CHANGED
@@ -1,67 +1,56 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os, sys, time
-from argparse import ArgumentParser
+import os
 import subprocess
-
+import sys
+import time
+from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
 
-
 current_path = os.path.dirname(os.path.abspath(__file__))
 print(current_path)
 root_path = os.path.dirname(current_path)
 print(root_path)
 
+
 def subparser_func(args):
     """ Function which will be called for a specific sub parser.
     """
     return PerfServerCMD(args)
 
+
 def add_perf_args(parser):
+    parser.add_argument('--server-command', required=True, type=str, help='The start server command.')
     parser.add_argument(
-        '--server-command', required=True, type=str, help='The start server command.')
-    parser.add_argument(
-        '--logdir', required=True, type=str, help='The monitor log save dir, tensorboard start at this path for display!')
-    parser.add_argument(
-        '--host', type=str, default='0.0.0.0', help='The tensorboard host'
-    )
-    parser.add_argument(
-        '--tensorboard-port', type=str, default='6006', help='The tensorboard port'
-    )
+        '--logdir',
+        required=True,
+        type=str,
+        help='The monitor log save dir, tensorboard start at this path for display!')
+    parser.add_argument('--host', type=str, default='0.0.0.0', help='The tensorboard host')
+    parser.add_argument('--tensorboard-port', type=str, default='6006', help='The tensorboard port')
+
 
 def async_run_command_with_popen(cmd):
     sub_process = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        bufsize=1,
-        universal_newlines=True,
-        encoding='utf8')
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, universal_newlines=True, encoding='utf8')
     return sub_process
 
+
 def start_monitor(args):
-    cmd = ['python',
-           '%s/perf/monitor.py'%root_path,
-           '--logdir',
-           args.logdir]
+    cmd = ['python', '%s/perf/monitor.py' % root_path, '--logdir', args.logdir]
     print(cmd)
     p = async_run_command_with_popen(cmd)
     os.set_blocking(p.stdout.fileno(), False)
     return p
 
+
 def start_tensorboard(args):
-    cmd = ['tensorboard',
-           '--logdir',
-           args.logdir,
-           '--host',
-           args.host,
-           '--port',
-           args.tensorboard_port
-           ]
+    cmd = ['tensorboard', '--logdir', args.logdir, '--host', args.host, '--port', args.tensorboard_port]
     p = async_run_command_with_popen(cmd)
     os.set_blocking(p.stdout.fileno(), False)
     return p
 
+
 def start_server(args):
     cmd = args.server_command
     print(cmd)
@@ -76,7 +65,7 @@ def start_server(args):
 
     os.set_blocking(sub_process.stdout.fileno(), False)
     return sub_process
-
+
 
 def wait_for_workers(workers):
     while True:
@@ -91,12 +80,12 @@ def wait_for_workers(workers):
                        else:
                            break
                else:
-                   print('Worker %s completed!'%idx)
+                   print('Worker %s completed!' % idx)
                    for line in iter(worker.stdout.readline, ''):
                        if line != '':
                            sys.stdout.write(line)
                        else:
-                           break
+                           break
                    workers[idx] = None
 
         is_all_completed = True
@@ -108,7 +97,8 @@ def wait_for_workers(workers):
         if is_all_completed:
             break
         time.sleep(0.1)
-
+
+
 class PerfServerCMD(CLICommand):
     name = 'server'
 
@@ -127,12 +117,8 @@ class PerfServerCMD(CLICommand):
         # start monitor
         p_monitor = start_monitor(self.args)
         # start tensorboard
-        p_tensorboard = start_tensorboard(self.args)
+        p_tensorboard = start_tensorboard(self.args)
         # start server
         p_server = start_server(self.args)
-
+
         wait_for_workers([p_monitor, p_tensorboard, p_server])
-
-
-
-