evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +2 -3
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/run_arena.py
CHANGED
@@ -3,16 +3,16 @@
 
 import argparse
 import os
-from pathlib import Path
 import torch
+from modelscope.utils.hf_util import GenerationConfig
+from pathlib import Path
 from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
 from evalscope.models.model_adapter import ChatGenerationModelAdapter
-from evalscope.utils import
+from evalscope.utils import dump_jsonl_data, get_obj_from_cfg, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
-from modelscope.utils.hf_util import GenerationConfig
 
 logger = get_logger()
 
@@ -41,8 +41,7 @@ class ArenaWorkflow:
     def _get_obj_from_cfg(obj_cfg: dict):
         cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
         if not cls_ref:
-            logger.warning(
-                f'Class reference is not specified in config: {obj_cfg}')
+            logger.warning(f'Class reference is not specified in config: {obj_cfg}')
             return obj_cfg
 
         cls = get_obj_from_cfg(cls_ref)
@@ -50,19 +49,16 @@ class ArenaWorkflow:
 
         return obj_cfg
 
-    def _predict_answers(self,
-
-                         model_revision: str,
-                         precision: torch.dtype,
-                         generation_config: GenerationConfig,
-                         template_type: str) -> list:
+    def _predict_answers(self, model_id_or_path: str, model_revision: str, precision: torch.dtype,
+                         generation_config: GenerationConfig, template_type: str) -> list:
 
         # TODO: multi-task to be supported
-        model_adapter = ChatGenerationModelAdapter(
-
-
-
-
+        model_adapter = ChatGenerationModelAdapter(
+            model_id=model_id_or_path,
+            model_revision=model_revision,
+            torch_dtype=precision,
+            generation_config=generation_config,
+            template_type=template_type)
         res_list = []
         questions_list = jsonl_to_list(self.question_file)
         for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
@@ -92,8 +88,7 @@ class ArenaWorkflow:
         for model_name, cfg_d in self.answers_gen.items():
             enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
             if not enable:
-                logger.warning(
-                    f'Skip model {model_name} because it is not enabled.')
+                logger.warning(f'Skip model {model_name} because it is not enabled.')
                 continue
 
             model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
@@ -105,11 +100,12 @@ class ArenaWorkflow:
             ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
             template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
 
-            answers_list = self._predict_answers(
-
-
-
-
+            answers_list = self._predict_answers(
+                model_id_or_path=model_id_or_path,
+                model_revision=model_revision,
+                precision=precision,
+                generation_config=custom_generation_config,
+                template_type=template_type)
 
             os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
             dump_jsonl_data(answers_list, ans_output_file)
@@ -163,8 +159,7 @@ class ArenaWorkflow:
         if enable:
             report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
             metrics = self.rating_gen.get('metrics', ['elo'])
-            baseline_model = self.rating_gen.get(
-                'baseline_model') if metrics[0] == 'pairwise' else None
+            baseline_model = self.rating_gen.get('baseline_model') if metrics[0] == 'pairwise' else None
             ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
             res_list = ae.run(self.review_file)
             rating_df = res_list[0]
evalscope/summarizer.py
CHANGED
@@ -1,14 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
 import json
 import os
-import glob
 from typing import List, Union
 
 from evalscope.config import TaskConfig
-from evalscope.constants import OutputsStructure
+from evalscope.constants import EvalBackend, OutputsStructure
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import
-    csv_to_list
+from evalscope.utils import csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -20,8 +19,8 @@ class Summarizer:
     def get_report(outputs_dir: str) -> List[dict]:
         res_list: list = []
 
-        outputs_structure
-        reports_dir: str = outputs_structure.
+        outputs_structure = OutputsStructure(outputs_dir, is_make=False)
+        reports_dir: str = outputs_structure.reports_dir
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')
 
@@ -70,9 +69,9 @@ class Summarizer:
 
        for candidate_task in candidate_task_cfgs:
            logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE
+            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE
 
-            if eval_backend == EvalBackend.NATIVE
+            if eval_backend == EvalBackend.NATIVE:
                 outputs_dir: str = candidate_task.get('outputs')
                 outputs_dir: str = os.path.expanduser(outputs_dir)
                 if outputs_dir is None:
@@ -80,7 +79,7 @@ class Summarizer:
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
                 final_res_list.extend(res_list)
 
-            elif eval_backend == EvalBackend.OPEN_COMPASS
+            elif eval_backend == EvalBackend.OPEN_COMPASS:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
                 work_dir = eval_config.get('work_dir') or 'outputs/default'
@@ -93,25 +92,25 @@ class Summarizer:
                     raise ValueError(f'No summary files in {res_folder_path}')
 
                 summary_file_path = summary_files[0]
-                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}
+                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'} # noqa: E501
                 summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
                 final_res_list.extend(summary_res)
-            elif eval_backend == EvalBackend.VLM_EVAL_KIT
+            elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
                 work_dir = eval_config.get('work_dir') or 'outputs'
                 if not os.path.exists(work_dir):
                     raise ValueError(f'work_dir {work_dir} does not exist.')
-
+
                 for model in eval_config['model']:
                     if model['name'] == 'CustomAPIModel':
                         model_name = model['type']
                     else:
                         model_name = model['name']
-
+
                     csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                     json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
-
+
                     summary_files = csv_files + json_files
                     for summary_file_path in summary_files:
                         if summary_file_path.endswith('csv'):
@@ -120,9 +119,9 @@ class Summarizer:
                             summary_res: dict = json_to_dict(summary_file_path)
                             file_name = os.path.basename(summary_file_path).split('.')[0]
                             final_res_list.append({file_name: summary_res})
-
-            elif eval_backend == EvalBackend.THIRD_PARTY
-                raise ValueError(
+
+            elif eval_backend == EvalBackend.THIRD_PARTY:
+                raise ValueError('*** The summarizer for Third party evaluation backend is not supported yet ***')
             else:
                 raise ValueError(f'Invalid eval_backend: {eval_backend}')
 
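For reference, a small usage sketch (not part of the diff) of the `Summarizer.get_report` entry point whose body is reworked above; the outputs directory is a placeholder and must point at the reports structure produced by a native evalscope run.

```python
# Sketch only: collects report dicts from a finished native run.
# The outputs path below is a placeholder assumption.
from evalscope.summarizer import Summarizer

report_list = Summarizer.get_report(outputs_dir='./outputs/20240101_000000')
for report in report_list:
    print(report)
```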
evalscope/third_party/longbench_write/README.md
CHANGED
@@ -3,14 +3,18 @@
 The LongWriter supports 10,000+ Word Generation From Long Context LLMs.
 We can use the benchmark LongBench-Write focuses more on measuring the long output quality as well as the output length.
 
-
+GitHub: [LongWriter](https://github.com/THUDM/LongWriter)
+
+Technical Report: [Minimum Tuning to Unlock Long Output from LLMs with High Quality Data as the Key](https://arxiv.org/abs/2410.10210)
+
 
 ## Usage
 
 ### Installation
 
 ```bash
-pip install evalscope[framework]
+pip install evalscope[framework] -U
+pip install vllm -U
 ```
 
 ### Task configuration
@@ -24,53 +28,79 @@ task_cfg = dict(stage=['infer', 'eval_l', 'eval_q'],
                 model='ZhipuAI/LongWriter-glm4-9b',
                 input_data_path=None,
                 output_dir='./outputs',
-
-
-
-
-
-
-
-
-
-
+                infer_config={
+                    'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
+                    'is_chat': True,
+                    'verbose': False,
+                    'generation_kwargs': {
+                        'max_new_tokens': 32768,
+                        'temperature': 0.5,
+                        'repetition_penalty': 1.0
+                    },
+                    'proc_num': 16,
                 },
-
+                eval_config={
+                    # No need to set OpenAI info if skipping the stage `eval_q`
+                    'openai_api_key': None,
+                    'openai_api_base': 'https://api.openai.com/v1/chat/completions',
+                    'openai_gpt_model': 'gpt-4o-2024-05-13',
+                    'generation_kwargs': {
+                        'max_new_tokens': 1024,
+                        'temperature': 0.5,
+                        'stop': None
+                    },
+                    'proc_num': 8
+                }
+                )
 
 ```
 - Arguments:
-  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process.
-  - `model`: model id on the ModelScope hub, or local model dir.
+  - `stage`: To run multiple stages, `infer`--run the inference process. `eval_l`--run eval length process. `eval_q`--run eval quality process with the model-as-judge.
+  - `model`: model id on the ModelScope hub, or local model dir. Refer to [LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary) for more details.
   - `input_data_path`: input data path, default to `None`, it means to use [longbench_write](resources/longbench_write.jsonl)
   - `output_dir`: output root directory.
   - `openai_api_key`: openai_api_key when enabling the stage `eval_q` to use `Model-as-Judge`. Default to None if not needed.
   - `openai_gpt_model`: Judge model name from OpenAI. Default to `gpt-4o-2024-05-13`
-  - `
-  - `
-  - `proc_num`: proc num.
+  - `generation_kwargs`: The generation configs.
+  - `proc_num`: process number for inference and evaluation.
 
 
 2. Configuration with json (Optional):
 
 ```json
 {
-    "stage": [
+    "stage": [
+        "infer",
+        "eval_l",
+        "eval_q"
+    ],
     "model": "ZhipuAI/LongWriter-glm4-9b",
     "input_data_path": null,
     "output_dir": "./outputs",
-
-
-
-
-
+    "infer_config": {
+        "openai_api_base": "http://127.0.0.1:8000/v1/chat/completions",
+        "is_chat": true,
+        "verbose": false,
+        "generation_kwargs": {
+            "max_new_tokens": 32768,
+            "temperature": 0.5,
+            "repetition_penalty": 1.0
+        },
+        "proc_num": 16
    },
-
-
-
-
-
-
+    "eval_config": {
+        "openai_api_key": null,
+        "openai_api_base": "https://api.openai.com/v1/chat/completions",
+        "openai_gpt_model": "gpt-4o-2024-05-13",
+        "generation_kwargs": {
+            "max_new_tokens": 1024,
+            "temperature": 0.5,
+            "stop": null
+        },
+        "proc_num": 8
+    }
 }
+
 ```
 Refer to [default_task.json](default_task.json) for more details.
 
@@ -82,24 +112,51 @@ stage:
   - infer
  - eval_l
   - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir: ./outputs
-
-
-
-
-
-
-
-
-
-
+output_dir: "./outputs"
+infer_config:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
+  is_chat: true
+  verbose: false
+  generation_kwargs:
+    max_new_tokens: 32768
+    temperature: 0.5
+    repetition_penalty: 1.0
+  proc_num: 16
+eval_config:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
+  generation_kwargs:
+    max_new_tokens: 1024
+    temperature: 0.5
+    stop: null
+  proc_num: 8
 
 ```
 Refer to [default_task.yaml](default_task.yaml) for more details.
 
 
+### Run Model Inference
+We recommend to use the [vLLM](https://github.com/vllm-project/vllm) to deploy the model.
+
+Environment:
+* A100(80G) x 1
+
+
+To start vLLM server, run the following command:
+```shell
+CUDA_VISIBLE_DEVICES=0 VLLM_USE_MODELSCOPE=True vllm serve --max-model-len=65536 --gpu_memory_utilization=0.95 --trust-remote-code ZhipuAI/LongWriter-glm4-9b
+```
+- Arguments:
+  - `max-model-len`: The maximum length of the model input.
+  - `gpu_memory_utilization`: The GPU memory utilization.
+  - `trust-remote-code`: Whether to trust the remote code.
+  - `model`: Could be a model id on the ModelScope/HuggingFace hub, or a local model dir.
+
+* Note: You can use multiple GPUs by setting `CUDA_VISIBLE_DEVICES=0,1,2,3` alternatively.
+
 
 
 ### Run the task
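To make the `infer_config` in the README above concrete, here is a hedged sketch of the kind of OpenAI-compatible chat request the inference stage can send to the locally served model. The exact payload built by `infer.py` is not shown in this diff, so the fields below are assumptions modeled on the configured `openai_api_base` and on the request shape visible in `eval.py` further down.

```python
# Assumed request shape for the vLLM endpoint configured above; infer.py's actual
# payload may differ (it is not part of this diff).
import requests

resp = requests.post(
    'http://127.0.0.1:8000/v1/chat/completions',  # openai_api_base from infer_config
    json={
        'model': 'ZhipuAI/LongWriter-glm4-9b',
        'messages': [{'role': 'user', 'content': 'Write a 1000-word blog post about tea culture.'}],
        'temperature': 0.5,
        'max_tokens': 32768,                      # mirrors max_new_tokens above
    },
    timeout=600)
resp.raise_for_status()
print(resp.json()['choices'][0]['message']['content'])
```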
evalscope/third_party/longbench_write/default_task.yaml
CHANGED
@@ -2,23 +2,24 @@ stage:
   - infer
   - eval_l
   - eval_q
-model: ZhipuAI/LongWriter-glm4-9b
+model: "ZhipuAI/LongWriter-glm4-9b"
 input_data_path: null
-output_dir:
+output_dir: "./outputs"
 infer_config:
-  openai_api_base:
+  openai_api_base: "http://127.0.0.1:8000/v1/chat/completions"
   is_chat: true
   verbose: false
   generation_kwargs:
     max_new_tokens: 32768
     temperature: 0.5
     repetition_penalty: 1.0
+  proc_num: 16
 eval_config:
-  openai_api_key:
-  openai_api_base:
-  openai_gpt_model:
+  openai_api_key: null
+  openai_api_base: "https://api.openai.com/v1/chat/completions"
+  openai_gpt_model: "gpt-4o-2024-05-13"
   generation_kwargs:
     max_new_tokens: 1024
     temperature: 0.5
     stop: null
-  proc_num:
+  proc_num: 8
evalscope/third_party/longbench_write/eval.py
CHANGED
@@ -1,19 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) ZhipuAI, Inc. and its affiliates.
-import multiprocessing
-import os
 import json
-import random
-import re
-from concurrent.futures import ThreadPoolExecutor
-
 import matplotlib.pyplot as plt
 import numpy as np
+import os
+import random
+import re
 import requests
+from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-from evalscope.utils import jsonl_to_list
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger, jsonl_to_list
 
 logger = get_logger()
 
@@ -52,14 +49,16 @@ class EvalLength:
         return 100 * max(0, 1. - (x / y - 1) / 2)
 
     def eval(self, dump_res: bool = True):
-        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        # example = {"prompt": "Write an outline for a short 100-word blog post about xxx",
+        #            "type": "Community Forum", "length": 100, "response_length": 103,
+        #            "response": "I. Introduction A. xxx"}
         predictions = [json.loads(line) for line in open(self.pred_path, encoding='utf-8')]
         x, y, scores = [], [], []
 
-        for pred in tqdm(predictions, total=len(predictions), desc=
-            x.append(pred[
-            y.append(pred[
-            scores.append(self.score(pred[
+        for pred in tqdm(predictions, total=len(predictions), desc='[Processing eval_l]'):
+            x.append(pred['length'])
+            y.append(pred['response_length'])
+            scores.append(self.score(pred['length'], pred['response_length']))
 
         avg_score_l = np.mean(scores)
         logger.info(f'Average score of length evaluation: {avg_score_l:.2f}')
@@ -105,7 +104,7 @@ class EvalQuality:
 
     EVAL_Q = 'eval_quality'
     OPENAI_BASE_URL = 'https://api.openai.com/v1/chat/completions'
-    DIMS = [
+    DIMS = ['Relevance', 'Accuracy', 'Coherence', 'Clarity', 'Breadth and Depth', 'Reading Experience']
 
     def __init__(self,
                  model: str,
@@ -144,7 +143,8 @@ class EvalQuality:
 
         self.openai_api_key: str = openai_api_key
         self.openai_gpt_model = openai_gpt_model
-
+        if not self.openai_api_key:
+            logger.error('Please set `OPENAI_API_KEY` in the envs when stage `eval_q` is activated!')
 
     def get_response_gpt4(self, prompt, temperature=0.5, max_new_tokens=1024, stop=None):
         tries = 0
@@ -152,17 +152,17 @@ class EvalQuality:
             tries += 1
             try:
                 headers = {
-                    'Authorization':
+                    'Authorization': 'Bearer {}'.format(self.openai_api_key),
                 }
                 messages = [
                     {'role': 'user', 'content': prompt},
                 ]
                 resp = requests.post(self.openai_api_base, json={
-
-
-
-
-
+                    'model': self.openai_gpt_model,
+                    'messages': messages,
+                    'temperature': temperature,
+                    'max_tokens': max_new_tokens,
+                    'stop': stop,
                 }, headers=headers, timeout=600)
                 if resp.status_code != 200:
                     raise Exception(resp.text)
@@ -172,16 +172,16 @@ class EvalQuality:
             except KeyboardInterrupt as e:
                 raise e
             except Exception as e:
-                if
+                if 'maximum context length' in str(e):
                     raise e
-                elif
+                elif 'triggering' in str(e):
                     return 'Trigger OpenAI\'s content management policy'
                 logger.error("Error Occurs: \"%s\" Retry ..." % (str(e)))
         else:
-            logger.error(
-            return
+            logger.error('Max tries. Failed.')
+            return 'Max tries. Failed.'
         try:
-            return resp[
+            return resp['choices'][0]['message']['content']
         except:
             return ''
 
@@ -195,7 +195,7 @@ class EvalQuality:
 
     def process_data(self, item):
         # for item in tqdm(items, total=len(items), desc=f'Process of eval_q: '):
-        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item[
+        prompt = self.prompt_template.replace('$INST$', item['prompt']).replace('$RESPONSE$', item['response'])
         scores = None
         output = self.get_response_gpt4(prompt, **self.generation_kwargs)
         try:
@@ -235,7 +235,8 @@ class EvalQuality:
         total_score = dict()
         for dim in self.DIMS:
             # scores = [float(score[dim]) if dim in score else 3 for score in self.eval_scores]
-            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+            scores = [float(item['scores'][dim]) if 'scores' in item and dim in item['scores']
+                      else 3 for item in self.eval_scores]
             total_score[dim] = ((sum(scores) / len(scores)) - 1) * 25
         total_score['total'] = sum(total_score.values()) / len(total_score)
         logger.info(f'Total score of quality evaluation: {total_score["total"]:.2f}')
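
As a quick sanity check on the scoring logic visible in the hunks above, a hedged sketch that restates the two formulas exactly as shown: the length-score branch `100 * max(0, 1. - (x / y - 1) / 2)` from `EvalLength.score` (other branches, if any, are not part of this diff) and the per-dimension quality rescaling `((mean - 1) * 25)` from `EvalQuality`. The sample numbers are invented.

```python
# Sketch only: restates the formulas shown in the eval.py hunks; sample inputs are made up.

def length_score_branch(x: float, y: float) -> float:
    # Return statement shown in EvalLength.score: x is the requested length,
    # y is the produced response length (per the eval() loop above).
    return 100 * max(0, 1. - (x / y - 1) / 2)

def quality_dim_score(judge_scores: list) -> float:
    # EvalQuality: a 1-5 judge rating per dimension, rescaled to 0-100
    # (missing scores default to 3 in the hunk above).
    return (sum(judge_scores) / len(judge_scores) - 1) * 25

print(length_score_branch(1000, 800))  # 87.5: response shorter than requested
print(quality_dim_score([4, 5, 3]))    # 75.0
```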