evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/third_party/longbench_write/infer.py

@@ -1,18 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) ZhipuAI, Inc. and its affiliates.
 
-import os
 import json
-from typing import List
-
-import torch
 import numpy as np
+import os
 import random
-from modelscope import AutoTokenizer, AutoModelForCausalLM
-from tqdm import tqdm
+import torch
+from typing import List
 
-from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.models.api import OpenaiApi
+from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.utils import get_logger
 
 logger = get_logger()
@@ -25,39 +22,6 @@ Refer to https://github.com/THUDM/LongWriter for more details.
 """
 
 
-def get_pred(rank, world_size, data, path, max_new_tokens, temperature, tokenizer, fout):
-    device = torch.device(f'cuda:{rank}')
-    model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
-    model = model.eval()
-
-    for dt in tqdm(data, total=len(data), desc=f'Infer on rank-{rank}: '):
-        prompt = dt['prompt']
-        if "llama" in path.lower():
-            prompt = f"[INST]{prompt}[/INST]"
-            input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
-            context_length = input.input_ids.shape[-1]
-            output = model.generate(
-                **input,
-                max_new_tokens=max_new_tokens,
-                num_beams=1,
-                do_sample=True,
-                temperature=temperature,
-            )[0]
-            response = tokenizer.decode(output[context_length:], skip_special_tokens=True)
-        else:
-            response, history = model.chat(tokenizer, prompt, history=[], max_new_tokens=max_new_tokens,
-                                           temperature=temperature)
-        dt["response_length"], _ = count_words(response)
-        dt["response"] = response
-
-        logger.info(dt)
-
-        fout.write(json.dumps(dt, ensure_ascii=False) + '\n')
-        fout.flush()
-
-    logger.info(f'Successfully generated predictions for {len(data)} samples.')
-
-
 def seed_everything(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
@@ -68,69 +32,13 @@ def seed_everything(seed):
     torch.cuda.manual_seed_all(seed)
 
 
-# def run_infer(model: str,
-#               data_path: str,
-#               output_dir: str,
-#               generation_kwargs: dict = None,
-#               enable: bool = True, ):
-#     """
-#     Process inference for LongWriter model.
-#
-#     Args:
-#         model: The model id of the LongWriter model on ModelScope, or local model path.
-#         data_path: The path to the data file.
-#         output_dir: The output directory for the predictions.
-#         generation_kwargs: The generation arguments for the model.
-#             Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
-#         enable: Whether to run infer process.
-#     """
-#     model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
-#
-#     if not enable:
-#         logger.warning('*** Skip `infer` stage ***')
-#         return f'{model_id_path}/pred.jsonl'
-#
-#     seed_everything(42)
-#
-#     os.makedirs(model_id_path, exist_ok=True)
-#     fout = open(f'{model_id_path}/pred.jsonl', 'w', encoding='utf-8')
-#
-#     if generation_kwargs is None:
-#         generation_kwargs = dict({
-#             'max_new_tokens': 32768,
-#             'temperature': 0.5
-#         })
-#
-#     tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-#     world_size = torch.cuda.device_count()
-#
-#     logger.info(f'>>Input data path: {data_path}')
-#     with open(data_path, encoding='utf-8') as f:
-#         data = [json.loads(line) for line in f]
-#
-#     data_subsets = [data[i::world_size] for i in range(world_size)]
-#     processes = []
-#     for rank in range(world_size):
-#         p = mp.Process(target=get_pred,
-#                        args=(rank, world_size, data_subsets[rank], model, generation_kwargs.get('max_new_tokens'), generation_kwargs.get('temperature'), tokenizer, fout))
-#         p.start()
-#         processes.append(p)
-#
-#     for p in processes:
-#         p.join()
-#
-#     logger.info(f'Finish generating predictions for {model}.')
-#     logger.info(f'Predictions are saved in {model_id_path}/pred.jsonl.')
-#
-#     return f'{model_id_path}/pred.jsonl'
-
-
 def run_infer(model: str,
               data_path: str,
               output_dir: str,
               api_config: dict,
               generation_kwargs: dict = None,
-              enable: bool = True, ):
+              enable: bool = True,
+              proc_num: int = DEFAULT_PROC_NUM):
     """
     Process inference for LongWriter model.
 
@@ -147,6 +55,7 @@ def run_infer(model: str,
         generation_kwargs: The generation arguments for the model.
             Attributes: `max_new_tokens`: The maximum number of tokens to generate. `temperature`: The temperature
         enable: Whether to run infer process.
+        proc_num: calling OpenAI api service with proc_num
     """
     model_id_path: str = os.path.join(output_dir, model.strip(os.sep).replace(os.sep, '__'))
 
@@ -173,7 +82,8 @@ def run_infer(model: str,
 
     api_client = OpenaiApi(model=model,
                            openai_api_key=None,
-                           openai_api_base=api_config.get('openai_api_base', 'http://127.0.0.1:8000/v1/chat/completions'),
+                           openai_api_base=api_config.get('openai_api_base',
+                                                          'http://127.0.0.1:8000/v1/chat/completions'),
                            max_new_tokens=generation_kwargs.get('max_new_tokens', 4096),
                            temperature=generation_kwargs.get('temperature', 0.0),
                            repetition_penalty=generation_kwargs.get('repetition_penalty', 1.0),
@@ -181,9 +91,11 @@ def run_infer(model: str,
                            verbose=api_config.get('verbose', False),
                            )
 
-    # TODO: ONLY FOR TEST generate_simple
-    results: List[str] = api_client.generate_simple(inputs=[example['prompt'] for example in data_list])
-    assert len(results) == len(data_list), f'Error: The number of predictions {len(results)} is not equal to the number of inputs {len(data_list)}.'
+    # TODO: refine generate_simple
+    results: List[str] = api_client.generate_simple(inputs=[example['prompt'] for example in data_list],
+                                                    num_proc=proc_num)
+    assert len(results) == len(data_list), \
+        f'Error: The number of predictions {len(results)} is not equal to the number of inputs {len(data_list)}.'
     logger.info(f'Finish generating predictions with {len(data_list)} samples for {model}')
 
     # Outputs
@@ -191,8 +103,8 @@ def run_infer(model: str,
     output_pred_file: str = f'{model_id_path}/pred.jsonl'
     with open(output_pred_file, 'w', encoding='utf-8') as f:
         for dt, res in zip(data_list, results):
-            dt["response_length"], _ = count_words(res)
-            dt["response"] = res
+            dt['response_length'], _ = count_words(res)
+            dt['response'] = res
            f.write(json.dumps(dt, ensure_ascii=False) + '\n')
 
    logger.info(f'Predictions are saved in {output_pred_file}')
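
For orientation, here is a hedged sketch of how the new API-driven run_infer above might be invoked; the file paths, model id, and config values are placeholders rather than values taken from the package, and only parameter names visible in this diff are used.

# Sketch only: exercises the run_infer signature introduced in 0.8.0 (API client + proc_num).
# Placeholder values throughout; predictions are written to <output_dir>/<model>/pred.jsonl per the diff.
from evalscope.third_party.longbench_write.infer import run_infer

run_infer(
    model='ZhipuAI/LongWriter-glm4-9b',        # example model id or local path
    data_path='longbench_write.jsonl',         # hypothetical input file with one JSON object ('prompt') per line
    output_dir='outputs',
    api_config={
        'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',  # default shown in the diff
        'verbose': False,
    },
    generation_kwargs={'max_new_tokens': 4096, 'temperature': 0.0, 'repetition_penalty': 1.0},
    enable=True,
    proc_num=16,                               # new in 0.8.0: number of parallel calls to the OpenAI-compatible API
)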
evalscope/third_party/longbench_write/longbench_write.py

@@ -2,10 +2,9 @@
 import os
 from typing import Union
 
-from evalscope.third_party.longbench_write.infer import run_infer
 from evalscope.third_party.longbench_write.eval import run_eval
-from evalscope.utils import yaml_to_dict, json_to_dict
-from evalscope.utils import get_logger
+from evalscope.third_party.longbench_write.infer import run_infer
+from evalscope.utils import get_logger, json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
@@ -45,7 +44,8 @@ def run_task(task_cfg: Union[str, dict]):
                      verbose=infer_config.get('verbose', False),
                  ),
              generation_kwargs=infer_config.get('generation_kwargs'),
-             enable='infer' in stage)
+             enable='infer' in stage,
+             proc_num=infer_config.get('proc_num', 16))
 
    # Run eval process
    run_eval(model=model,
@@ -77,7 +77,7 @@ if __name__ == '__main__':
        },
 
        eval_config={
-            'openai_api_key': 'YOUR_OPENAI_API_KEY',
+            'openai_api_key': None,
            'openai_api_base': 'https://api.openai.com/v1/chat/completions',
            'openai_gpt_model': 'gpt-4o-2024-05-13',
            'generation_kwargs': {'max_new_tokens': 1024, 'temperature': 0.5, 'stop': None},
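
The run_task change above means inference parallelism can now be set from the task configuration. Below is a hedged sketch of the relevant configuration fragment: the infer_config/eval_config names and the keys shown come from this diff, while the surrounding structure and any omitted keys (model id, data path, output directory, stage list) are assumptions and may differ in the released code.

# Sketch only: fragment of a LongBench-Write task config consumed by run_task(); assumed layout.
task_cfg = {
    # ... model / data / output / stage settings elided ...
    'infer_config': {
        'generation_kwargs': {'max_new_tokens': 4096, 'temperature': 0.0, 'repetition_penalty': 1.0},
        'verbose': False,
        'proc_num': 16,  # new in 0.8.0: parallel OpenAI-compatible API calls, defaults to 16 in run_task
    },
    'eval_config': {
        'openai_api_key': None,
        'openai_api_base': 'https://api.openai.com/v1/chat/completions',
        'openai_gpt_model': 'gpt-4o-2024-05-13',
        'generation_kwargs': {'max_new_tokens': 1024, 'temperature': 0.5, 'stop': None},
    },
}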
evalscope/third_party/longbench_write/resources/judge.txt

@@ -28,4 +28,4 @@ $RESPONSE$
 
 </Response>
 
-Please evaluate the quality of the response. You must first provide a brief analysis of its quality, then give a comprehensive analysis with scores for each dimension. The output must strictly follow the JSON format: {"Analysis": ..., "Relevance": ..., "Accuracy": ..., "Coherence": ..., "Clarity": ..., "Breadth and Depth": ..., "Reading Experience": ...}. You do not need to consider whether the response meets the user's length requirements in your evaluation. Ensure that only one integer between 1 and 5 is output for each dimension score.
+Please evaluate the quality of the response. You must first provide a brief analysis of its quality, then give a comprehensive analysis with scores for each dimension. The output must strictly follow the JSON format: {"Analysis": ..., "Relevance": ..., "Accuracy": ..., "Coherence": ..., "Clarity": ..., "Breadth and Depth": ..., "Reading Experience": ...}. You do not need to consider whether the response meets the user's length requirements in your evaluation. Ensure that only one integer between 1 and 5 is output for each dimension score.
evalscope/third_party/longbench_write/tools/data_etl.py

@@ -1,16 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
 import os.path
-from typing import List
 import re
-import json
+from typing import List
 
 from evalscope.third_party.longbench_write.eval import EvalLength
-from evalscope.third_party.longbench_write.utils import count_words, chinese_to_arabic
+from evalscope.third_party.longbench_write.utils import chinese_to_arabic, count_words
 from evalscope.utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
-
 """
 This script is used to preprocess the dataset for the LongWriter.
 """
@@ -141,7 +140,7 @@ class DataETL:
         return out_file
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     # run `no_required_length`: got 1748 exampels left
 
     # Refer to: https://modelscope.cn/datasets/ZhipuAI/LongWriter-6k/files
evalscope/third_party/longbench_write/utils.py

@@ -29,7 +29,6 @@ def chinese_to_arabic(chinese_number: str) -> int:
         '七': 7,
         '八': 8,
         '九': 9,
-
         '俩': 2,
         '两': 2,
     }
evalscope/third_party/toolbench_static/eval.py

@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import json
+import os
 from dataclasses import dataclass
-
 from rouge import Rouge
-import os
 
 
 @dataclass
@@ -24,7 +23,7 @@ def run_eval(args: EvalArgs):
        return 0
    rouge = Rouge()
    rouge_score = rouge.get_scores(hyps=cand_list, refs=ref_list, avg=True)
-    rougel = rouge_score["rouge-l"]["f"]
+    rougel = rouge_score['rouge-l']['f']
    return rougel
 
 def evaluate_action_em(cand_list: list, ref_list: list):
@@ -97,8 +96,8 @@ def run_eval(args: EvalArgs):
        data = json.load(f)
 
    def parse_action(text):
-        action = "None"
-        action_input = "{}"
+        action = 'None'
+        action_input = '{}'
        if 'Action Input:' in text:
            input_idx = text.rindex('Action Input:')
            action_input = text[input_idx + len('Action Input:'):].strip()
@@ -117,24 +116,24 @@
 
    def parse_output(text):
        action, action_input = parse_action(text)
-        if action == "Finish":
+        if action == 'Finish':
            try:
                action_input = json.loads(action_input)
                # print(action_input)
                # print(json.dumps(action_input,indent=2))
-                return_type = action_input["return_type"]
-                if return_type == "give_answer":
-                    if "final_answer" in action_input.keys():
+                return_type = action_input['return_type']
+                if return_type == 'give_answer':
+                    if 'final_answer' in action_input.keys():
                        answer = str(action_input['final_answer'])
                        if answer.strip() in ['', '.', ',']:
-                            answer = "None"
+                            answer = 'None'
                    else:
-                        answer = "None"
-                    return "finish", action, action_input, answer
+                        answer = 'None'
+                    return 'finish', action, action_input, answer
                else:
-                    return "give up", None, None, None
+                    return 'give up', None, None, None
            except:
-                return "give up", None, None, None
+                return 'give up', None, None, None
        else:
            plan = 'call'
            answer = None
@@ -163,7 +162,7 @@ def run_eval(args: EvalArgs):
        # ref_ans: None
 
        pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
-        if ref_action is not None and ref_action == "invalid_hallucination_function_name":
+        if ref_action is not None and ref_action == 'invalid_hallucination_function_name':
            continue
        if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in d['tools']]:
            continue
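
To make the parsing behaviour above concrete, here is a small standalone sketch (not evalscope code) of the kind of ToolBench-style output the parse_action/parse_output pair splits apart; the sample string and the expected value are assumptions based only on the logic visible in this hunk.

# Standalone sketch mirroring the 'Finish' branch handled by parse_output above.
import json

text = 'Action: Finish\nAction Input: {"return_type": "give_answer", "final_answer": "Paris"}'

# Mirror of the visible parse_action logic: take everything after the last 'Action Input:' marker.
input_idx = text.rindex('Action Input:')
action_input = text[input_idx + len('Action Input:'):].strip()

payload = json.loads(action_input)
if payload['return_type'] == 'give_answer':
    answer = str(payload['final_answer'])  # -> 'Paris'; blank answers ('', '.', ',') are mapped to 'None'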
evalscope/third_party/toolbench_static/infer.py

@@ -16,13 +16,13 @@
 
 # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
 
-from dataclasses import dataclass, field
 import json
 import os
-from rouge import Rouge
+import requests
 import time
+from dataclasses import dataclass, field
+from rouge import Rouge
 from urllib3.exceptions import MaxRetryError, NewConnectionError
-import requests
 
 
 def evaluate_rouge_l(cand_list: list, ref_list: list):
@@ -30,7 +30,7 @@ def evaluate_rouge_l(cand_list: list, ref_list: list):
        return 0
    rouge = Rouge()
    rouge_score = rouge.get_scores(hyps=cand_list, refs=ref_list, avg=True)
-    rougel = rouge_score["rouge-l"]["f"]
+    rougel = rouge_score['rouge-l']['f']
    return rougel
 
 
@@ -42,8 +42,8 @@ def nested_load_test_data(data_path):
            test_raw_data += temp_test
        return test_raw_data
    elif os.path.isfile(data_path) and data_path.endswith('.json'):
-        print("Load data from", data_path)
-        temp_data = json.load(open(data_path, "r"))
+        print('Load data from', data_path)
+        temp_data = json.load(open(data_path, 'r'))
        test_raw_data = temp_data
        return test_raw_data
    else:
@@ -51,39 +51,24 @@ def nested_load_test_data(data_path):
 
 
 def baichuan_call(context: list, system: str):
-    url = "https://api.baichuan-ai.com/v1/chat/completions"
-    api_key = "sk-xxx"
+    url = 'https://api.baichuan-ai.com/v1/chat/completions'
+    api_key = 'sk-xxx'
 
    new_msg = []
-    new_msg.append({
-        "role": 'system',
-        'content': system})
+    new_msg.append({'role': 'system', 'content': system})
    for m in context:
-        if m['role'] == "user":
-            new_msg.append({
-                'role': 'user', 'content': m['content']
-            })
-        elif m['role'] == "function":
-            new_msg.append({
-                'role': 'user', 'content': m['content']
-            })
+        if m['role'] == 'user':
+            new_msg.append({'role': 'user', 'content': m['content']})
+        elif m['role'] == 'function':
+            new_msg.append({'role': 'user', 'content': m['content']})
        elif m['role'] == 'assistant':
-            new_msg.append({
-                'role': 'assistant', 'content': m['content']
-            })
+            new_msg.append({'role': 'assistant', 'content': m['content']})
    # print(json.dumps(new_msg, indent=2))
-    data = {
-        "model": "Baichuan2-Turbo",
-        "messages": new_msg,
-        "stream": False
-    }
+    data = {'model': 'Baichuan2-Turbo', 'messages': new_msg, 'stream': False}
 
    json_data = json.dumps(data)
 
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": "Bearer " + api_key
-    }
+    headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + api_key}
 
    for i in range(5):
        res = None
@@ -91,7 +76,7 @@ def baichuan_call(context: list, system: str):
            res = requests.post(url, data=json_data, headers=headers, timeout=60)
            res = res._content.decode('utf-8')
            res = json.loads(res)
-            return res["choices"][0]["message"]["content"]
+            return res['choices'][0]['message']['content']
        except KeyError:
            print(res)
            time.sleep(1)
@@ -105,57 +90,52 @@ def baichuan_call(context: list, system: str):
        except NewConnectionError:
            time.sleep(5)
            continue
-    return ""
+    return ''
 
 
 def minimax_call(context: list, system: str):
-    group_id = "your-id"
-    api_key = "your-xxx"
+    group_id = 'your-id'
+    api_key = 'your-xxx'
 
-    url = f"https://api.minimax.chat/v1/text/chatcompletion_pro?GroupId={group_id}"
-    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+    url = f'https://api.minimax.chat/v1/text/chatcompletion_pro?GroupId={group_id}'
+    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
 
    # construct message
-    system_prompt = "MM智能助理是一款由MiniMax自研的,没有调用其他产品的接口的大型语言模型。" \
-                    "MiniMax是一家中国科技公司,一直致力于进行大模型相关的研究。"
+    system_prompt = 'MM智能助理是一款由MiniMax自研的,没有调用其他产品的接口的大型语言模型。' \
+                    'MiniMax是一家中国科技公司,一直致力于进行大模型相关的研究。'
    system_prompt += ('\n' + system)
 
    new_msg = []
    for m in context:
-        if m['role'] == "user":
-            new_msg.append({
-                'sender_type': 'USER', 'sender_name': 'user', 'text': m['content']
-            })
-        elif m['role'] == "function":
-            new_msg.append({
-                'sender_type': 'USER', 'sender_name': 'funtion', 'text': m['content']
-            })
+        if m['role'] == 'user':
+            new_msg.append({'sender_type': 'USER', 'sender_name': 'user', 'text': m['content']})
+        elif m['role'] == 'function':
+            new_msg.append({'sender_type': 'USER', 'sender_name': 'funtion', 'text': m['content']})
        elif m['role'] == 'assistant':
-            new_msg.append({
-                'sender_type': 'BOT', 'sender_name': 'MM智能助理', 'text': m['content']
-            })
+            new_msg.append({'sender_type': 'BOT', 'sender_name': 'MM智能助理', 'text': m['content']})
 
    request_body = {
-        "model": "abab6-chat",
+        'model': 'abab6-chat',
        # "model": "abab5.5s-chat",
-        "tokens_to_generate": 8192,
-        "reply_constraints": {"sender_type": "BOT", "sender_name": "MM智能助理"},
-        "messages": new_msg,
-        "bot_setting": [
-            {
-                "bot_name": "MM智能助理",
-                "content": system_prompt,
-            }
-        ],
+        'tokens_to_generate': 8192,
+        'reply_constraints': {
+            'sender_type': 'BOT',
+            'sender_name': 'MM智能助理'
+        },
+        'messages': new_msg,
+        'bot_setting': [{
+            'bot_name': 'MM智能助理',
+            'content': system_prompt,
+        }],
    }
    response = requests.post(url, headers=headers, json=request_body)
    status_code = response.status_code
    for i in range(5):
        try:
            if status_code == 200:
-                reply = response.json()["reply"]
+                reply = response.json()['reply']
                if len(reply) == 0:
-                    print("limit rate")
+                    print('limit rate')
                    time.sleep(8)
                    continue
                print(f'>>return: {reply}')
@@ -167,12 +147,12 @@ def minimax_call(context: list, system: str):
            print(response)
            time.sleep(5)
            continue
-    return ""
+    return ''
 
 
 def swift_call(context: list, system: str, swift_infer_obj):
    query_d: dict = context[-1]
-    history_list = context[: -1]
+    history_list = context[:-1]
 
    query: str = query_d['content']
    history_msg = []
@@ -211,9 +191,8 @@ def run_infer(args: InferArgs):
 
    if args.deploy_type == 'swift':
        from evalscope.third_party.toolbench_static.llm.swift_infer import SwiftInfer, SwiftInferArgs
-        swift_infer_args = SwiftInferArgs(model_id_or_path=args.model_name_or_path,
-                                          model_type=args.model_type,
-                                          max_new_tokens=args.max_new_tokens)
+        swift_infer_args = SwiftInferArgs(
+            model_id_or_path=args.model_name_or_path, model_type=args.model_type, max_new_tokens=args.max_new_tokens)
        swift_infer = SwiftInfer(args=swift_infer_args)
    else:
        swift_infer = None
@@ -232,7 +211,7 @@ def run_infer(args: InferArgs):
    preds = []
    refs = []
    for i, o in enumerate(infer_samples):
-        if i < len(processed_samples) and "predictions" in processed_samples[i].keys():
+        if i < len(processed_samples) and 'predictions' in processed_samples[i].keys():
            infer_samples[i]['predictions'] = processed_samples[i]['predictions']
            refs.append(processed_samples[i]['target'])
            preds.append(processed_samples[i]['predictions'])
@@ -267,7 +246,7 @@ def run_infer(args: InferArgs):
        reference = infer_samples[i]['target']
        infer_samples[i]['predictions'] = candidate
        if reference.strip() in ['', '.', ',']:
-            reference = "none"
+            reference = 'none'
        refs.append(reference)
        preds.append(candidate)
 
evalscope/third_party/toolbench_static/llm/swift_infer.py

@@ -1,9 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from dataclasses import dataclass
-
-from swift.llm import (
-    get_model_tokenizer, get_template, inference, get_default_template_type,
-)
+from swift.llm import get_default_template_type, get_model_tokenizer, get_template, inference
 from swift.utils import seed_everything
 
 # TODO: Support custom model for swift infer
@@ -21,9 +18,8 @@ class SwiftInfer:
    def __init__(self, args: SwiftInferArgs):
        model_type = args.model_type
        template_type = get_default_template_type(model_type)
-        model, tokenizer = get_model_tokenizer(model_type,
-                                               model_id_or_path=args.model_id_or_path,
-                                               model_kwargs={'device_map': 'auto'})
+        model, tokenizer = get_model_tokenizer(
+            model_type, model_id_or_path=args.model_id_or_path, model_kwargs={'device_map': 'auto'})
        model.generation_config.max_new_tokens = args.max_new_tokens
        print(f'** Generation config: {model.generation_config}')
 
@@ -36,10 +32,6 @@ class SwiftInfer:
 
    def predict(self, system: str, query: str, history: list):
 
-        response, history = inference(self.model,
-                                      self.template,
-                                      query=query,
-                                      system=system,
-                                      history=history)
+        response, history = inference(self.model, self.template, query=query, system=system, history=history)
 
        return response
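
For quick orientation on the wrapper refactored above, here is a hedged usage sketch of SwiftInfer; only the class, dataclass fields, and predict() parameters visible in this diff are used, and the concrete argument values are placeholders, not prescribed by the package.

# Sketch only: drives the ms-swift-based inference wrapper with placeholder values.
from evalscope.third_party.toolbench_static.llm.swift_infer import SwiftInfer, SwiftInferArgs

args = SwiftInferArgs(
    model_id_or_path='qwen/Qwen2-7B-Instruct',  # example model id, assumption
    model_type='qwen2-7b-instruct',             # ms-swift model_type string, assumption
    max_new_tokens=512,
)
infer = SwiftInfer(args=args)
response = infer.predict(system='You are a helpful assistant.',
                         query='What is 2 + 2?',
                         history=[])
print(response)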
evalscope/third_party/toolbench_static/requirements.txt

@@ -1,2 +1,2 @@
 ms-swift>=2.1.0
-rouge
+rouge
evalscope/third_party/toolbench_static/toolbench_static.py

@@ -1,11 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os
-from typing import Union
 from copy import deepcopy
+from typing import Union
 
-from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
-from evalscope.utils import yaml_to_dict, get_logger, json_to_dict
+from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
+from evalscope.utils import get_logger, json_to_dict, yaml_to_dict
 
 logger = get_logger()
 