evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/metrics.py
CHANGED

@@ -2,18 +2,17 @@
 # Copyright (c) EleutherAI. and its affiliates.
 # Copyright (c) OpenAI. and its affiliates.
 import itertools
-import math
-from collections.abc import Iterable
-from collections import defaultdict
-from typing import Dict, List, Union
-from nltk.translate.bleu_score import sentence_bleu
-from nltk import word_tokenize
 import jieba
-
+import math
 import numpy as np
+import random
 import sacrebleu
 import sklearn.metrics
-import random
+from collections import defaultdict
+from collections.abc import Iterable
+from nltk import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from typing import Dict, List, Union


 def mean(arr):

@@ -22,12 +21,12 @@ def mean(arr):

 def pop_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / len(arr))


 def sample_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / (len(arr) - 1))


 def mean_stderr(arr):

@@ -134,13 +133,14 @@ def bleu(items):
     refs, preds = _sacreformat(refs, preds)
     return sacrebleu.corpus_bleu(preds, refs).score

+
 def bleu_ngram_one_sample(predict, reference):
     """
     Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores

     Args:
         items: [(ref, pred)]
-
+
     Returns:
         {
             'bleu-1': 0.8,

@@ -150,6 +150,7 @@ def bleu_ngram_one_sample(predict, reference):
         }

     """
+
     def is_contains_chinese(strs):
         for _char in strs:
             if '\u4e00' <= _char <= '\u9fa5':

@@ -230,6 +231,7 @@ def _sacreformat(refs, preds):


 class _bootstrap_internal:
+
     def __init__(self, f, n):
         self.f = f
         self.n = n

@@ -260,11 +262,11 @@ def bootstrap_stderr(f, xs, iters):

     print('bootstrapping for stddev:', f.__name__)
     for bootstrap in tqdm(
-        pool.imap(
-            _bootstrap_internal(f, chunk_size),
-            [(i, xs) for i in range(iters // chunk_size)],
-        ),
-        total=iters // chunk_size,
+            pool.imap(
+                _bootstrap_internal(f, chunk_size),
+                [(i, xs) for i in range(iters // chunk_size)],
+            ),
+            total=iters // chunk_size,
     ):
         # sample w replacement
         res.extend(bootstrap)

@@ -372,11 +374,9 @@ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[s
     return score / len(question_answers)


-def calculate_pass_at_k(
-    num_samples: Union[int, List[int], np.ndarray],
-    num_correct: Union[List[int], np.ndarray],
-    k: int = 1
-) -> np.ndarray:
+def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
+                        num_correct: Union[List[int], np.ndarray],
+                        k: int = 1) -> np.ndarray:
     """
     Estimates pass@k of each problem and returns them in an array.
     Examples:
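Note on the reflowed `calculate_pass_at_k`: the signature matches the unbiased pass@k estimator popularized by OpenAI's HumanEval work (the file carries EleutherAI and OpenAI copyright headers). A minimal sketch of that estimator, assuming the standard numerically stable product form of 1 - C(n - c, k) / C(n, k); this is an illustration, not necessarily the exact body shipped in 0.8.0:

import itertools
from typing import List, Union

import numpy as np


def pass_at_k(num_samples: Union[int, List[int], np.ndarray],
              num_correct: Union[List[int], np.ndarray],
              k: int = 1) -> np.ndarray:
    """Unbiased per-problem pass@k: 1 - C(n - c, k) / C(n, k)."""

    def estimator(n: int, c: int, k: int) -> float:
        # With fewer than k incorrect samples, every size-k draw contains a pass.
        if n - c < k:
            return 1.0
        # Numerically stable product form of 1 - C(n - c, k) / C(n, k).
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

For example, `pass_at_k(10, [3, 0, 10], k=1)` returns approximately `[0.3, 0.0, 1.0]`.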
evalscope/metrics/rouge_metric.py
CHANGED

@@ -1,18 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import jieba
 import logging
 from collections import defaultdict
 from pathlib import Path
+from rouge_chinese import Rouge
 from statistics import mean
-
 from tqdm import tqdm

 from evalscope.constants import MetricsConstant
 from evalscope.metrics.bundled_rouge_score import rouge_scorer

-from rouge_chinese import Rouge
-import jieba
-

 class DummyTokenizer:


@@ -24,9 +22,7 @@ HERE = Path(__file__).absolute().parent

 logger = logging.getLogger(__name__)

-scorer = rouge_scorer.RougeScorer(
-    ['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer()
-)
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
 zh_scorer = Rouge()


@@ -52,11 +48,7 @@ def compute_rouge_score(predict_l, reference_l):
             result[rouge_key].append(one_sample[rouge_key])
     rlt = {}
     for rouge_key in MetricsConstant.ROUGE_KEYS:
-        rlt[rouge_key] = (
-            mean(result[rouge_key]) * 100
-            if rouge_key in result
-            else MetricsConstant.INVALID_VALUE
-        )
+        rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
     return rlt


@@ -111,9 +103,9 @@ def _to_table(final_result) -> str:
             if not task:
                 continue
             elif task == 'total':
-                row.append(f'{final_result["total"]["rouge"][rouge_key]
+                row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
             else:
-                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]
+                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
         table.append('\t'.join(row))

     return '\n'.join(table)

@@ -122,23 +114,17 @@ def _to_table(final_result) -> str:
 def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
     print(f"{'#' * md_level} Rouge Eval")
     for data in tqdm(data_l):
-        data['rouge'] = compute_rouge_score_one_sample(
-            data['gen_tok_str'], data['reference_tok_str']
-        )
+        data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
             task_data_d[task].append(data)

     total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
-    print(
-        f'[total], count: {len(data_l)}, {report_metric_key}: '
-        f'{total_rouge * 100:0.2f}%'
-    )
+    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
+          f'{total_rouge * 100:0.2f}%')

     for task, task_data in task_data_d.items():
         task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
-        print(
-            f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
-            f'{task_rouge * 100:0.2f}%'
-        )
+        print(f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
+              f'{task_rouge * 100:0.2f}%')
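Note on the consolidated scorers: the module keeps two ROUGE backends, the bundled Google rouge_scorer for English and rouge_chinese (fed jieba-segmented text) for Chinese. A short usage sketch of how those two libraries are typically called; the evalscope wrapper around them (`compute_rouge_score_one_sample`) is not shown in this diff, so this is an illustration only:

import jieba
from rouge_chinese import Rouge
from rouge_score import rouge_scorer  # evalscope ships a bundled copy of this module

# English ROUGE: the default tokenizer handles plain English text.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
en = scorer.score('the cat sat on the mat', 'a cat sat on a mat')
print(f"{en['rougeL'].fmeasure:0.2f}")

# Chinese ROUGE: rouge_chinese expects whitespace-separated tokens,
# so each side is segmented with jieba first.
zh_scorer = Rouge()
hyp = ' '.join(jieba.cut('今天天气很好'))
ref = ' '.join(jieba.cut('今天天气不错'))
zh = zh_scorer.get_scores(hyp, ref)
print(f"{zh[0]['rouge-l']['f']:0.2f}")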
evalscope/models/api/openai_api.py
CHANGED

@@ -1,34 +1,36 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 import json
+import requests
 import threading
 import time
 from asyncio import Queue
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from typing import Dict, List, Optional, Union

-import requests
-from typing import Union, List, Optional, Dict
-from concurrent.futures import ThreadPoolExecutor
-from modelscope.utils.logger import get_logger
+from evalscope.utils.logger import get_logger

 logger = get_logger()


 class OpenaiApi:

-    def __init__(self,
-                 model: str,
-                 openai_api_key,
-                 openai_api_base,
-                 logprobs: Optional[bool] = False,
-                 top_logprobs: Optional[int] = None,
-                 max_new_tokens: int = 4096,
-                 temperature: Optional[float] = 0.0,
-                 repetition_penalty: Optional[float] = 1.0,
-                 is_chat: bool = True,
-                 verbose: bool = True,
-                 retry: int = 3,
-                 query_per_second: int = 10,  # TODO
-                 **kwargs):
+    def __init__(
+            self,
+            model: str,
+            openai_api_key,
+            openai_api_base,
+            logprobs: Optional[bool] = False,
+            top_logprobs: Optional[int] = None,
+            max_new_tokens: int = 4096,
+            temperature: Optional[float] = 0.0,
+            repetition_penalty: Optional[float] = 1.0,
+            is_chat: bool = True,
+            verbose: bool = True,
+            retry: int = 3,
+            query_per_second: int = 10,  # TODO
+            **kwargs):

         self.temperature = temperature
         self.repetition_penalty = repetition_penalty

@@ -45,14 +47,17 @@ class OpenaiApi:

         self.token_bucket = TokenBucket(query_per_second, verbose)

-    def generate_simple(self, inputs: Union[List[str]]):
+    def generate_simple(self, inputs: Union[List[str]], num_proc: int = 8):

         def process_one(in_data: str):

             if self.is_chat:
                 data = dict(
                     model=self.model,
-                    messages=[{'role': 'user', 'content': in_data}],
+                    messages=[{
+                        'role': 'user',
+                        'content': in_data
+                    }],
                     max_tokens=self.max_tokens,
                     n=1,
                     logprobs=self.logprobs,

@@ -72,7 +77,10 @@ class OpenaiApi:

             # todo
             openai_api_key = self.openai_api_key or ''
-            header = {'Authorization': f'Bearer {openai_api_key}', 'content-type': 'application/json'}
+            header = {
+                'Authorization': f'Bearer {openai_api_key}',
+                'content-type': 'application/json',
+            }
             data = json.dumps(data, ensure_ascii=False)

             if self.verbose:

@@ -91,14 +99,18 @@
             else:
                 return resp['choices'][0]['text'].strip()

-        with ThreadPoolExecutor() as executor:
-            results = list(executor.map(process_one, inputs))
+        results = []
+        with ThreadPoolExecutor(max_workers=num_proc) as executor:
+            # Submit all tasks
+            future_to_task = {executor.submit(process_one, input_one): input_one for input_one in inputs}
+
+            # Show progress bar
+            for future in tqdm(as_completed(future_to_task), total=len(inputs)):
+                results.append(future.result())

         return results

-    def generate(self,
-                 inputs: Union[List[str], List[List]],
-                 **kwargs) -> List[str]:
+    def generate(self, inputs: Union[List[str], List[List]], **kwargs) -> List[str]:
         """
         Generate responses from OpenAI API.


@@ -160,13 +172,12 @@ class OpenaiApi:

         def remove_none_val(input_d: dict):
             return {k: v for k, v in input_d.items() if v is not None}
+
         data = remove_none_val(data)

         if self.verbose:
             logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
-        raw_response = requests.post(self.url,
-                                     headers=header,
-                                     data=json.dumps(data, ensure_ascii=False))
+        raw_response = requests.post(self.url, headers=header, data=json.dumps(data, ensure_ascii=False))

         response = raw_response.json()
         if self.verbose:
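Note on the reworked `generate_simple`: replacing a blocking map with `submit` plus `as_completed` lets a tqdm bar advance as each request returns, but `as_completed` yields futures in completion order, not submission order. A small sketch of the same pattern that additionally preserves input order by keying futures to their index; `process_one` here is a hypothetical stand-in for the per-request function:

from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def process_one(item: str) -> str:
    # Hypothetical stand-in for the per-request API call.
    return item.upper()


inputs = ['a', 'b', 'c']
results = [None] * len(inputs)

with ThreadPoolExecutor(max_workers=8) as executor:
    # Key each future by its input index so results can be realigned.
    future_to_index = {executor.submit(process_one, x): i for i, x in enumerate(inputs)}
    for future in tqdm(as_completed(future_to_index), total=len(inputs)):
        results[future_to_index[future]] = future.result()

print(results)  # ['A', 'B', 'C'], aligned with the input order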
evalscope/models/custom/custom_model.py
CHANGED

@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from abc import ABC, abstractmethod
-from typing import Any, Union, Dict, List
 import torch
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union


 class CustomModel(ABC):

@@ -11,7 +11,7 @@ class CustomModel(ABC):
         self.kwargs = kwargs

         if config.get('model_id', None) is None:
-            raise ValueError(f
+            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')

     @abstractmethod
     @torch.no_grad()
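Note on `CustomModel`: it is the abstract hook for plugging user models into evalscope; the constructor requires `model_id` in `config`, and subclasses implement the abstract, `torch.no_grad`-wrapped method. A hedged sketch of a subclass: the abstract method's real name and return shape are not visible in this hunk, so `predict` and the echoed payload below are assumptions:

from typing import Any, Dict, List

from evalscope.models.custom import CustomModel  # import path assumed


class EchoModel(CustomModel):

    def __init__(self, **kwargs):
        # model_id is required; __init__ raises ValueError without it
        # (constructor assumed to accept a config dict).
        super().__init__(config={'model_id': 'echo-model'}, **kwargs)

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        # 'predict' is an assumed name for the abstract method; it simply
        # echoes each prompt back in a chat.completion-like shape.
        return [{'choices': [{'message': {'content': p, 'role': 'assistant'}}]} for p in prompts]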
evalscope/models/dummy_chat_model.py
CHANGED

@@ -2,6 +2,7 @@

 import random
 import time
+
 from evalscope.models import ChatBaseModel
 from evalscope.utils.logger import get_logger


@@ -32,15 +33,13 @@ class DummyChatModel(ChatBaseModel):

         # Build response
         res = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        'content': choice,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': choice,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.MODEL_ID + '-' + self.REVISION,
             'object': 'chat.completion',