evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/metrics.py
CHANGED

@@ -2,18 +2,17 @@
 # Copyright (c) EleutherAI. and its affiliates.
 # Copyright (c) OpenAI. and its affiliates.
 import itertools
-import math
-from collections.abc import Iterable
-from collections import defaultdict
-from typing import Dict, List, Union
-from nltk.translate.bleu_score import sentence_bleu
-from nltk import word_tokenize
 import jieba
-
+import math
 import numpy as np
+import random
 import sacrebleu
 import sklearn.metrics
-import random
+from collections import defaultdict
+from collections.abc import Iterable
+from nltk import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from typing import Dict, List, Union


 def mean(arr):

@@ -22,12 +21,12 @@ def mean(arr):

 def pop_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / len(arr))


 def sample_stddev(arr):
     mu = mean(arr)
-    return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
+    return math.sqrt(sum([(x - mu)**2 for x in arr]) / (len(arr) - 1))


 def mean_stderr(arr):

@@ -134,13 +133,14 @@ def bleu(items):
     refs, preds = _sacreformat(refs, preds)
     return sacrebleu.corpus_bleu(preds, refs).score

+
 def bleu_ngram_one_sample(predict, reference):
     """
     Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores

     Args:
         items: [(ref, pred)]
-
+
     Returns:
         {
             'bleu-1': 0.8,

@@ -150,6 +150,7 @@ def bleu_ngram_one_sample(predict, reference):
         }

     """
+
     def is_contains_chinese(strs):
         for _char in strs:
             if '\u4e00' <= _char <= '\u9fa5':

@@ -230,6 +231,7 @@ def _sacreformat(refs, preds):


 class _bootstrap_internal:
+
     def __init__(self, f, n):
         self.f = f
         self.n = n

@@ -260,11 +262,11 @@ def bootstrap_stderr(f, xs, iters):

     print('bootstrapping for stddev:', f.__name__)
     for bootstrap in tqdm(
-        pool.imap(
-            _bootstrap_internal(f, chunk_size),
-            [(i, xs) for i in range(iters // chunk_size)],
-        ),
-        total=iters // chunk_size,
+            pool.imap(
+                _bootstrap_internal(f, chunk_size),
+                [(i, xs) for i in range(iters // chunk_size)],
+            ),
+            total=iters // chunk_size,
     ):
         # sample w replacement
         res.extend(bootstrap)

@@ -372,11 +374,9 @@ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[s
     return score / len(question_answers)


-def calculate_pass_at_k(
-    num_samples: Union[int, List[int], np.ndarray],
-    num_correct: Union[List[int], np.ndarray],
-    k: int = 1
-) -> np.ndarray:
+def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
+                        num_correct: Union[List[int], np.ndarray],
+                        k: int = 1) -> np.ndarray:
     """
     Estimates pass@k of each problem and returns them in an array.
     Examples:
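Note on the reflowed `calculate_pass_at_k`: the signature matches the unbiased pass@k estimator popularized by OpenAI's HumanEval work (the file carries EleutherAI and OpenAI copyright headers). A minimal sketch of that estimator, assuming the standard numerically stable product form of 1 - C(n - c, k) / C(n, k); this is an illustration, not necessarily the exact body shipped in 0.8.0:

import itertools
from typing import List, Union

import numpy as np


def pass_at_k(num_samples: Union[int, List[int], np.ndarray],
              num_correct: Union[List[int], np.ndarray],
              k: int = 1) -> np.ndarray:
    """Unbiased per-problem pass@k: 1 - C(n - c, k) / C(n, k)."""

    def estimator(n: int, c: int, k: int) -> float:
        # With fewer than k incorrect samples, every size-k draw contains a pass.
        if n - c < k:
            return 1.0
        # Numerically stable product form of 1 - C(n - c, k) / C(n, k).
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

For example, `pass_at_k(10, [3, 0, 10], k=1)` returns approximately `[0.3, 0.0, 1.0]`.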
evalscope/metrics/rouge_metric.py
CHANGED

@@ -1,18 +1,16 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

+import jieba
 import logging
 from collections import defaultdict
 from pathlib import Path
+from rouge_chinese import Rouge
 from statistics import mean
-
 from tqdm import tqdm

 from evalscope.constants import MetricsConstant
 from evalscope.metrics.bundled_rouge_score import rouge_scorer

-from rouge_chinese import Rouge
-import jieba
-

 class DummyTokenizer:


@@ -24,9 +22,7 @@ HERE = Path(__file__).absolute().parent

 logger = logging.getLogger(__name__)

-scorer = rouge_scorer.RougeScorer(
-    ['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer()
-)
+scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
 zh_scorer = Rouge()


@@ -52,11 +48,7 @@ def compute_rouge_score(predict_l, reference_l):
             result[rouge_key].append(one_sample[rouge_key])
     rlt = {}
     for rouge_key in MetricsConstant.ROUGE_KEYS:
-        rlt[rouge_key] = (
-            mean(result[rouge_key]) * 100
-            if rouge_key in result
-            else MetricsConstant.INVALID_VALUE
-        )
+        rlt[rouge_key] = (mean(result[rouge_key]) * 100 if rouge_key in result else MetricsConstant.INVALID_VALUE)
     return rlt


@@ -111,9 +103,9 @@ def _to_table(final_result) -> str:
             if not task:
                 continue
             elif task == 'total':
-                row.append(f'{final_result["total"]["rouge"][rouge_key]
+                row.append(f'{final_result["total"]["rouge"][rouge_key]:0.2f}')
             else:
-                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]
+                row.append(f'{final_result["tasks"][task]["rouge"][rouge_key]:0.2f}')
         table.append('\t'.join(row))

     return '\n'.join(table)

@@ -122,23 +114,17 @@ def _to_table(final_result) -> str:
 def run_rouge_eval(data_l, md_level=2, report_metric_key='rouge-l-f'):
     print(f"{'#' * md_level} Rouge Eval")
     for data in tqdm(data_l):
-        data['rouge'] = compute_rouge_score_one_sample(
-            data['gen_tok_str'], data['reference_tok_str']
-        )
+        data['rouge'] = compute_rouge_score_one_sample(data['gen_tok_str'], data['reference_tok_str'])
     task_data_d = defaultdict(list)
     for data in data_l:
         for task in data['task_tags']:
             task_data_d[task].append(data)

     total_rouge = mean([data['rouge'][report_metric_key] for data in data_l])
-    print(
-        f'[total], count: {len(data_l)}, {report_metric_key}: '
-        f'{total_rouge * 100:0.2f}%'
-    )
+    print(f'[total], count: {len(data_l)}, {report_metric_key}: '
+          f'{total_rouge * 100:0.2f}%')

     for task, task_data in task_data_d.items():
         task_rouge = mean([data['rouge'][report_metric_key] for data in task_data])
-        print(
-            f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
-            f'{task_rouge * 100:0.2f}%'
-        )
+        print(f'[{task}], count: {len(task_data_d[task])}, {report_metric_key}: '
+              f'{task_rouge * 100:0.2f}%')
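Note on the consolidated scorers: the module keeps two ROUGE backends, the bundled Google rouge_scorer for English and rouge_chinese (fed jieba-segmented text) for Chinese. A short usage sketch of how those two libraries are typically called; the evalscope wrapper around them (`compute_rouge_score_one_sample`) is not shown in this diff, so this is an illustration only:

import jieba
from rouge_chinese import Rouge
from rouge_score import rouge_scorer  # evalscope ships a bundled copy of this module

# English ROUGE: the default tokenizer handles plain English text.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
en = scorer.score('the cat sat on the mat', 'a cat sat on a mat')
print(f"{en['rougeL'].fmeasure:0.2f}")

# Chinese ROUGE: rouge_chinese expects whitespace-separated tokens,
# so each side is segmented with jieba first.
zh_scorer = Rouge()
hyp = ' '.join(jieba.cut('今天天气很好'))
ref = ' '.join(jieba.cut('今天天气不错'))
zh = zh_scorer.get_scores(hyp, ref)
print(f"{zh[0]['rouge-l']['f']:0.2f}")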
evalscope/models/api/openai_api.py
CHANGED

@@ -1,34 +1,36 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

 import json
+import requests
 import threading
 import time
 from asyncio import Queue
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+from typing import Dict, List, Optional, Union

-import requests
-from typing import Union, List, Optional, Dict
-from concurrent.futures import ThreadPoolExecutor
-from modelscope.utils.logger import get_logger
+from evalscope.utils.logger import get_logger

 logger = get_logger()


 class OpenaiApi:

-    def __init__(self,
-                 model: str,
-                 openai_api_key,
-                 openai_api_base,
-                 logprobs: Optional[bool] = False,
-                 top_logprobs: Optional[int] = None,
-                 max_new_tokens: int = 4096,
-                 temperature: Optional[float] = 0.0,
-                 repetition_penalty: Optional[float] = 1.0,
-                 is_chat: bool = True,
-                 verbose: bool = True,
-                 retry: int = 3,
-                 query_per_second: int = 10,  # TODO
-                 **kwargs):
+    def __init__(
+            self,
+            model: str,
+            openai_api_key,
+            openai_api_base,
+            logprobs: Optional[bool] = False,
+            top_logprobs: Optional[int] = None,
+            max_new_tokens: int = 4096,
+            temperature: Optional[float] = 0.0,
+            repetition_penalty: Optional[float] = 1.0,
+            is_chat: bool = True,
+            verbose: bool = True,
+            retry: int = 3,
+            query_per_second: int = 10,  # TODO
+            **kwargs):

         self.temperature = temperature
         self.repetition_penalty = repetition_penalty

@@ -45,14 +47,17 @@ class OpenaiApi:

         self.token_bucket = TokenBucket(query_per_second, verbose)

-    def generate_simple(self, inputs: Union[List[str]]):
+    def generate_simple(self, inputs: Union[List[str]], num_proc: int = 8):

         def process_one(in_data: str):

             if self.is_chat:
                 data = dict(
                     model=self.model,
-                    messages=[{'role': 'user', 'content': in_data}],
+                    messages=[{
+                        'role': 'user',
+                        'content': in_data
+                    }],
                     max_tokens=self.max_tokens,
                     n=1,
                     logprobs=self.logprobs,

@@ -72,7 +77,10 @@ class OpenaiApi:

             # todo
             openai_api_key = self.openai_api_key or ''
-            header = {'Authorization': f'Bearer {openai_api_key}', 'content-type': 'application/json'}
+            header = {
+                'Authorization': f'Bearer {openai_api_key}',
+                'content-type': 'application/json',
+            }
             data = json.dumps(data, ensure_ascii=False)

             if self.verbose:

@@ -91,14 +99,18 @@
             else:
                 return resp['choices'][0]['text'].strip()

-        with ThreadPoolExecutor() as executor:
-            results = list(executor.map(process_one, inputs))
+        results = []
+        with ThreadPoolExecutor(max_workers=num_proc) as executor:
+            # Submit all tasks
+            future_to_task = {executor.submit(process_one, input_one): input_one for input_one in inputs}
+
+            # Show progress bar
+            for future in tqdm(as_completed(future_to_task), total=len(inputs)):
+                results.append(future.result())

         return results

-    def generate(self,
-                 inputs: Union[List[str], List[List]],
-                 **kwargs) -> List[str]:
+    def generate(self, inputs: Union[List[str], List[List]], **kwargs) -> List[str]:
         """
         Generate responses from OpenAI API.


@@ -160,13 +172,12 @@ class OpenaiApi:

         def remove_none_val(input_d: dict):
             return {k: v for k, v in input_d.items() if v is not None}
+
         data = remove_none_val(data)

         if self.verbose:
             logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
-        raw_response = requests.post(self.url,
-                                     headers=header,
-                                     data=json.dumps(data, ensure_ascii=False))
+        raw_response = requests.post(self.url, headers=header, data=json.dumps(data, ensure_ascii=False))

         response = raw_response.json()
         if self.verbose:
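Note on the reworked `generate_simple`: replacing a blocking map with `submit` plus `as_completed` lets a tqdm bar advance as each request returns, but `as_completed` yields futures in completion order, not submission order. A small sketch of the same pattern that additionally preserves input order by keying futures to their index; `process_one` here is a hypothetical stand-in for the per-request function:

from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def process_one(item: str) -> str:
    # Hypothetical stand-in for the per-request API call.
    return item.upper()


inputs = ['a', 'b', 'c']
results = [None] * len(inputs)

with ThreadPoolExecutor(max_workers=8) as executor:
    # Key each future by its input index so results can be realigned.
    future_to_index = {executor.submit(process_one, x): i for i, x in enumerate(inputs)}
    for future in tqdm(as_completed(future_to_index), total=len(inputs)):
        results[future_to_index[future]] = future.result()

print(results)  # ['A', 'B', 'C'], aligned with the input order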
evalscope/models/custom/custom_model.py
CHANGED

@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from abc import ABC, abstractmethod
-from typing import Any, Union, Dict, List
 import torch
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Union


 class CustomModel(ABC):

@@ -11,7 +11,7 @@ class CustomModel(ABC):
         self.kwargs = kwargs

         if config.get('model_id', None) is None:
-            raise ValueError(f
+            raise ValueError(f'**Error: model_id is required in config for CustomModel. Got config: {config}')

     @abstractmethod
     @torch.no_grad()
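Note on `CustomModel`: it is the abstract hook for plugging user models into evalscope; the constructor requires `model_id` in `config`, and subclasses implement the abstract, `torch.no_grad`-wrapped method. A hedged sketch of a subclass: the abstract method's real name and return shape are not visible in this hunk, so `predict` and the echoed payload below are assumptions:

from typing import Any, Dict, List

from evalscope.models.custom import CustomModel  # import path assumed


class EchoModel(CustomModel):

    def __init__(self, **kwargs):
        # model_id is required; __init__ raises ValueError without it
        # (constructor assumed to accept a config dict).
        super().__init__(config={'model_id': 'echo-model'}, **kwargs)

    def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
        # 'predict' is an assumed name for the abstract method; it simply
        # echoes each prompt back in a chat.completion-like shape.
        return [{'choices': [{'message': {'content': p, 'role': 'assistant'}}]} for p in prompts]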
evalscope/models/dummy_chat_model.py
CHANGED

@@ -2,6 +2,7 @@

 import random
 import time
+
 from evalscope.models import ChatBaseModel
 from evalscope.utils.logger import get_logger


@@ -32,15 +33,13 @@ class DummyChatModel(ChatBaseModel):

         # Build response
         res = {
-            'choices': [
-                {
-                    'index': 0,
-                    'message': {
-                        'content': choice,
-                        'role': 'assistant'
-                    }
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': choice,
+                    'role': 'assistant'
                 }
-            ],
+            }],
             'created': time.time(),
             'model': self.MODEL_ID + '-' + self.REVISION,
             'object': 'chat.completion',