evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic by the registry.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/report/combinator.py
ADDED
@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import glob
+import os
+import pandas as pd
+from tabulate import tabulate
+from typing import List, Tuple
+
+from evalscope.report.utils import Report
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+"""
+Combine and generate table for reports of LLMs.
+"""
+
+
+def get_report_list(reports_path_list: List[str]) -> List[Report]:
+    report_list: List[Report] = []
+    # Iterate over each report path
+    for report_path in reports_path_list:
+        model_report_dir = os.path.normpath(report_path)
+        report_files = glob.glob(os.path.join(model_report_dir, '**', '*.json'), recursive=True)
+        # Iterate over each report file
+        for file_path in report_files:
+            try:
+                report = Report.from_json(file_path)
+                report_list.append(report)
+            except Exception as e:
+                logger.error(f'Error loading report from {file_path}: {e}')
+    report_list = sorted(report_list, key=lambda x: (x.model_name, x.dataset_name))
+    return report_list
+
+
+def get_data_frame(report_list: List[Report],
+                   flatten_metrics: bool = True,
+                   flatten_categories: bool = True) -> pd.DataFrame:
+    tables = []
+    for report in report_list:
+        df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+        tables.append(df)
+    return pd.concat(tables, ignore_index=True)
+
+
+def gen_table(reports_path_list: list) -> str:
+    report_list = get_report_list(reports_path_list)
+    table = get_data_frame(report_list)
+    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+class ReportsRecorder:
+    COMMON_DATASET_PATH = []
+    CUSTOM_DATASET_PATH = []
+
+    def __init__(self, oss_url: str = '', endpoint: str = ''):
+        pass
+
+
+if __name__ == '__main__':
+    report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
+    # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+
+    report_table = gen_table([report_dir_1])
+    print(report_table)
+
+    # ALL VALUES ONLY FOR EXAMPLE
+    # +--------------------------+-------------------+-------------+
+    # | Model                    | CompetitionMath   | GSM8K       |
+    # +==========================+===================+=============+
+    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
+    # +--------------------------+-------------------+-------------+
+    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
+    # +--------------------------+-------------------+-------------+
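A minimal usage sketch of the new combinator (the output directory is a placeholder; gen_table is re-exported from evalscope.report, as the summarizer change further below shows). It appears to supersede the removed evalscope/tools/combine_reports.py helper:

from evalscope.report import gen_table

# Placeholder: one or more work_dir folders that contain per-benchmark report JSON files.
report_dirs = ['./outputs/20250117_151926']
print(gen_table(report_dirs))  # grid table with one row per (model, dataset, metric, subset)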
evalscope/report/generator.py
ADDED
@@ -0,0 +1,80 @@
+import pandas as pd
+from pandas import DataFrame
+
+from evalscope.constants import DataCollection
+from evalscope.report.utils import *
+
+
+class ReportGenerator:
+
+    @staticmethod
+    def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+        """
+        Generate report for specific dataset.
+        subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
+        category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
+        metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+        """  # noqa: E501
+
+        dataset_name = kwargs.get('dataset_name', None)
+        model_name = kwargs.get('model_name', None)
+        category_map = kwargs.get('category_map', {})
+
+        def flatten_subset() -> DataFrame:
+            """
+            Flatten subset score map to a DataFrame.
+
+            Example:
+                        name  score  num categories      metric_name
+            0       ARC-Easy    0.5    2  [default]  AverageAccuracy
+            1  ARC-Challenge    0.5    2  [default]  AverageAccuracy
+            """
+            subsets = []
+            for subset_name, scores in subset_score_map.items():
+                for score_item in scores:
+                    categories = category_map.get(subset_name, ['default'])
+                    if isinstance(categories, str):
+                        categories = [categories]
+                    subsets.append(
+                        dict(
+                            name=subset_name,
+                            score=score_item['score'],
+                            num=score_item['num'],
+                            metric_name=score_item['metric_name'],
+                            categories=tuple(categories)))
+            df = pd.DataFrame(subsets)
+            return df
+
+        df = flatten_subset()
+
+        metrics_list = []
+        for metric_name, group_metric in df.groupby('metric_name'):
+            categories = []
+            for category_name, group_category in group_metric.groupby('categories'):
+                subsets = []
+                for _, row in group_category.iterrows():
+                    subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+                categories.append(Category(name=category_name, subsets=subsets))
+
+            metrics_list.append(Metric(name=metric_name, categories=categories))
+
+        report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+        return report
+
+    @staticmethod
+    def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+        categories = []
+        for category_name, group_category in df.groupby('categories'):
+            subsets = []
+            for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
+                avg_score = group_subset['score'].mean()
+                num = group_subset['score'].count()
+                subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+
+            categories.append(Category(name=category_name, subsets=subsets))
+        return Report(
+            name=DataCollection.NAME,
+            metrics=[Metric(name='Average', categories=categories)],
+            dataset_name=all_dataset_name,
+            model_name=model_name)
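A sketch of driving ReportGenerator.gen_report directly, following the docstring example above (subset names, scores, and the model name are invented for illustration):

from evalscope.report.generator import ReportGenerator

subset_score_map = {
    'ARC-Easy': [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}],
    'ARC-Challenge': [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}],
}
report = ReportGenerator.gen_report(
    subset_score_map, report_name='arc', dataset_name='arc', model_name='dummy-model')
print(report.to_dataframe())  # one row per subset, grouped by metric and category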
evalscope/report/utils.py
ADDED
@@ -0,0 +1,133 @@
+import json
+import pandas as pd
+from collections import defaultdict
+from dataclasses import asdict, dataclass, field
+from typing import Any, Dict, List
+
+from evalscope.metrics import macro_mean, micro_mean
+from evalscope.utils import normalize_score
+
+
+@dataclass
+class Subset:
+    name: str = 'default_subset'
+    score: float = 0.0
+    num: int = 0
+
+    def __post_init__(self):
+        self.score = normalize_score(self.score)
+
+
+@dataclass
+class Category:
+    name: tuple[str] = field(default_factory=tuple)
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    subsets: List[Subset] = field(default_factory=list)
+
+    def __post_init__(self):
+        if isinstance(self.name, str):
+            # ensure name is tuple format
+            self.name = (self.name, )
+        self.num = sum(subset.num for subset in self.subsets)
+        self.score = normalize_score(micro_mean(self.subsets))
+        self.macro_score = normalize_score(macro_mean(self.subsets))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+        return cls(name=data['name'], subsets=subsets)
+
+
+@dataclass
+class Metric:
+    name: str = 'default_metric'
+    num: int = 0
+    score: float = 0.0
+    macro_score: float = 0.0
+    categories: List[Category] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.num = sum(category.num for category in self.categories)
+        self.score = normalize_score(micro_mean(self.categories))
+        self.macro_score = normalize_score(macro_mean(self.categories))
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        categories = [Category.from_dict(category) for category in data.get('categories', [])]
+        return cls(name=data['name'], categories=categories)
+
+
+class ReportKey:
+    model_name = 'Model'
+    dataset_name = 'Dataset'
+    metric_name = 'Metric'
+    category_name = 'Category'
+    category_prefix = 'Cat.'
+    subset_name = 'Subset'
+    num = 'Num'
+    score = 'Score'
+
+
+@dataclass
+class Report:
+    name: str = 'default_report'
+    dataset_name: str = 'default_dataset'
+    model_name: str = 'default_model'
+    score: float = 0.0
+    metrics: List[Metric] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+        return cls(
+            name=data['name'],
+            score=data['score'],
+            metrics=metrics,
+            dataset_name=data['dataset_name'],
+            model_name=data['model_name'])
+
+    @classmethod
+    def from_json(cls, json_file: str):
+        with open(json_file, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+        table = defaultdict(list)
+        for metric in self.metrics:
+            for category in metric.categories:
+                for subset in category.subsets:
+                    table[ReportKey.model_name].append(self.model_name)
+                    table[ReportKey.dataset_name].append(self.dataset_name)
+                    table[ReportKey.metric_name].append(metric.name)
+                    table[ReportKey.category_name].append(category.name)
+                    table[ReportKey.subset_name].append(subset.name)
+                    table[ReportKey.num].append(subset.num)
+                    table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+            # NOTE: only flatten metrics if needed, use the first metric by default
+            if not flatten_metrics:
+                break
+        df = pd.DataFrame.from_dict(table, orient='columns')
+        if flatten_categories:
+            df = self._flatten_categories(df)
+        return df
+
+    def _flatten_categories(self, df: pd.DataFrame):
+        # expand categories to multiple rows
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories[ReportKey.category_name].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
+                lambda x: x[level] if len(x) > level else None)
+
+        df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+        return df_categories
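These dataclasses are the schema the new report JSON files use. A small sketch that builds a Report by hand and flattens it (all names and numbers are illustrative):

from evalscope.report.utils import Category, Metric, Report, Subset

subsets = [Subset(name='high_school_physics', score=0.42, num=120)]
category = Category(name='STEM', subsets=subsets)   # num/score are derived in __post_init__
metric = Metric(name='AverageAccuracy', categories=[category])
report = Report(name='mmlu-report', dataset_name='mmlu', model_name='dummy-model', metrics=[metric])

print(report.score)           # taken from the first metric
print(report.to_dataframe())  # one flattened row per subset, with Cat.<level> columns for the category hierarchy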
evalscope/run.py
CHANGED
@@ -2,34 +2,21 @@
 """
 Run evaluation for LLMs.
 """
-import logging
 import os.path
-import torch
 from argparse import Namespace
 from datetime import datetime
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.utils import import_module_util, seed_everything
-from evalscope.utils.logger import get_logger
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import DataCollection, EvalBackend
+from evalscope.utils import seed_everything
+from evalscope.utils.io_utils import OutputsStructure
+from evalscope.utils.logger import configure_logging, get_logger
 
-
-
-BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
+if TYPE_CHECKING:
+    from evalscope.models import LocalModel
 
-
-def configure_logging(debug: bool, outputs: Optional[OutputsStructure]):
-    """Configure logging level based on the debug flag."""
-    if outputs:
-        log_file = os.path.join(outputs.logs_dir, 'eval_log.log')
-        get_logger(log_file=log_file, force=True)
-    if debug:
-        get_logger(log_level=logging.DEBUG, force=True)
+logger = get_logger()
 
 
 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
@@ -46,37 +33,15 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]
 
 def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
-
+    if task_cfg.seed is not None:
+        seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
-    configure_logging(task_cfg.debug, outputs)
+    configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
-
-
-    return evaluate_model(task_cfg, outputs)
-
-
-def parse_task_config(task_cfg) -> TaskConfig:
-    """Parse task configuration from various formats into a TaskConfig object."""
-    if isinstance(task_cfg, TaskConfig):
-        logger.info('Args: Task config is provided with TaskConfig type.')
-    elif isinstance(task_cfg, dict):
-        logger.info('Args: Task config is provided with dictionary type.')
-        task_cfg = TaskConfig.from_dict(task_cfg)
-    elif isinstance(task_cfg, Namespace):
-        logger.info('Args: Task config is provided with CommandLine type.')
-        task_cfg = TaskConfig.from_args(task_cfg)
-    elif isinstance(task_cfg, str):
-        extension = task_cfg.split('.')[-1]
-        logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
-            task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
-            task_cfg = TaskConfig.from_json(task_cfg)
-        else:
-            raise ValueError('Args: Unsupported file extension.')
+    if task_cfg.eval_backend != EvalBackend.NATIVE:
+        return run_non_native_backend(task_cfg, outputs)
     else:
-
-        return task_cfg
+        return evaluate_model(task_cfg, outputs)
 
 
 def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -84,14 +49,19 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     if task_cfg.use_cache:
         task_cfg.work_dir = task_cfg.use_cache
         logger.info(f'Set resume from {task_cfg.work_dir}')
-    elif task_cfg.work_dir
-
+    # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+    task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
+
+    if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
+        task_cfg.eval_config['time_str'] = run_time
+    elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
+        task_cfg.eval_config['work_dir'] = task_cfg.work_dir
     return outputs
 
 
-def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Run evaluation using a non-native backend."""
     eval_backend = task_cfg.eval_backend
     eval_config = task_cfg.eval_config
@@ -101,6 +71,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:
 
     backend_manager_class = get_backend_manager_class(eval_backend)
     backend_manager = backend_manager_class(config=eval_config)
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
     backend_manager.run()
 
     return dict()
@@ -123,92 +97,57 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 
 def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
+    from evalscope.models import get_local_model
+
     # Initialize evaluator
     eval_results = {}
-    task_cfg
+    base_model = get_local_model(task_cfg)
+    evaluators = []
+    for dataset_name in task_cfg.datasets:
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+        evaluators.append(evaluator)
 
-
-
+    # dump task_cfg to outputs.configs_dir after creating evaluators
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
 
-    for
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+    for evaluator in evaluators:
        res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
        eval_results[dataset_name] = res_dict
 
    return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
     """Create an evaluator object for the specified dataset."""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
-
-    logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
-
-    return Evaluator(
-        dataset_name_or_path=dataset_name_or_path,
-        subset_list=in_subset_list,
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
-        outputs=outputs,
-        datasets_dir=task_cfg.dataset_dir,
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
-    )
-
-
-def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models.model_adapter import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-        model_precision = task_cfg.model_args.get('precision', torch.float16)
-        if isinstance(model_precision, str) and model_precision != 'auto':
-            model_precision = eval(model_precision)
-        return imported_modules['ModelAdapterClass'](
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            device_map=device_map,
-            torch_dtype=model_precision,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template)
+    from evalscope.benchmarks import Benchmark, BenchmarkMeta
+    from evalscope.evaluator import Evaluator
+    from evalscope.models import initialize_model_adapter
+
+    if dataset_name == DataCollection.NAME:
+        # EvaluatorCollection is a collection of evaluators
+        from evalscope.collections import EvaluatorCollection
+        return EvaluatorCollection(task_cfg, outputs)
+
+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+    # update task_cfg.dataset_args
+    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
+
+    return Evaluator(
+        dataset_name_or_path=benchmark.dataset_id,
+        data_adapter=data_adapter,
+        model_adapter=model_adapter,
+        outputs=outputs,
+        task_cfg=task_cfg,
+    )
 
 
 def main():
+    from evalscope.arguments import parse_args
     args = parse_args()
     run_task(args)
 
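With the refactor above, the native entry point is driven entirely by TaskConfig plus the benchmark registry. A sketch of a programmatic call (model id, dataset, and limit are placeholder values; TaskConfig is assumed to accept these fields as dataclass keywords, as the hunks above imply):

from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # any benchmark registered in evalscope.benchmarks
    limit=10,                            # evaluate only the first 10 samples
)
results = run_task(task_cfg)             # {dataset_name: report_dict}, as returned by evaluate_model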
evalscope/run_arena.py
CHANGED
@@ -10,8 +10,9 @@ from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models
-from evalscope.utils import
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils import get_obj_from_cfg
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/summarizer.py
CHANGED
@@ -4,10 +4,11 @@ import json
 import os
 from typing import List, Union
 
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalBackend
-from evalscope.
-from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import EvalBackend
+from evalscope.report import gen_table
+from evalscope.utils import csv_to_list, get_latest_folder_path
+from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -24,7 +25,7 @@ class Summarizer:
         if reports_dir is None:
             raise ValueError(f'No reports directory in {outputs_dir}')
 
-        report_files: list = glob.glob(os.path.join(reports_dir, '
+        report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
         for report_file in report_files:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))
@@ -47,33 +48,20 @@ class Summarizer:
         A report dict is overall report on a benchmark for specific model.
         """
         final_res_list: List[dict] = []
-        candidate_task_cfgs: List[
-
-        if isinstance(task_cfg,
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, str):
-            task_cfg: dict = yaml_to_dict(task_cfg)
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, TaskConfig):
-            task_cfg: dict = task_cfg.to_dict()
-            candidate_task_cfgs = [task_cfg]
-        elif isinstance(task_cfg, list):
+        candidate_task_cfgs: List[TaskConfig] = []
+
+        if isinstance(task_cfg, list):
             for task_cfg_item in task_cfg:
-
-                    task_cfg_item: dict = yaml_to_dict(task_cfg_item)
-                elif isinstance(task_cfg_item, TaskConfig):
-                    task_cfg_item: dict = task_cfg_item.to_dict()
-                candidate_task_cfgs.append(task_cfg_item)
+                candidate_task_cfgs.append(parse_task_config(task_cfg_item))
         else:
-
+            candidate_task_cfgs.append(parse_task_config(task_cfg))
 
         for candidate_task in candidate_task_cfgs:
             logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-            eval_backend = candidate_task.
+            eval_backend = candidate_task.eval_backend
 
             if eval_backend == EvalBackend.NATIVE:
-                outputs_dir: str = candidate_task.
-                outputs_dir: str = os.path.expanduser(outputs_dir)
+                outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                 if outputs_dir is None:
                     raise ValueError(f'No outputs_dir in {task_cfg}')
                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
@@ -128,8 +116,8 @@ class Summarizer:
         return final_res_list
 
     @staticmethod
-    def parse_eval_config(candidate_task):
-        eval_config: Union[str, dict] = candidate_task.
+    def parse_eval_config(candidate_task: TaskConfig):
+        eval_config: Union[str, dict] = candidate_task.eval_config
         assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'
 
         if isinstance(eval_config, str):
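Reading reports back goes through the code touched here: Summarizer.get_report globs the reports directory for JSON report files. A sketch with a placeholder output path:

from evalscope.summarizer import Summarizer

res_list = Summarizer.get_report(outputs_dir='./outputs/20250117_151926')  # placeholder work_dir
print(res_list)  # list of report dicts, one per benchmark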
evalscope/third_party/longbench_write/eval.py
CHANGED
@@ -10,7 +10,8 @@ import requests
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import jsonl_to_list
 
 logger = get_logger()
 
evalscope/third_party/longbench_write/longbench_write.py
CHANGED
@@ -4,7 +4,8 @@ from typing import Union
 
 from evalscope.third_party.longbench_write.eval import run_eval
 from evalscope.third_party.longbench_write.infer import run_infer
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
evalscope/third_party/longbench_write/tools/data_etl.py
CHANGED
@@ -6,7 +6,7 @@ from typing import List
 
 from evalscope.third_party.longbench_write.eval import EvalLength
 from evalscope.third_party.longbench_write.utils import chinese_to_arabic, count_words
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/third_party/toolbench_static/toolbench_static.py
CHANGED
@@ -5,7 +5,8 @@ from typing import Union
 
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
+from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 