evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/report/combinator.py ADDED
@@ -0,0 +1,73 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import glob
+ import os
+ import pandas as pd
+ from tabulate import tabulate
+ from typing import List, Tuple
+
+ from evalscope.report.utils import Report
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+ """
+ Combine and generate table for reports of LLMs.
+ """
+
+
+ def get_report_list(reports_path_list: List[str]) -> List[Report]:
+     report_list: List[Report] = []
+     # Iterate over each report path
+     for report_path in reports_path_list:
+         model_report_dir = os.path.normpath(report_path)
+         report_files = glob.glob(os.path.join(model_report_dir, '**', '*.json'), recursive=True)
+         # Iterate over each report file
+         for file_path in report_files:
+             try:
+                 report = Report.from_json(file_path)
+                 report_list.append(report)
+             except Exception as e:
+                 logger.error(f'Error loading report from {file_path}: {e}')
+     report_list = sorted(report_list, key=lambda x: (x.model_name, x.dataset_name))
+     return report_list
+
+
+ def get_data_frame(report_list: List[Report],
+                    flatten_metrics: bool = True,
+                    flatten_categories: bool = True) -> pd.DataFrame:
+     tables = []
+     for report in report_list:
+         df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+         tables.append(df)
+     return pd.concat(tables, ignore_index=True)
+
+
+ def gen_table(reports_path_list: list) -> str:
+     report_list = get_report_list(reports_path_list)
+     table = get_data_frame(report_list)
+     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+ class ReportsRecorder:
+     COMMON_DATASET_PATH = []
+     CUSTOM_DATASET_PATH = []
+
+     def __init__(self, oss_url: str = '', endpoint: str = ''):
+         pass
+
+
+ if __name__ == '__main__':
+     report_dir_1 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250117_151926'
+     # report_dir_2 = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250107_204445/reports'
+
+     report_table = gen_table([report_dir_1])
+     print(report_table)
+
+     # ALL VALUES ONLY FOR EXAMPLE
+     # +--------------------------+-------------------+-------------+
+     # | Model                    | CompetitionMath   | GSM8K       |
+     # +==========================+===================+=============+
+     # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
+     # +--------------------------+-------------------+-------------+
+     # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
+     # +--------------------------+-------------------+-------------+
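
The combinator above builds a cross-model table from report JSON files on disk. A minimal usage sketch follows; the outputs directory and report file paths are hypothetical, and it assumes gen_table is re-exported from evalscope.report (the updated summarizer further down imports it that way), while Report comes from evalscope.report.utils exactly as the combinator itself imports it.

from evalscope.report import gen_table           # import path used by the updated summarizer below
from evalscope.report.utils import Report        # same import the combinator uses above

# Print a grid table aggregated over every report JSON found under the run directory.
print(gen_table(['outputs/20250117_151926']))    # hypothetical run directory

# Load a single report file into the Report dataclass (path is hypothetical).
report = Report.from_json('outputs/20250117_151926/reports/my-model/gsm8k.json')
print(report.model_name, report.dataset_name, report.score)
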
evalscope/report/generator.py ADDED
@@ -0,0 +1,80 @@
+ import pandas as pd
+ from pandas import DataFrame
+
+ from evalscope.constants import DataCollection
+ from evalscope.report.utils import *
+
+
+ class ReportGenerator:
+
+     @staticmethod
+     def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+         """
+         Generate report for specific dataset.
+         subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
+         category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
+         metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+         """  # noqa: E501
+
+         dataset_name = kwargs.get('dataset_name', None)
+         model_name = kwargs.get('model_name', None)
+         category_map = kwargs.get('category_map', {})
+
+         def flatten_subset() -> DataFrame:
+             """
+             Flatten subset score map to a DataFrame.
+
+             Example:
+                           name  score  num  categories      metric_name
+                 0       ARC-Easy   0.5    2   [default]  AverageAccuracy
+                 1  ARC-Challenge   0.5    2   [default]  AverageAccuracy
+             """
+             subsets = []
+             for subset_name, scores in subset_score_map.items():
+                 for score_item in scores:
+                     categories = category_map.get(subset_name, ['default'])
+                     if isinstance(categories, str):
+                         categories = [categories]
+                     subsets.append(
+                         dict(
+                             name=subset_name,
+                             score=score_item['score'],
+                             num=score_item['num'],
+                             metric_name=score_item['metric_name'],
+                             categories=tuple(categories)))
+             df = pd.DataFrame(subsets)
+             return df
+
+         df = flatten_subset()
+
+         metrics_list = []
+         for metric_name, group_metric in df.groupby('metric_name'):
+             categories = []
+             for category_name, group_category in group_metric.groupby('categories'):
+                 subsets = []
+                 for _, row in group_category.iterrows():
+                     subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+                 categories.append(Category(name=category_name, subsets=subsets))
+
+             metrics_list.append(Metric(name=metric_name, categories=categories))
+
+         report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+         return report
+
+     @staticmethod
+     def gen_collection_report(df: DataFrame, all_dataset_name: str, model_name: str) -> Report:
+         categories = []
+         for category_name, group_category in df.groupby('categories'):
+             subsets = []
+             for (dataset_name, subset_name), group_subset in group_category.groupby(['dataset_name', 'subset_name']):
+                 avg_score = group_subset['score'].mean()
+                 num = group_subset['score'].count()
+                 subsets.append(Subset(name=f'{dataset_name}/{subset_name}', score=float(avg_score), num=int(num)))
+
+             categories.append(Category(name=category_name, subsets=subsets))
+         return Report(
+             name=DataCollection.NAME,
+             metrics=[Metric(name='Average', categories=categories)],
+             dataset_name=all_dataset_name,
+             model_name=model_name)
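
A hedged sketch of how gen_report is fed, following the docstring's subset_score_map shape above. It assumes the module lands at evalscope.report.generator (the only +80-line file added to the report package); the model name, report name, and score values are illustrative only.

from evalscope.report.generator import ReportGenerator

# subset_score_map in the shape documented by gen_report (illustrative numbers).
subset_score_map = {
    'ARC-Easy': [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}],
    'ARC-Challenge': [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}],
}

report = ReportGenerator.gen_report(
    subset_score_map,
    report_name='arc_report',                 # illustrative report name
    dataset_name='arc',
    model_name='my-model',                    # hypothetical model name
    category_map={'ARC-Easy': ['default'], 'ARC-Challenge': ['default']},
)
print(report.score, report.metrics[0].name)   # aggregated score, 'AverageAccuracy'
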
evalscope/report/utils.py ADDED
@@ -0,0 +1,133 @@
+ import json
+ import pandas as pd
+ from collections import defaultdict
+ from dataclasses import asdict, dataclass, field
+ from typing import Any, Dict, List
+
+ from evalscope.metrics import macro_mean, micro_mean
+ from evalscope.utils import normalize_score
+
+
+ @dataclass
+ class Subset:
+     name: str = 'default_subset'
+     score: float = 0.0
+     num: int = 0
+
+     def __post_init__(self):
+         self.score = normalize_score(self.score)
+
+
+ @dataclass
+ class Category:
+     name: tuple[str] = field(default_factory=tuple)
+     num: int = 0
+     score: float = 0.0
+     macro_score: float = 0.0
+     subsets: List[Subset] = field(default_factory=list)
+
+     def __post_init__(self):
+         if isinstance(self.name, str):
+             # ensure name is tuple format
+             self.name = (self.name, )
+         self.num = sum(subset.num for subset in self.subsets)
+         self.score = normalize_score(micro_mean(self.subsets))
+         self.macro_score = normalize_score(macro_mean(self.subsets))
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         subsets = [Subset(**subset) for subset in data.get('subsets', [])]
+         return cls(name=data['name'], subsets=subsets)
+
+
+ @dataclass
+ class Metric:
+     name: str = 'default_metric'
+     num: int = 0
+     score: float = 0.0
+     macro_score: float = 0.0
+     categories: List[Category] = field(default_factory=list)
+
+     def __post_init__(self):
+         self.num = sum(category.num for category in self.categories)
+         self.score = normalize_score(micro_mean(self.categories))
+         self.macro_score = normalize_score(macro_mean(self.categories))
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         categories = [Category.from_dict(category) for category in data.get('categories', [])]
+         return cls(name=data['name'], categories=categories)
+
+
+ class ReportKey:
+     model_name = 'Model'
+     dataset_name = 'Dataset'
+     metric_name = 'Metric'
+     category_name = 'Category'
+     category_prefix = 'Cat.'
+     subset_name = 'Subset'
+     num = 'Num'
+     score = 'Score'
+
+
+ @dataclass
+ class Report:
+     name: str = 'default_report'
+     dataset_name: str = 'default_dataset'
+     model_name: str = 'default_model'
+     score: float = 0.0
+     metrics: List[Metric] = field(default_factory=list)
+
+     def __post_init__(self):
+         self.score = self.metrics[0].score  # NOTE: only use the first metric by default
+
+     def to_dict(self) -> Dict[str, Any]:
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, data: dict):
+         metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
+         return cls(
+             name=data['name'],
+             score=data['score'],
+             metrics=metrics,
+             dataset_name=data['dataset_name'],
+             model_name=data['model_name'])
+
+     @classmethod
+     def from_json(cls, json_file: str):
+         with open(json_file, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+         table = defaultdict(list)
+         for metric in self.metrics:
+             for category in metric.categories:
+                 for subset in category.subsets:
+                     table[ReportKey.model_name].append(self.model_name)
+                     table[ReportKey.dataset_name].append(self.dataset_name)
+                     table[ReportKey.metric_name].append(metric.name)
+                     table[ReportKey.category_name].append(category.name)
+                     table[ReportKey.subset_name].append(subset.name)
+                     table[ReportKey.num].append(subset.num)
+                     table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+             # NOTE: only flatten metrics if needed, use the first metric by default
+             if not flatten_metrics:
+                 break
+         df = pd.DataFrame.from_dict(table, orient='columns')
+         if flatten_categories:
+             df = self._flatten_categories(df)
+         return df
+
+     def _flatten_categories(self, df: pd.DataFrame):
+         # expand categories to multiple rows
+         df_categories = df.copy()
+         # multi-level aggregation for categories
+         max_depth = df_categories[ReportKey.category_name].apply(len).max()
+         for level in range(max_depth):
+             df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
+                 lambda x: x[level] if len(x) > level else None)
+
+         df_categories.drop(columns=[ReportKey.category_name], inplace=True)
+         return df_categories
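
The hierarchy above nests Subset into Category into Metric into Report, with micro/macro aggregation happening in each __post_init__. A minimal sketch of building it by hand and flattening to a DataFrame; the subset names, scores, and model name are illustrative only.

from evalscope.report.utils import Category, Metric, Report, Subset

# Two subsets under one category; Category.__post_init__ computes num and micro/macro scores.
subsets = [Subset(name='high_school_math', score=0.42, num=100),
           Subset(name='college_math', score=0.36, num=50)]
category = Category(name=('math',), subsets=subsets)
metric = Metric(name='AverageAccuracy', categories=[category])
report = Report(name='mmlu_report', dataset_name='mmlu', model_name='my-model', metrics=[metric])

# One row per subset; category tuples are expanded into Cat.0, Cat.1, ... columns.
df = report.to_dataframe(flatten_categories=True)
print(df)
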
evalscope/run.py CHANGED
@@ -2,34 +2,21 @@
  """
  Run evaluation for LLMs.
  """
- import logging
  import os.path
- import torch
  from argparse import Namespace
  from datetime import datetime
- from typing import List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union
 
- from evalscope.arguments import parse_args
- from evalscope.config import TaskConfig
- from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType, OutputsStructure
- from evalscope.evaluator import Evaluator, HumanevalEvaluator
- from evalscope.models.custom import CustomModel
- from evalscope.utils import import_module_util, seed_everything
- from evalscope.utils.logger import get_logger
+ from evalscope.config import TaskConfig, parse_task_config
+ from evalscope.constants import DataCollection, EvalBackend
+ from evalscope.utils import seed_everything
+ from evalscope.utils.io_utils import OutputsStructure
+ from evalscope.utils.logger import configure_logging, get_logger
 
- logger = get_logger()
-
- BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
- MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
+ if TYPE_CHECKING:
+     from evalscope.models import LocalModel
 
-
- def configure_logging(debug: bool, outputs: Optional[OutputsStructure]):
-     """Configure logging level based on the debug flag."""
-     if outputs:
-         log_file = os.path.join(outputs.logs_dir, 'eval_log.log')
-         get_logger(log_file=log_file, force=True)
-     if debug:
-         get_logger(log_level=logging.DEBUG, force=True)
+ logger = get_logger()
 
 
  def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
@@ -46,37 +33,15 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]
 
  def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      """Run a single evaluation task."""
-     seed_everything(task_cfg.seed)
+     if task_cfg.seed is not None:
+         seed_everything(task_cfg.seed)
      outputs = setup_work_directory(task_cfg, run_time)
-     configure_logging(task_cfg.debug, outputs)
+     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
-     logger.info(task_cfg)
-
-     return evaluate_model(task_cfg, outputs)
-
-
- def parse_task_config(task_cfg) -> TaskConfig:
-     """Parse task configuration from various formats into a TaskConfig object."""
-     if isinstance(task_cfg, TaskConfig):
-         logger.info('Args: Task config is provided with TaskConfig type.')
-     elif isinstance(task_cfg, dict):
-         logger.info('Args: Task config is provided with dictionary type.')
-         task_cfg = TaskConfig.from_dict(task_cfg)
-     elif isinstance(task_cfg, Namespace):
-         logger.info('Args: Task config is provided with CommandLine type.')
-         task_cfg = TaskConfig.from_args(task_cfg)
-     elif isinstance(task_cfg, str):
-         extension = task_cfg.split('.')[-1]
-         logger.info(f'Args: Task config is provided with {extension} file type.')
-         if extension in ['yaml', 'yml']:
-             task_cfg = TaskConfig.from_yaml(task_cfg)
-         elif extension == 'json':
-             task_cfg = TaskConfig.from_json(task_cfg)
-         else:
-             raise ValueError('Args: Unsupported file extension.')
+     if task_cfg.eval_backend != EvalBackend.NATIVE:
+         return run_non_native_backend(task_cfg, outputs)
      else:
-         raise ValueError('Args: Please provide a valid task config.')
-     return task_cfg
+         return evaluate_model(task_cfg, outputs)
 
 
  def setup_work_directory(task_cfg: TaskConfig, run_time: str):
@@ -84,14 +49,19 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
      if task_cfg.use_cache:
          task_cfg.work_dir = task_cfg.use_cache
          logger.info(f'Set resume from {task_cfg.work_dir}')
-     elif task_cfg.work_dir == DEFAULT_WORK_DIR:
-         task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
+     # elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+     task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)
 
      outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
+
+     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
+         task_cfg.eval_config['time_str'] = run_time
+     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
+         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
      return outputs
 
 
- def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Run evaluation using a non-native backend."""
      eval_backend = task_cfg.eval_backend
      eval_config = task_cfg.eval_config
@@ -101,6 +71,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:
 
      backend_manager_class = get_backend_manager_class(eval_backend)
      backend_manager = backend_manager_class(config=eval_config)
+
+     task_cfg.dump_yaml(outputs.configs_dir)
+     logger.info(task_cfg)
+
      backend_manager.run()
 
      return dict()
@@ -123,92 +97,57 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 
  def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Evaluate the model based on the provided task configuration."""
+     from evalscope.models import get_local_model
+
      # Initialize evaluator
      eval_results = {}
-     task_cfg.dump_yaml(outputs.configs_dir)
+     base_model = get_local_model(task_cfg)
+     evaluators = []
+     for dataset_name in task_cfg.datasets:
+         evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+         evaluators.append(evaluator)
 
-     if task_cfg.eval_backend != EvalBackend.NATIVE:
-         return run_non_native_backend(task_cfg)
+     # dump task_cfg to outputs.configs_dir after creating evaluators
+     task_cfg.dump_yaml(outputs.configs_dir)
+     logger.info(task_cfg)
 
-     for dataset_name in task_cfg.datasets:
-         evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+     for evaluator in evaluators:
          res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
          eval_results[dataset_name] = res_dict
 
      return eval_results
 
 
- def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
      """Create an evaluator object for the specified dataset."""
-     imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-     model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
-
-     if dataset_name == 'humaneval':
-         problem_file = task_cfg.dataset_args.get('humaneval', {}).get('local_path')
-         return HumanevalEvaluator(
-             problem_file=problem_file,
-             model_id=task_cfg.model,
-             model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-             model_adapter=model_adapter,
-             outputs=outputs,
-             is_custom_outputs_dir=False,
-         )
-     else:
-         dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-         dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-         in_prompt_template = dataset_config.get('prompt_template', '')
-         few_shot_num = dataset_config.get('few_shot_num', None)
-         few_shot_random = dataset_config.get('few_shot_random', True)
-
-         data_adapter = imported_modules['DataAdapterClass'](
-             few_shot_num=few_shot_num,
-             few_shot_random=few_shot_random,
-             prompt_template=in_prompt_template,
-         )
-         in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
-
-         logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
-
-         return Evaluator(
-             dataset_name_or_path=dataset_name_or_path,
-             subset_list=in_subset_list,
-             data_adapter=data_adapter,
-             model_adapter=model_adapter,
-             use_cache=task_cfg.use_cache,
-             outputs=outputs,
-             datasets_dir=task_cfg.dataset_dir,
-             datasets_hub=task_cfg.dataset_hub,
-             stage=task_cfg.stage,
-             eval_type=task_cfg.eval_type,
-             overall_task_cfg=task_cfg,
-         )
-
-
- def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-     """Initialize the model adapter based on the task configuration."""
-     if task_cfg.dry_run:
-         from evalscope.models.dummy_chat_model import DummyChatModel
-         return DummyChatModel(model_cfg=dict())
-     elif task_cfg.eval_type == EvalType.CUSTOM:
-         if not isinstance(task_cfg.model, CustomModel):
-             raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-         from evalscope.models.model_adapter import CustomModelAdapter
-         return CustomModelAdapter(custom_model=task_cfg.model)
-     else:
-         device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-         model_precision = task_cfg.model_args.get('precision', torch.float16)
-         if isinstance(model_precision, str) and model_precision != 'auto':
-             model_precision = eval(model_precision)
-         return imported_modules['ModelAdapterClass'](
-             model_id=task_cfg.model,
-             model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-             device_map=device_map,
-             torch_dtype=model_precision,
-             generation_config=task_cfg.generation_config,
-             chat_template=task_cfg.chat_template)
+     from evalscope.benchmarks import Benchmark, BenchmarkMeta
+     from evalscope.evaluator import Evaluator
+     from evalscope.models import initialize_model_adapter
+
+     if dataset_name == DataCollection.NAME:
+         # EvaluatorCollection is a collection of evaluators
+         from evalscope.collections import EvaluatorCollection
+         return EvaluatorCollection(task_cfg, outputs)
+
+     benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+     data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+     model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+     # update task_cfg.dataset_args
+     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
+
+     return Evaluator(
+         dataset_name_or_path=benchmark.dataset_id,
+         data_adapter=data_adapter,
+         model_adapter=model_adapter,
+         outputs=outputs,
+         task_cfg=task_cfg,
+     )
 
 
  def main():
+     from evalscope.arguments import parse_args
      args = parse_args()
      run_task(args)
 
evalscope/run_arena.py CHANGED
@@ -10,8 +10,9 @@ from tqdm import tqdm
 
  from evalscope.constants import EvalConfigKeys
  from evalscope.evaluator.rating_eval import RatingEvaluate
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
- from evalscope.utils import dump_jsonl_data, get_obj_from_cfg, jsonl_to_list, yaml_to_dict
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils import get_obj_from_cfg
+ from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
evalscope/summarizer.py CHANGED
@@ -4,10 +4,11 @@ import json
  import os
  from typing import List, Union
 
- from evalscope.config import TaskConfig
- from evalscope.constants import EvalBackend, OutputsStructure
- from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import csv_to_list, get_latest_folder_path, json_to_dict, yaml_to_dict
+ from evalscope.config import TaskConfig, parse_task_config
+ from evalscope.constants import EvalBackend
+ from evalscope.report import gen_table
+ from evalscope.utils import csv_to_list, get_latest_folder_path
+ from evalscope.utils.io_utils import OutputsStructure, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
@@ -24,7 +25,7 @@ class Summarizer:
          if reports_dir is None:
              raise ValueError(f'No reports directory in {outputs_dir}')
 
-         report_files: list = glob.glob(os.path.join(reports_dir, '*.json'))
+         report_files: list = glob.glob(os.path.join(reports_dir, '**/*.json'))
          for report_file in report_files:
              with open(report_file, 'r') as f:
                  res_list.append(json.load(f))
@@ -47,33 +48,20 @@
          A report dict is overall report on a benchmark for specific model.
          """
          final_res_list: List[dict] = []
-         candidate_task_cfgs: List[dict] = []
-
-         if isinstance(task_cfg, dict):
-             candidate_task_cfgs = [task_cfg]
-         elif isinstance(task_cfg, str):
-             task_cfg: dict = yaml_to_dict(task_cfg)
-             candidate_task_cfgs = [task_cfg]
-         elif isinstance(task_cfg, TaskConfig):
-             task_cfg: dict = task_cfg.to_dict()
-             candidate_task_cfgs = [task_cfg]
-         elif isinstance(task_cfg, list):
+         candidate_task_cfgs: List[TaskConfig] = []
+
+         if isinstance(task_cfg, list):
              for task_cfg_item in task_cfg:
-                 if isinstance(task_cfg_item, str):
-                     task_cfg_item: dict = yaml_to_dict(task_cfg_item)
-                 elif isinstance(task_cfg_item, TaskConfig):
-                     task_cfg_item: dict = task_cfg_item.to_dict()
-                 candidate_task_cfgs.append(task_cfg_item)
+                 candidate_task_cfgs.append(parse_task_config(task_cfg_item))
          else:
-             raise ValueError(f'Invalid task_cfg: {task_cfg}')
+             candidate_task_cfgs.append(parse_task_config(task_cfg))
 
          for candidate_task in candidate_task_cfgs:
              logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
-             eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE
+             eval_backend = candidate_task.eval_backend
 
              if eval_backend == EvalBackend.NATIVE:
-                 outputs_dir: str = candidate_task.get('outputs')
-                 outputs_dir: str = os.path.expanduser(outputs_dir)
+                 outputs_dir: str = os.path.expanduser(candidate_task.work_dir)
                  if outputs_dir is None:
                      raise ValueError(f'No outputs_dir in {task_cfg}')
                  res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
@@ -128,8 +116,8 @@
          return final_res_list
 
      @staticmethod
-     def parse_eval_config(candidate_task):
-         eval_config: Union[str, dict] = candidate_task.get('eval_config')
+     def parse_eval_config(candidate_task: TaskConfig):
+         eval_config: Union[str, dict] = candidate_task.eval_config
          assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'
 
          if isinstance(eval_config, str):
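
After this change, the summarizer normalizes every accepted config form through parse_task_config and reads reports from the run's work_dir. A hedged sketch under those assumptions; the YAML path is hypothetical, and the printed keys assume the report JSON follows the Report dataclass shown earlier.

from evalscope.config import parse_task_config
from evalscope.summarizer import Summarizer

task_cfg = parse_task_config('configs/eval_qwen.yaml')        # hypothetical YAML task config
reports = Summarizer.get_report(outputs_dir=task_cfg.work_dir)  # list of report dicts loaded from **/*.json
for report in reports:
    print(report.get('name'), report.get('score'))
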
evalscope/third_party/longbench_write/eval.py CHANGED
@@ -10,7 +10,8 @@ import requests
  from concurrent.futures import ThreadPoolExecutor
  from tqdm import tqdm
 
- from evalscope.utils import get_logger, jsonl_to_list
+ from evalscope.utils import get_logger
+ from evalscope.utils.io_utils import jsonl_to_list
 
  logger = get_logger()
 
evalscope/third_party/longbench_write/longbench_write.py CHANGED
@@ -4,7 +4,8 @@ from typing import Union
 
  from evalscope.third_party.longbench_write.eval import run_eval
  from evalscope.third_party.longbench_write.infer import run_infer
- from evalscope.utils import get_logger, json_to_dict, yaml_to_dict
+ from evalscope.utils import get_logger
+ from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
  logger = get_logger()
 
evalscope/third_party/longbench_write/tools/data_etl.py CHANGED
@@ -6,7 +6,7 @@ from typing import List
 
  from evalscope.third_party.longbench_write.eval import EvalLength
  from evalscope.third_party.longbench_write.utils import chinese_to_arabic, count_words
- from evalscope.utils import jsonl_to_list
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
evalscope/third_party/toolbench_static/toolbench_static.py CHANGED
@@ -5,7 +5,8 @@ from typing import Union
 
  from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
  from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
- from evalscope.utils import get_logger, json_to_dict, yaml_to_dict
+ from evalscope.utils import get_logger
+ from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
  logger = get_logger()