evalscope 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/tasks/eval_datasets.py +2 -2
- evalscope/backend/rag_eval/__init__.py +3 -0
- evalscope/backend/rag_eval/backend_manager.py +68 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +59 -0
- evalscope/backend/rag_eval/cmteb/base.py +89 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +83 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +153 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +64 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +37 -0
- evalscope/backend/rag_eval/ragas/task_template.py +117 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +1 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
- evalscope/run.py +4 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/METADATA +26 -32
- {evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/RECORD +31 -15
- {evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/WHEEL +0 -0
- {evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/__init__.py ADDED
@@ -0,0 +1,64 @@
+from .Classification import *
+from .Clustering import *
+from .PairClassification import *
+from .Reranking import *
+from .Retrieval import *
+from .STS import *
+
+CLS_CLASSIFICATION = {
+    "TNews": TNews,
+    "IFlyTek": IFlyTek,
+    "MultilingualSentiment": MultilingualSentiment,
+    "JDReview": JDReview,
+    "OnlineShopping": OnlineShopping,
+    "Waimai": Waimai,
+}
+
+CLS_CLUSTERING = {
+    "CLSClusteringS2S": CLSClusteringFastS2S,
+    "CLSClusteringP2P": CLSClusteringFastP2P,
+    "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
+    "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
+}
+
+CLS_PAIR_CLASSIFICATION = {
+    "Ocnli": Ocnli,
+    "Cmnli": Cmnli,
+}
+
+CLS_RERANKING = {
+    "T2Reranking": T2Reranking,
+    "MMarcoReranking": MMarcoReranking,
+    "CMedQAv1": CMedQAv1,
+    "CMedQAv2": CMedQAv2,
+}
+
+CLS_RETRIEVAL = {
+    "T2Retrieval": T2Retrieval,
+    "MMarcoRetrieval": MMarcoRetrieval,
+    "DuRetrieval": DuRetrieval,
+    "CovidRetrieval": CovidRetrieval,
+    "CmedqaRetrieval": CmedqaRetrieval,
+    "EcomRetrieval": EcomRetrieval,
+    "MedicalRetrieval": MedicalRetrieval,
+    "VideoRetrieval": VideoRetrieval,
+}
+
+CLS_STS = {
+    "ATEC": ATEC,
+    "BQ": BQ,
+    "LCQMC": LCQMC,
+    "PAWSX": PAWSX,
+    "STSB": STSB,
+    "AFQMC": AFQMC,
+    "QBQTC": QBQTC,
+}
+
+CLS_DICT = {
+    **CLS_CLASSIFICATION,
+    **CLS_CLUSTERING,
+    **CLS_PAIR_CLASSIFICATION,
+    **CLS_RERANKING,
+    **CLS_RETRIEVAL,
+    **CLS_STS,
+}
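The new cmteb/tasks/__init__.py above registers every CMTEB task class under a plain-string name and merges them into CLS_DICT. A minimal sketch of how such a registry can be used to resolve a task by name (assuming evalscope and its mteb dependency are installed; "T2Retrieval" is simply one of the keys defined above):

```python
# Sketch: look up a CMTEB task class from the registry shown above.
from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT

task_cls = CLS_DICT["T2Retrieval"]   # one of the keys in CLS_RETRIEVAL
print(task_cls.__name__)
```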
evalscope/backend/rag_eval/ragas/arguments.py ADDED
@@ -0,0 +1,37 @@
+from dataclasses import dataclass, field
+from typing import List, Optional, Union, Dict, Any
+
+
+@dataclass
+class TestsetGenerationArguments:
+    docs: List[str] = field(default_factory=list)
+    test_size: int = 10
+    output_file: str = "outputs/testset.json"
+    """
+    For local LLM support, you can use the following fields:
+        model_name_or_path: str
+        model_revision: str = "master"
+        template_type: str = "default"
+        generation_config: Optional[Dict]
+
+    For API LLM support, you can use the following fields:
+        model_name="gpt-4o-mini"
+        api_base: str = "",
+        api_key: Optional[str] = None
+    """
+    generator_llm: Dict = field(default_factory=dict)
+    critic_llm: Dict = field(default_factory=dict)
+    embeddings: Dict = field(default_factory=dict)
+    distribution: str = field(
+        default_factory=lambda: {"simple": 0.5, "multi_context": 0.4, "reasoning": 0.1}
+    )
+
+
+@dataclass
+class EvaluationArguments:
+    testset_file: str
+    critic_llm: Dict = field(default_factory=dict)
+    embeddings: Dict = field(default_factory=dict)
+    metrics: List[str] = field(
+        default_factory=lambda: ["answer_relevancy", "faithfulness"]
+    )
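The two dataclasses above carry the settings for RAGAS testset generation and scoring. A hedged sketch of how they might be populated: the generator_llm / critic_llm / embeddings dicts are unpacked into LLM.load() and EmbeddingModel.load() (see task_template.py below), the key names follow the API-style fields listed in the docstring, and every concrete value (documents, model names, endpoints, keys) is a placeholder, not something this diff prescribes.

```python
# Sketch only: field names come from the dataclasses above; all concrete
# values below (documents, model names, endpoints, keys) are placeholders.
from evalscope.backend.rag_eval.ragas.arguments import (
    EvaluationArguments,
    TestsetGenerationArguments,
)

gen_args = TestsetGenerationArguments(
    docs=["data/knowledge_base.md"],
    test_size=10,
    output_file="outputs/testset.json",
    generator_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
    critic_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
    embeddings={"model_name_or_path": "AI-ModelScope/bge-large-zh"},  # assumed key
    distribution={"simple": 0.5, "multi_context": 0.4, "reasoning": 0.1},
)

eval_args = EvaluationArguments(
    testset_file="outputs/testset.json",
    critic_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
    embeddings={"model_name_or_path": "AI-ModelScope/bge-large-zh"},  # assumed key
    metrics=["answer_relevancy", "faithfulness"],
)
```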
evalscope/backend/rag_eval/ragas/task_template.py ADDED
@@ -0,0 +1,117 @@
+import os
+from evalscope.backend.rag_eval import EmbeddingModel, LLM
+from evalscope.utils.logger import get_logger
+from .arguments import TestsetGenerationArguments, EvaluationArguments
+
+logger = get_logger()
+
+
+def testset_generation(args: TestsetGenerationArguments) -> None:
+    from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
+    from ragas.testset.generator import TestsetGenerator
+    from ragas.testset.evolutions import simple, reasoning, multi_context
+    from ragas import RunConfig
+
+    # load data
+    file_path = args.docs
+    loader = UnstructuredFileLoader(file_path, mode="elements")
+    data = loader.load()
+
+    # generator with models
+    generator_llm = LLM.load(**args.generator_llm)
+    critic_llm = LLM.load(**args.critic_llm)
+    embeddings = EmbeddingModel.load(**args.embeddings)
+
+    # Change resulting question type distribution
+    distributions = {
+        simple: args.distribution["simple"],
+        multi_context: args.distribution["multi_context"],
+        reasoning: args.distribution["reasoning"],
+    }
+
+    generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)
+
+    runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
+    testset = generator.generate_with_langchain_docs(
+        data,
+        args.test_size,
+        distributions,
+        with_debugging_logs=True,
+        is_async=False,
+        run_config=runconfig,
+    )
+
+    # save file
+    testset_df = testset.to_pandas()
+    output_path = os.path.dirname(args.output_file)
+    os.makedirs(output_path, exist_ok=True)
+    testset_df.to_json(args.output_file, indent=4, index=False, orient="records")
+
+    # get answer
+    testset_with_answer = get_answer(testset_df, generator_llm)
+    testset_with_answer.to_json(
+        args.output_file, indent=4, index=False, orient="records"
+    )
+
+
+def get_answer(testset_df, generator_llm):
+    template = """You are an assistant for question-answering tasks.
+    Use the following pieces of retrieved context to answer the question.
+    If you don't know the answer, just say that you don't know.
+    Use two sentences maximum and keep the answer concise.
+    Question: {question}
+    Context: {contexts}
+    Answer:
+    """
+    answers = []
+    for index, row in testset_df.iterrows():
+        question = row["question"]
+        contexts = "\n".join(row["contexts"])
+
+        # Combine question and contexts as input for the LLM
+        input_text = template.format(question=question, contexts=contexts)
+
+        # Generate the answer using the generator LLM
+        answer = generator_llm.invoke(input_text)
+        answers.append(answer)
+
+    testset_df["answer"] = answers
+    return testset_df
+
+
+def rag_eval(
+    args: EvaluationArguments,
+) -> None:
+    from datasets import Dataset
+    from ragas import evaluate
+    from evalscope.backend.rag_eval import EmbeddingModel, LLM
+    from ragas import RunConfig
+    import importlib
+
+    def dynamic_import(module_name, *function_names):
+        # Dynamically import the specified module
+        module = importlib.import_module(module_name)
+
+        functions = [getattr(module, name) for name in function_names]
+        return functions
+
+    llm = LLM.load(**args.critic_llm)
+    embedding = EmbeddingModel.load(**args.embeddings)
+
+    dataset = Dataset.from_json(args.testset_file)
+
+    runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
+    score = evaluate(
+        dataset,
+        metrics=dynamic_import("ragas.metrics", *args.metrics),
+        llm=llm,
+        embeddings=embedding,
+        run_config=runconfig,
+    )
+    score_df = score.to_pandas()
+    # logger.info(score_df.to_string())
+
+    output_path = args.testset_file.split(".")[0] + "_score.json"
+    score_df.to_json(output_path, indent=4, index=False, orient="records")
+
+    logger.info(f"Eval score saved to {output_path}")
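Taken together with the arguments sketch above, the intended flow of this template is: generate a testset from the source documents, answer each generated question with the generator LLM, then score the result with the selected RAGAS metrics. A usage sketch, assuming gen_args / eval_args were built as in the earlier example and that ragas, langchain-community, and unstructured are installed:

```python
# Sketch: end-to-end use of the new ragas task template, assuming gen_args
# and eval_args were built as in the earlier arguments example.
from evalscope.backend.rag_eval.ragas.task_template import rag_eval, testset_generation

testset_generation(gen_args)  # writes outputs/testset.json with questions, contexts and answers
rag_eval(eval_args)           # writes outputs/testset_score.json with the chosen ragas metrics
```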
evalscope/backend/vlm_eval_kit/custom_dataset.py CHANGED
@@ -8,7 +8,7 @@ class CustomDataset:
 
     def load_data(self, dataset):
         # customize the loading of the dataset
-        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+        data_path = os.path.join(os.path.expanduser("~/LMUData"), f'{dataset}.tsv')
         return load(data_path)
 
 
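The one-line change above matters because os.path.join never expands "~"; only os.path.expanduser turns it into the real home directory. A quick illustration (the dataset name is just a placeholder):

```python
import os

# Without expanduser the literal "~" stays in the path, so the file is not found:
os.path.join("~/LMUData", "MMBench.tsv")
# -> '~/LMUData/MMBench.tsv'

# With expanduser the path resolves under the actual home directory:
os.path.join(os.path.expanduser("~/LMUData"), "MMBench.tsv")
# -> e.g. '/home/user/LMUData/MMBench.tsv'
```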
evalscope/evaluator/evaluator.py CHANGED
@@ -174,6 +174,7 @@ class Evaluator(object):
         """
         assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
+        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
         answers_list = []
         pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -31,6 +31,7 @@ In these examples settings.xml lists input files and formats.
 from __future__ import absolute_import, division, print_function
 import collections
 import re
+import os
 
 import nltk
 import numpy as np
@@ -38,6 +39,24 @@ import six
 from absl import logging
 from rouge_score import scoring, tokenizers
 from six.moves import map, range
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+# Deal with nltk punkt_tab.zip tokenizer file to avoid downloading issue
+try:
+    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+    os.makedirs(nltk_dir, exist_ok=True)
+    punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+    punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+    if not os.path.exists(punkt_path):
+        os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
+        os.system(f'unzip {punkt_path} -d {nltk_dir}')
+    else:
+        logger.info(f'{punkt_path} already exists, skipping download')
+except Exception as e:
+    logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
 
 class RougeScorer(scoring.BaseScorer):
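The new import-time block pre-fetches NLTK's punkt_tab tokenizer archive from a ModelScope mirror so that ROUGE scoring does not stall on the default download. Given the nltk (>=3.9) pin added in this release, the same resource can usually also be obtained through NLTK itself; a sketch, assuming network access to the default NLTK index:

```python
import nltk

# Fetches the punkt_tab tokenizer data into ~/nltk_data if it is missing.
nltk.download("punkt_tab")
```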
evalscope/run.py CHANGED
@@ -207,6 +207,10 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
         vlm_eval_kit_backend_manager.run()
+    elif eval_backend == EvalBackend.RAG_EVAL.value:
+        from evalscope.backend.rag_eval import RAGEvalBackendManager
+        rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
+        rag_eval_backend_manager.run()
     # TODO: Add other evaluation backends
     elif eval_backend == EvalBackend.THIRD_PARTY.value:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
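With this branch in place, a RAG evaluation is dispatched through the regular run_task entry point by selecting the new backend. A sketch of the shape of such a task config; the eval_backend / eval_config keys are inferred from the surrounding run.py code, and the contents of eval_config are defined by RAGEvalBackendManager, which is not shown in this diff:

```python
# Sketch only: 'RAGEval' is EvalBackend.RAG_EVAL.value; the nested eval_config
# is illustrative, its real schema comes from RAGEvalBackendManager.
from evalscope.run import run_task

task_cfg = {
    "eval_backend": "RAGEval",
    "eval_config": {
        # backend-specific settings (CMTEB tasks or RAGAS testset/eval arguments)
    },
}

run_task(task_cfg=task_cfg)
```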
evalscope/utils/logger.py CHANGED
@@ -1,18 +1,20 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import importlib.util as iutil
 import logging
 from typing import Optional
 
 init_loggers = {}
+format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+formatter = logging.Formatter(format)
 
-
-    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logging.basicConfig(format=format, level=logging.INFO)
 
 
-def get_logger(
-
-
-    """
+def get_logger(
+    log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = "w"
+):
+    """Get logging logger
 
     Args:
         log_file: Log filename, if specified, file handler will be added to
@@ -22,21 +24,39 @@ def get_logger(log_file: Optional[str] = None,
            specified (if filemode is unspecified, it defaults to 'w').
    """
 
-    logger_name = __name__.split(
+    logger_name = __name__.split(".")[0]
     logger = logging.getLogger(logger_name)
-
+    logger.propagate = False
     if logger_name in init_loggers:
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
+        if logger.level != log_level:
+            logger.setLevel(log_level)
         return logger
 
-
-
-
+    # handle duplicate logs to the console
+    # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
+    # to the root logger. As logger.propagate is True by default, this root
+    # level handler causes logging messages from rank>0 processes to
+    # unexpectedly show up on the console, creating much unwanted clutter.
+    # To fix this issue, we set the root logger's StreamHandler, if any, to log
+    # at the ERROR level.
+    torch_dist = False
+    is_worker0 = True
+    if iutil.find_spec("torch") is not None:
+        from modelscope.utils.torch_utils import is_dist, is_master
+
+        torch_dist = is_dist()
+        is_worker0 = is_master()
+
+    if torch_dist:
+        for handler in logger.root.handlers:
+            if type(handler) is logging.StreamHandler:
+                handler.setLevel(logging.ERROR)
 
     stream_handler = logging.StreamHandler()
     handlers = [stream_handler]
 
-    if log_file is not None:
+    if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         handlers.append(file_handler)
 
@@ -45,7 +65,10 @@ def get_logger(log_file: Optional[str] = None,
         handler.setLevel(log_level)
         logger.addHandler(handler)
 
-
+    if is_worker0:
+        logger.setLevel(log_level)
+    else:
+        logger.setLevel(logging.ERROR)
 
     init_loggers[logger_name] = True
 
@@ -57,7 +80,14 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
         if isinstance(handler, logging.FileHandler):
             return
 
-    if
+    if iutil.find_spec("torch") is not None:
+        from modelscope.utils.torch_utils import is_master
+
+        is_worker0 = is_master()
+    else:
+        is_worker0 = True
+
+    if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         file_handler.setFormatter(formatter)
         file_handler.setLevel(log_level)
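The reworked get_logger keeps its previous call pattern: repeated calls return the cached package logger, re-levelled and given a file handler only when needed. A short usage sketch based on the signature shown above (the log file name is a placeholder):

```python
import logging
from evalscope.utils.logger import get_logger

logger = get_logger()                                               # console logging at INFO
logger = get_logger(log_file="eval.log", log_level=logging.DEBUG)   # also write to a file
logger.info("evaluation started")
```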
evalscope/utils/task_utils.py CHANGED
@@ -11,6 +11,9 @@ class EvalBackend(Enum):
 
     # Use VLM Eval Kit as the multi-modal model evaluation backend
     VLM_EVAL_KIT = 'VLMEvalKit'
+
+    # Use RAGEval as the RAG evaluation backend
+    RAG_EVAL = 'RAGEval'
 
     # Use third-party evaluation backend/modules
     THIRD_PARTY = 'ThirdParty'
evalscope/version.py CHANGED

{evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.4
+Version: 0.5.5
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -24,7 +24,7 @@ Requires-Dist: editdistance
 Requires-Dist: jsonlines
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
-Requires-Dist: nltk
+Requires-Dist: nltk (>=3.9)
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
@@ -34,7 +34,7 @@ Requires-Dist: pyyaml
 Requires-Dist: regex
 Requires-Dist: requests
 Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score (>=0.1.0)
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
@@ -57,7 +57,7 @@ Requires-Dist: editdistance ; extra == 'all'
 Requires-Dist: jsonlines ; extra == 'all'
 Requires-Dist: matplotlib ; extra == 'all'
 Requires-Dist: modelscope[framework] ; extra == 'all'
-Requires-Dist: nltk ; extra == 'all'
+Requires-Dist: nltk (>=3.9) ; extra == 'all'
 Requires-Dist: openai ; extra == 'all'
 Requires-Dist: pandas ; extra == 'all'
 Requires-Dist: plotly ; extra == 'all'
@@ -67,7 +67,7 @@ Requires-Dist: pyyaml ; extra == 'all'
 Requires-Dist: regex ; extra == 'all'
 Requires-Dist: requests ; extra == 'all'
 Requires-Dist: requests-toolbelt ; extra == 'all'
-Requires-Dist: rouge-score ; extra == 'all'
+Requires-Dist: rouge-score (>=0.1.0) ; extra == 'all'
 Requires-Dist: sacrebleu ; extra == 'all'
 Requires-Dist: scikit-learn ; extra == 'all'
 Requires-Dist: seaborn ; extra == 'all'
@@ -80,8 +80,10 @@ Requires-Dist: transformers (>=4.33) ; extra == 'all'
 Requires-Dist: transformers-stream-generator ; extra == 'all'
 Requires-Dist: jieba ; extra == 'all'
 Requires-Dist: rouge-chinese ; extra == 'all'
-Requires-Dist: ms-opencompass (>=0.1.
+Requires-Dist: ms-opencompass (>=0.1.1) ; extra == 'all'
 Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'all'
+Requires-Dist: ragas ; extra == 'all'
+Requires-Dist: mteb (>=0.14.16) ; extra == 'all'
 Provides-Extra: inner
 Requires-Dist: absl-py ; extra == 'inner'
 Requires-Dist: accelerate ; extra == 'inner'
@@ -109,7 +111,10 @@ Requires-Dist: tqdm ; extra == 'inner'
 Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'inner'
 Requires-Dist: transformers-stream-generator ; extra == 'inner'
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass (>=0.1.
+Requires-Dist: ms-opencompass (>=0.1.1) ; extra == 'opencompass'
+Provides-Extra: rag
+Requires-Dist: ragas ; extra == 'rag'
+Requires-Dist: mteb (>=0.14.16) ; extra == 'rag'
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
 
@@ -144,30 +149,11 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-
-
-### Framework Features
-- **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
-- **Evaluation Metrics**: Implements various commonly used evaluation metrics.
-- **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
-- **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
-- **Evaluation Reports**: Automatically generates evaluation reports.
-- **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
-  - **Single mode**: Scoring a single model.
-  - **Pairwise-baseline mode**: Comparing against a baseline model.
-  - **Pairwise (all) mode**: Pairwise comparison among all models.
-- **Visualization Tools**: Provides intuitive displays of evaluation results.
-- **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
-- **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
-- **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-- **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
-
-
-<details><summary>Overall Architecture</summary>
+EvalScope is the official model evaluation and performance benchmarking framework launched by the [ModelScope](https://modelscope.cn/) community. It comes with built-in common benchmarks and evaluation metrics, such as MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, and HumanEval. EvalScope supports various types of model evaluations, including LLMs, multimodal LLMs, embedding models, and reranker models. It is also applicable to multiple evaluation scenarios, such as end-to-end RAG evaluation, arena mode, and model inference performance stress testing. Moreover, with the seamless integration of the ms-swift training framework, evaluations can be initiated with a single click, providing full end-to-end support from model training to evaluation 🚀
 
 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-<br>
+<br>EvalScope Framework.
 </p>
 
 The architecture includes the following modules:
@@ -177,14 +163,15 @@ The architecture includes the following modules:
 - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
 - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
 - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+- **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
-</details>
 
 
 ## 🎉 News
+- 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -355,9 +342,10 @@ run_task(task_cfg=your_task_cfg)
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
-- **
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
+- **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
+- **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
 
 ## Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -386,6 +374,8 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
 ## TO-DO List
+- [x] RAG evaluation
+- [x] VLM evaluation
 - [x] Agents evaluation
 - [x] vLLM
 - [ ] Distributed evaluating
@@ -397,3 +387,7 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
+
+## Star History
+
+[](https://star-history.com/#modelscope/evalscope&Date)
{evalscope-0.5.4.dist-info → evalscope-0.5.5.dist-info}/RECORD CHANGED
@@ -2,22 +2,38 @@ evalscope/__init__.py,sha256=3eLMMrjkAIAs3vGluXNZn5-xTSbO_vfba9yNPbkVtg8,105
 evalscope/cache.py,sha256=zpGjL9JMosqjk_dkODVwvIGiUC0WAMmMTHDNJOvBQU8,3288
 evalscope/config.py,sha256=G_rpSn5Kd1aPlFJO6asnZu5FUggZmwcYdAxxpuq0yDs,6972
 evalscope/constants.py,sha256=g8lGYlpA4Wk88HwtqId1-jJX_z8Lr2k02gWLsyofyj0,2670
-evalscope/run.py,sha256=
+evalscope/run.py,sha256=uAXtaxIBcR94jyfHGFAecuzn0y71oLgu-d9VOohCJAw,18738
 evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
-evalscope/version.py,sha256=
-evalscope/backend/__init__.py,sha256=
+evalscope/version.py,sha256=UZ6qDTtcyaqkwS2_IkU2Kzop4lG9AL9cYpfEpYzfCrc,118
+evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
 evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
-evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=
+evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=bYFHkjiwZqh2FVRo1I88xEDZ6nYmZjAgG5ZODbthKFI,5241
+evalscope/backend/rag_eval/__init__.py,sha256=yRCcfxhzC7wIYbgb-w76i4D9v8wXI7JmYNM6IZUn064,199
+evalscope/backend/rag_eval/backend_manager.py,sha256=tWkFzYO9LQjaI7paD5yz1c-HtNJUbnAr0a-4biYSZvg,2562
+evalscope/backend/rag_eval/cmteb/__init__.py,sha256=ajVz6XP5hqPq-jm66hp2poA2qKj1V19ZGoqjrGUlO7U,279
+evalscope/backend/rag_eval/cmteb/arguments.py,sha256=698UvPVZp5Ncq_p25P_67SQkYaW2tLSCHenUOZ0n5OI,2217
+evalscope/backend/rag_eval/cmteb/base.py,sha256=sJqTRCej7vk5ASirk21hOobX1_Hz7BO1LIHJFOGLuE4,2731
+evalscope/backend/rag_eval/cmteb/task_template.py,sha256=HreVxyRiF2QUe4Dy9_zKNp1WU74342RWHV5_B8ycXG0,2537
+evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=7adR40W6Uu58-QR9jCUP4k7TdAnG0oT225v4xHXah2g,10635
+evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=7j1Hts_r4Nv8DlbIiPFMaU1JDxCYgu0wO0JI8T_Y6X8,8969
+evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=2WkaTE-jF8jqsu1UcNDqN8A4567UzW5boD_0B83j-9A,4008
+evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=50h-lXaRcb5s6ZpIgnfk5mU7iZur8ZDxwsaFbrqSZ_o,5462
+evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=wUxiQH5aOmWNS4YswACyHqBn5xqP5eyvsq6U9WSp5R0,11457
+evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=6GMaoCANM-IKYLk4srHOYr_eurav3DGihHMQeJPXR6k,12054
+evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=FrtoBosHq9iRp3yfZEAxWa5NkYhHtA20NmHDG6eiPNU,1421
+evalscope/backend/rag_eval/ragas/__init__.py,sha256=tHB7XGREmcrx8ulF-JZWWoHsEbn2s-PFyWFdGzOZQcw,190
+evalscope/backend/rag_eval/ragas/arguments.py,sha256=plVc2_3auVG5z91ExzBdkbNIhMVjyi_xQYbEzlV0iNw,1208
+evalscope/backend/rag_eval/ragas/task_template.py,sha256=795NHXzGdeqa15ONV1AgDZywpMHucaIlvk_EBF0CK98,3868
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ewhpE9yzsqf5ED6kqsqek2YEgg96GBQOupxtVNhaXxI,6046
+evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=Yz2A5kB1E8DYBnjuVCA6TTPtLjhg8vYKeJTh6FU_Ecw,1645
 evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
 evalscope/benchmarks/benchmark.py,sha256=EmwYyFdrAHBGMkSbsMZQOR_62Q0CSKl8zeLlr7xvJdQ,2159
 evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
@@ -91,7 +107,7 @@ evalscope/cli/cli.py,sha256=uZ-qC8WBsLd5-Hn94d43sSGg0UC_12RebSD4ToKjypg,844
 evalscope/cli/start_perf.py,sha256=TL6bMXYl3ln-tfs5uBmzb9x94uxz6f3PBFIt1l7g3VA,994
 evalscope/cli/start_server.py,sha256=ATGLP2TE0aImJNicpehdzBuFlNb50F7KhyL4A_ZSoGU,3885
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=eSCgPPDGfIJfKu0cthhbDLFm1xMhj_869iT3ngcQkPc,30817
 evalscope/evaluator/rating_eval.py,sha256=cJbkyXIuwFUZoe7ZJZM6eUskNd9zlORgndckuon2OQ8,5768
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=JycPYti9h1j_8DRcu_rc5U0wkEASHYg-XBqrUUoiO-Q,17054
@@ -101,7 +117,7 @@ evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lX
 evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
 evalscope/metrics/rouge_metric.py,sha256=sN0r-sXXc-nJUdFrthQPAv1VFdOCrF6zzIYDKaLSgrU,4522
 evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
-evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
+evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=vhzIMSQezhZuJzGndymWjB_iRbDdECoEidOIdNL3NAM,12213
 evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
 evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
 evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
@@ -169,12 +185,12 @@ evalscope/tools/rewrite_eval_results.py,sha256=ZVi2hVjiTOmR_O5IaLv6qnQNpMz6FnDb9
 evalscope/utils/__init__.py,sha256=6RjACRYUSpGj6fkZ7NzYpl0lFppQCp9KVn5ktZe626s,128
 evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZmg,7670
 evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
-evalscope/utils/logger.py,sha256=
+evalscope/utils/logger.py,sha256=cf3U400Mx1speMMNXorjwEE8noDz5Mbd-9PNgaulGeY,3013
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
-evalscope/utils/task_utils.py,sha256=
+evalscope/utils/task_utils.py,sha256=IMtBSBUp3H95Ko0vn8Q55Wmz2SFZXSfjVy49tyomL_g,537
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
+evalscope-0.5.5.dist-info/METADATA,sha256=scs7UaBcWE2qpewo_oe6ZB8HX5CtbohPBvom6UjUY5w,20943
+evalscope-0.5.5.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+evalscope-0.5.5.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.5.5.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.5.5.dist-info/RECORD,,