evalscope 0.5.5__py3-none-any.whl → 0.5.5rc0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +1 -0
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +0 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -19
- evalscope/run.py +0 -4
- evalscope/utils/logger.py +14 -44
- evalscope/utils/task_utils.py +0 -3
- evalscope/version.py +2 -2
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/METADATA +30 -24
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/RECORD +14 -30
- evalscope/backend/rag_eval/__init__.py +0 -3
- evalscope/backend/rag_eval/backend_manager.py +0 -68
- evalscope/backend/rag_eval/cmteb/__init__.py +0 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +0 -59
- evalscope/backend/rag_eval/cmteb/base.py +0 -89
- evalscope/backend/rag_eval/cmteb/task_template.py +0 -83
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -302
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -252
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -113
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -153
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -345
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -302
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -64
- evalscope/backend/rag_eval/ragas/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/arguments.py +0 -37
- evalscope/backend/rag_eval/ragas/task_template.py +0 -117
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/WHEEL +0 -0
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/__init__.py
@@ -1,64 +0,0 @@
-from .Classification import *
-from .Clustering import *
-from .PairClassification import *
-from .Reranking import *
-from .Retrieval import *
-from .STS import *
-
-CLS_CLASSIFICATION = {
-    "TNews": TNews,
-    "IFlyTek": IFlyTek,
-    "MultilingualSentiment": MultilingualSentiment,
-    "JDReview": JDReview,
-    "OnlineShopping": OnlineShopping,
-    "Waimai": Waimai,
-}
-
-CLS_CLUSTERING = {
-    "CLSClusteringS2S": CLSClusteringFastS2S,
-    "CLSClusteringP2P": CLSClusteringFastP2P,
-    "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
-    "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
-}
-
-CLS_PAIR_CLASSIFICATION = {
-    "Ocnli": Ocnli,
-    "Cmnli": Cmnli,
-}
-
-CLS_RERANKING = {
-    "T2Reranking": T2Reranking,
-    "MMarcoReranking": MMarcoReranking,
-    "CMedQAv1": CMedQAv1,
-    "CMedQAv2": CMedQAv2,
-}
-
-CLS_RETRIEVAL = {
-    "T2Retrieval": T2Retrieval,
-    "MMarcoRetrieval": MMarcoRetrieval,
-    "DuRetrieval": DuRetrieval,
-    "CovidRetrieval": CovidRetrieval,
-    "CmedqaRetrieval": CmedqaRetrieval,
-    "EcomRetrieval": EcomRetrieval,
-    "MedicalRetrieval": MedicalRetrieval,
-    "VideoRetrieval": VideoRetrieval,
-}
-
-CLS_STS = {
-    "ATEC": ATEC,
-    "BQ": BQ,
-    "LCQMC": LCQMC,
-    "PAWSX": PAWSX,
-    "STSB": STSB,
-    "AFQMC": AFQMC,
-    "QBQTC": QBQTC,
-}
-
-CLS_DICT = {
-    **CLS_CLASSIFICATION,
-    **CLS_CLUSTERING,
-    **CLS_PAIR_CLASSIFICATION,
-    **CLS_RERANKING,
-    **CLS_RETRIEVAL,
-    **CLS_STS,
-}
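The removed file registered every CMTEB task class in a single CLS_DICT lookup table. A minimal usage sketch, assuming the 0.5.5 import path; the task names in the loop and the print are illustrative, not part of evalscope:

# Hypothetical sketch: resolve CMTEB task classes by name from the CLS_DICT
# registry built in the removed __init__.py (assumes evalscope 0.5.5 installed).
from evalscope.backend.rag_eval.cmteb.tasks import CLS_DICT

for name in ("TNews", "T2Retrieval", "ATEC"):  # illustrative task names
    task_cls = CLS_DICT[name]                  # map task name -> task class
    print(f"{name} -> {task_cls.__name__}")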
evalscope/backend/rag_eval/ragas/arguments.py
@@ -1,37 +0,0 @@
-from dataclasses import dataclass, field
-from typing import List, Optional, Union, Dict, Any
-
-
-@dataclass
-class TestsetGenerationArguments:
-    docs: List[str] = field(default_factory=list)
-    test_size: int = 10
-    output_file: str = "outputs/testset.json"
-    """
-    For local LLM support, you can use the following fields:
-        model_name_or_path: str
-        model_revision: str = "master"
-        template_type: str = "default"
-        generation_config: Optional[Dict]
-
-    For API LLM support, you can use the following fields:
-        model_name="gpt-4o-mini"
-        api_base: str = "",
-        api_key: Optional[str] = None
-    """
-    generator_llm: Dict = field(default_factory=dict)
-    critic_llm: Dict = field(default_factory=dict)
-    embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(
-        default_factory=lambda: {"simple": 0.5, "multi_context": 0.4, "reasoning": 0.1}
-    )
-
-
-@dataclass
-class EvaluationArguments:
-    testset_file: str
-    critic_llm: Dict = field(default_factory=dict)
-    embeddings: Dict = field(default_factory=dict)
-    metrics: List[str] = field(
-        default_factory=lambda: ["answer_relevancy", "faithfulness"]
-    )
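These dataclasses carried the configuration for ragas test-set generation and evaluation. A minimal sketch of how they might be filled in, assuming an OpenAI-compatible API model and the 0.5.5 module layout; every concrete value below is a placeholder, not something taken from evalscope:

# Hypothetical configuration sketch: field names follow the dataclasses above,
# dict keys follow their docstring, and all values are placeholders.
from evalscope.backend.rag_eval.ragas.arguments import (
    TestsetGenerationArguments,
    EvaluationArguments,
)

api_model = {"model_name": "gpt-4o-mini", "api_base": "", "api_key": None}
embedding = {"model_name_or_path": "some-embedding-model"}

gen_args = TestsetGenerationArguments(
    docs=["docs/guide.md"],              # placeholder source document
    test_size=5,                         # number of generated questions
    output_file="outputs/testset.json",
    generator_llm=api_model,
    critic_llm=api_model,
    embeddings=embedding,
)

eval_args = EvaluationArguments(
    testset_file="outputs/testset.json",
    critic_llm=api_model,
    embeddings=embedding,
    metrics=["answer_relevancy", "faithfulness"],
)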
evalscope/backend/rag_eval/ragas/task_template.py
@@ -1,117 +0,0 @@
-import os
-from evalscope.backend.rag_eval import EmbeddingModel, LLM
-from evalscope.utils.logger import get_logger
-from .arguments import TestsetGenerationArguments, EvaluationArguments
-
-logger = get_logger()
-
-
-def testset_generation(args: TestsetGenerationArguments) -> None:
-    from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
-    from ragas.testset.generator import TestsetGenerator
-    from ragas.testset.evolutions import simple, reasoning, multi_context
-    from ragas import RunConfig
-
-    # load data
-    file_path = args.docs
-    loader = UnstructuredFileLoader(file_path, mode="elements")
-    data = loader.load()
-
-    # generator with models
-    generator_llm = LLM.load(**args.generator_llm)
-    critic_llm = LLM.load(**args.critic_llm)
-    embeddings = EmbeddingModel.load(**args.embeddings)
-
-    # Change resulting question type distribution
-    distributions = {
-        simple: args.distribution["simple"],
-        multi_context: args.distribution["multi_context"],
-        reasoning: args.distribution["reasoning"],
-    }
-
-    generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)
-
-    runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
-    testset = generator.generate_with_langchain_docs(
-        data,
-        args.test_size,
-        distributions,
-        with_debugging_logs=True,
-        is_async=False,
-        run_config=runconfig,
-    )
-
-    # save file
-    testset_df = testset.to_pandas()
-    output_path = os.path.dirname(args.output_file)
-    os.makedirs(output_path, exist_ok=True)
-    testset_df.to_json(args.output_file, indent=4, index=False, orient="records")
-
-    # get answer
-    testset_with_answer = get_answer(testset_df, generator_llm)
-    testset_with_answer.to_json(
-        args.output_file, indent=4, index=False, orient="records"
-    )
-
-
-def get_answer(testset_df, generator_llm):
-    template = """You are an assistant for question-answering tasks.
-    Use the following pieces of retrieved context to answer the question.
-    If you don't know the answer, just say that you don't know.
-    Use two sentences maximum and keep the answer concise.
-    Question: {question}
-    Context: {contexts}
-    Answer:
-    """
-    answers = []
-    for index, row in testset_df.iterrows():
-        question = row["question"]
-        contexts = "\n".join(row["contexts"])
-
-        # Combine question and contexts as input for the LLM
-        input_text = template.format(question=question, contexts=contexts)
-
-        # Generate the answer using the generator LLM
-        answer = generator_llm.invoke(input_text)
-        answers.append(answer)
-
-    testset_df["answer"] = answers
-    return testset_df
-
-
-def rag_eval(
-    args: EvaluationArguments,
-) -> None:
-    from datasets import Dataset
-    from ragas import evaluate
-    from evalscope.backend.rag_eval import EmbeddingModel, LLM
-    from ragas import RunConfig
-    import importlib
-
-    def dynamic_import(module_name, *function_names):
-        # dynamically import the specified module
-        module = importlib.import_module(module_name)
-
-        functions = [getattr(module, name) for name in function_names]
-        return functions
-
-    llm = LLM.load(**args.critic_llm)
-    embedding = EmbeddingModel.load(**args.embeddings)
-
-    dataset = Dataset.from_json(args.testset_file)
-
-    runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
-    score = evaluate(
-        dataset,
-        metrics=dynamic_import("ragas.metrics", *args.metrics),
-        llm=llm,
-        embeddings=embedding,
-        run_config=runconfig,
-    )
-    score_df = score.to_pandas()
-    # logger.info(score_df.to_string())
-
-    output_path = args.testset_file.split(".")[0] + "_score.json"
-    score_df.to_json(output_path, indent=4, index=False, orient="records")
-
-    logger.info(f"Eval score saved to {output_path}")
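The removed rag_eval function resolved metric names into ragas metric objects at call time via its dynamic_import helper. A self-contained sketch of the same pattern, demonstrated on a standard-library module so it runs without ragas installed (the math example is illustrative only):

# Minimal sketch of the dynamic_import pattern used in the removed rag_eval,
# applied to the standard-library "math" module instead of ragas.metrics.
import importlib

def dynamic_import(module_name, *function_names):
    # import the module by name, then pull out the requested attributes
    module = importlib.import_module(module_name)
    return [getattr(module, name) for name in function_names]

sqrt, floor = dynamic_import("math", "sqrt", "floor")
print(sqrt(2.0), floor(2.7))  # 1.4142135623730951 2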
File without changes: {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/WHEEL
File without changes: {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/entry_points.txt
File without changes: {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/top_level.txt