evalscope 0.5.5__py3-none-any.whl → 0.5.5rc1__py3-none-any.whl

This diff shows the changes between package versions that have been publicly released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (29)
  1. evalscope/backend/__init__.py +3 -0
  2. evalscope/backend/vlm_eval_kit/backend_manager.py +1 -0
  3. evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
  4. evalscope/evaluator/evaluator.py +0 -1
  5. evalscope/run.py +0 -4
  6. evalscope/utils/logger.py +14 -44
  7. evalscope/utils/task_utils.py +0 -3
  8. evalscope/version.py +2 -2
  9. {evalscope-0.5.5.dist-info → evalscope-0.5.5rc1.dist-info}/METADATA +30 -24
  10. {evalscope-0.5.5.dist-info → evalscope-0.5.5rc1.dist-info}/RECORD +13 -29
  11. evalscope/backend/rag_eval/__init__.py +0 -3
  12. evalscope/backend/rag_eval/backend_manager.py +0 -68
  13. evalscope/backend/rag_eval/cmteb/__init__.py +0 -4
  14. evalscope/backend/rag_eval/cmteb/arguments.py +0 -59
  15. evalscope/backend/rag_eval/cmteb/base.py +0 -89
  16. evalscope/backend/rag_eval/cmteb/task_template.py +0 -83
  17. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -302
  18. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -252
  19. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -113
  20. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -153
  21. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -345
  22. evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -302
  23. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -64
  24. evalscope/backend/rag_eval/ragas/__init__.py +0 -2
  25. evalscope/backend/rag_eval/ragas/arguments.py +0 -37
  26. evalscope/backend/rag_eval/ragas/task_template.py +0 -117
  27. {evalscope-0.5.5.dist-info → evalscope-0.5.5rc1.dist-info}/WHEEL +0 -0
  28. {evalscope-0.5.5.dist-info → evalscope-0.5.5rc1.dist-info}/entry_points.txt +0 -0
  29. {evalscope-0.5.5.dist-info → evalscope-0.5.5rc1.dist-info}/top_level.txt +0 -0
@@ -1,302 +0,0 @@
- from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
- from mteb.abstasks.TaskMetadata import TaskMetadata
-
- class ATEC(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="ATEC",
-         dataset={
-             "path": "C-MTEB/ATEC",
-             "revision": "0f319b1142f28d00e055a6770f3f726ae9b7d865",
-         },
-         description="A Chinese dataset for textual relatedness",
-         reference="https://aclanthology.org/2021.emnlp-main.357",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["validation", "test"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation="""@inproceedings{raghu-etal-2021-end,
-             title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
-             author = "Raghu, Dinesh and
-                 Agarwal, Shantanu and
-                 Joshi, Sachindra and
-                 {Mausam}",
-             editor = "Moens, Marie-Francine and
-                 Huang, Xuanjing and
-                 Specia, Lucia and
-                 Yih, Scott Wen-tau",
-             booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
-             month = nov,
-             year = "2021",
-             address = "Online and Punta Cana, Dominican Republic",
-             publisher = "Association for Computational Linguistics",
-             url = "https://aclanthology.org/2021.emnlp-main.357",
-             doi = "10.18653/v1/2021.emnlp-main.357",
-             pages = "4348--4366",
-             abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
-         }""",
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
-
-     @property
-     def metadata_dict(self) -> dict[str, str]:
-         metadata_dict = super().metadata_dict
-         metadata_dict["min_score"] = 0
-         metadata_dict["max_score"] = 1
-         return metadata_dict
-
-
- class BQ(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="BQ",
-         dataset={
-             "path": "C-MTEB/BQ",
-             "revision": "e3dda5e115e487b39ec7e618c0c6a29137052a55",
-         },
-         description="A Chinese dataset for textual relatedness",
-         reference="https://aclanthology.org/2021.emnlp-main.357",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["validation", "test"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-             title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
-             author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
-             year={2024},
-             eprint={2309.07597},
-             archivePrefix={arXiv},
-             primaryClass={cs.CL},
-             url={https://arxiv.org/abs/2309.07597},
-         }""",
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
-
-     @property
-     def metadata_dict(self) -> dict[str, str]:
-         metadata_dict = super().metadata_dict
-         metadata_dict["min_score"] = 0
-         metadata_dict["max_score"] = 1
-         return metadata_dict
-
-
- class LCQMC(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="LCQMC",
-         dataset={
-             "path": "C-MTEB/LCQMC",
-             "revision": "17f9b096f80380fce5ed12a9be8be7784b337daf",
-         },
-         description="A Chinese dataset for textual relatedness",
-         reference="https://aclanthology.org/2021.emnlp-main.357",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["test"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-             title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
-             author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
-             year={2024},
-             eprint={2309.07597},
-             archivePrefix={arXiv},
-             primaryClass={cs.CL},
-             url={https://arxiv.org/abs/2309.07597},
-         }""",
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
-
-     @property
-     def metadata_dict(self) -> dict[str, str]:
-         metadata_dict = super().metadata_dict
-         metadata_dict["min_score"] = 0
-         metadata_dict["max_score"] = 1
-         return metadata_dict
-
-
- class PAWSX(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="PAWSX",
-         dataset={
-             "path": "C-MTEB/PAWSX",
-             "revision": "9c6a90e430ac22b5779fb019a23e820b11a8b5e1",
-         },
-         description="A Chinese dataset for textual relatedness",
-         reference="https://aclanthology.org/2021.emnlp-main.357",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["test"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-             title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
-             author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
-             year={2024},
-             eprint={2309.07597},
-             archivePrefix={arXiv},
-             primaryClass={cs.CL},
-             url={https://arxiv.org/abs/2309.07597},
-         }""",
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
-
-     @property
-     def metadata_dict(self) -> dict[str, str]:
-         metadata_dict = super().metadata_dict
-         metadata_dict["min_score"] = 0
-         metadata_dict["max_score"] = 1
-         return metadata_dict
-
-
- class STSB(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="STSB",
-         dataset={
-             "path": "C-MTEB/STSB",
-             "revision": "0cde68302b3541bb8b3c340dc0644b0b745b3dc0",
-         },
-         description="A Chinese dataset for textual relatedness",
-         reference="https://aclanthology.org/2021.emnlp-main.357",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["validation", "test"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
-             title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
-             author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
-             year={2024},
-             eprint={2309.07597},
-             archivePrefix={arXiv},
-             primaryClass={cs.CL},
-             url={https://arxiv.org/abs/2309.07597},
-         }""",
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
-
-     @property
-     def metadata_dict(self) -> dict[str, str]:
-         metadata_dict = super().metadata_dict
-         metadata_dict["min_score"] = 0
-         metadata_dict["max_score"] = 5
-         return metadata_dict
-
-
- class AFQMC(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="AFQMC",
-         dataset={
-             "path": "C-MTEB/AFQMC",
-             "revision": "b44c3b011063adb25877c13823db83bb193913c4",
-         },
-         description="A Chinese dataset for textual relatedness",
-         reference="https://aclanthology.org/2021.emnlp-main.357",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["validation"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation="""@inproceedings{raghu-etal-2021-end,
-             title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
-             author = "Raghu, Dinesh and
-                 Agarwal, Shantanu and
-                 Joshi, Sachindra and
-                 {Mausam}",
-             editor = "Moens, Marie-Francine and
-                 Huang, Xuanjing and
-                 Specia, Lucia and
-                 Yih, Scott Wen-tau",
-             booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
-             month = nov,
-             year = "2021",
-             address = "Online and Punta Cana, Dominican Republic",
-             publisher = "Association for Computational Linguistics",
-             url = "https://aclanthology.org/2021.emnlp-main.357",
-             doi = "10.18653/v1/2021.emnlp-main.357",
-             pages = "4348--4366",
-             abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
-         }""",
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
-
-     @property
-     def metadata_dict(self) -> dict[str, str]:
-         metadata_dict = super().metadata_dict
-         metadata_dict["min_score"] = 0
-         metadata_dict["max_score"] = 1
-         return metadata_dict
-
-
- class QBQTC(AbsTaskSTS):
-     metadata = TaskMetadata(
-         name="QBQTC",
-         dataset={
-             "path": "C-MTEB/QBQTC",
-             "revision": "790b0510dc52b1553e8c49f3d2afb48c0e5c48b7",
-         },
-         description="",
-         reference="https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset",
-         type="STS",
-         category="s2s",
-         modalities=["text"],
-         eval_splits=["test"],
-         eval_langs=["cmn-Hans"],
-         main_score="cosine_spearman",
-         date=None,
-         domains=None,
-         task_subtypes=None,
-         license=None,
-         annotations_creators=None,
-         dialect=None,
-         sample_creation=None,
-         bibtex_citation=None,
-         descriptive_stats={"n_samples": None, "avg_character_length": None},
-     )
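
For context, each class in the deleted STS.py wraps a C-MTEB dataset as an mteb AbsTaskSTS task, so it can be run directly with mteb's standard runner. A minimal sketch, assuming the mteb and sentence-transformers packages; the embedding model name is illustrative, not something this diff prescribes:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Any Chinese-capable embedding model works here; this name is an example.
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
evaluation = MTEB(tasks=[ATEC()])  # ATEC as defined in the deleted file
evaluation.run(model, output_folder="outputs/cmteb")  # writes per-task result JSON
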
@@ -1,64 +0,0 @@
- from .Classification import *
- from .Clustering import *
- from .PairClassification import *
- from .Reranking import *
- from .Retrieval import *
- from .STS import *
-
- CLS_CLASSIFICATION = {
-     "TNews": TNews,
-     "IFlyTek": IFlyTek,
-     "MultilingualSentiment": MultilingualSentiment,
-     "JDReview": JDReview,
-     "OnlineShopping": OnlineShopping,
-     "Waimai": Waimai,
- }
-
- CLS_CLUSTERING = {
-     "CLSClusteringS2S": CLSClusteringFastS2S,
-     "CLSClusteringP2P": CLSClusteringFastP2P,
-     "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
-     "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
- }
-
- CLS_PAIR_CLASSIFICATION = {
-     "Ocnli": Ocnli,
-     "Cmnli": Cmnli,
- }
-
- CLS_RERANKING = {
-     "T2Reranking": T2Reranking,
-     "MMarcoReranking": MMarcoReranking,
-     "CMedQAv1": CMedQAv1,
-     "CMedQAv2": CMedQAv2,
- }
-
- CLS_RETRIEVAL = {
-     "T2Retrieval": T2Retrieval,
-     "MMarcoRetrieval": MMarcoRetrieval,
-     "DuRetrieval": DuRetrieval,
-     "CovidRetrieval": CovidRetrieval,
-     "CmedqaRetrieval": CmedqaRetrieval,
-     "EcomRetrieval": EcomRetrieval,
-     "MedicalRetrieval": MedicalRetrieval,
-     "VideoRetrieval": VideoRetrieval,
- }
-
- CLS_STS = {
-     "ATEC": ATEC,
-     "BQ": BQ,
-     "LCQMC": LCQMC,
-     "PAWSX": PAWSX,
-     "STSB": STSB,
-     "AFQMC": AFQMC,
-     "QBQTC": QBQTC,
- }
-
- CLS_DICT = {
-     **CLS_CLASSIFICATION,
-     **CLS_CLUSTERING,
-     **CLS_PAIR_CLASSIFICATION,
-     **CLS_RERANKING,
-     **CLS_RETRIEVAL,
-     **CLS_STS,
- }
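
For context, this deleted __init__.py gathered every C-MTEB task class into CLS_DICT, a single name-to-class registry keyed by task name. A minimal sketch of the lookup pattern it enabled; the caller code is hypothetical, only CLS_DICT itself comes from the deleted file:

# Hypothetical caller: resolve and instantiate a task by name.
task_cls = CLS_DICT["ATEC"]  # raises KeyError for unregistered names
task = task_cls()            # an mteb task instance
print(task.metadata.name, task.metadata.main_score)
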
@@ -1,2 +0,0 @@
- from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments, EvaluationArguments
- from evalscope.backend.rag_eval.ragas.task_template import testset_generation, rag_eval
@@ -1,37 +0,0 @@
- from dataclasses import dataclass, field
- from typing import List, Optional, Union, Dict, Any
-
-
- @dataclass
- class TestsetGenerationArguments:
-     docs: List[str] = field(default_factory=list)
-     test_size: int = 10
-     output_file: str = "outputs/testset.json"
-     """
-     For local LLM support, you can use the following fields:
-         model_name_or_path: str
-         model_revision: str = "master"
-         template_type: str = "default"
-         generation_config: Optional[Dict]
-
-     For API LLM support, you can use the following fields:
-         model_name="gpt-4o-mini"
-         api_base: str = "",
-         api_key: Optional[str] = None
-     """
-     generator_llm: Dict = field(default_factory=dict)
-     critic_llm: Dict = field(default_factory=dict)
-     embeddings: Dict = field(default_factory=dict)
-     distribution: Dict = field(
-         default_factory=lambda: {"simple": 0.5, "multi_context": 0.4, "reasoning": 0.1}
-     )
-
-
- @dataclass
- class EvaluationArguments:
-     testset_file: str
-     critic_llm: Dict = field(default_factory=dict)
-     embeddings: Dict = field(default_factory=dict)
-     metrics: List[str] = field(
-         default_factory=lambda: ["answer_relevancy", "faithfulness"]
-     )
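
For context, these dataclasses were the whole configuration surface of the removed ragas backend; per the inline docstring, generator_llm, critic_llm, and embeddings each take a dict describing either a local model (model_name_or_path, ...) or an API model (model_name, api_base, api_key). A construction sketch with illustrative values only:

# Field values are illustrative; key names follow the docstring above.
args = TestsetGenerationArguments(
    docs=["data/corpus.txt"],  # hypothetical input document
    test_size=10,
    output_file="outputs/testset.json",
    generator_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
    critic_llm={"model_name": "gpt-4o-mini", "api_base": "", "api_key": None},
    embeddings={"model_name_or_path": "BAAI/bge-small-zh-v1.5"},
)
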
@@ -1,117 +0,0 @@
- import os
- from evalscope.backend.rag_eval import EmbeddingModel, LLM
- from evalscope.utils.logger import get_logger
- from .arguments import TestsetGenerationArguments, EvaluationArguments
-
- logger = get_logger()
-
-
- def testset_generation(args: TestsetGenerationArguments) -> None:
-     from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
-     from ragas.testset.generator import TestsetGenerator
-     from ragas.testset.evolutions import simple, reasoning, multi_context
-     from ragas import RunConfig
-
-     # load data
-     file_path = args.docs
-     loader = UnstructuredFileLoader(file_path, mode="elements")
-     data = loader.load()
-
-     # generator with models
-     generator_llm = LLM.load(**args.generator_llm)
-     critic_llm = LLM.load(**args.critic_llm)
-     embeddings = EmbeddingModel.load(**args.embeddings)
-
-     # Change resulting question type distribution
-     distributions = {
-         simple: args.distribution["simple"],
-         multi_context: args.distribution["multi_context"],
-         reasoning: args.distribution["reasoning"],
-     }
-
-     generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)
-
-     runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
-     testset = generator.generate_with_langchain_docs(
-         data,
-         args.test_size,
-         distributions,
-         with_debugging_logs=True,
-         is_async=False,
-         run_config=runconfig,
-     )
-
-     # save file
-     testset_df = testset.to_pandas()
-     output_path = os.path.dirname(args.output_file)
-     os.makedirs(output_path, exist_ok=True)
-     testset_df.to_json(args.output_file, indent=4, index=False, orient="records")
-
-     # get answer
-     testset_with_answer = get_answer(testset_df, generator_llm)
-     testset_with_answer.to_json(
-         args.output_file, indent=4, index=False, orient="records"
-     )
-
-
- def get_answer(testset_df, generator_llm):
-     template = """You are an assistant for question-answering tasks.
-         Use the following pieces of retrieved context to answer the question.
-         If you don't know the answer, just say that you don't know.
-         Use two sentences maximum and keep the answer concise.
-         Question: {question}
-         Context: {contexts}
-         Answer:
-     """
-     answers = []
-     for index, row in testset_df.iterrows():
-         question = row["question"]
-         contexts = "\n".join(row["contexts"])
-
-         # Combine question and contexts as input for the LLM
-         input_text = template.format(question=question, contexts=contexts)
-
-         # Generate the answer using the generator LLM
-         answer = generator_llm.invoke(input_text)
-         answers.append(answer)
-
-     testset_df["answer"] = answers
-     return testset_df
-
-
- def rag_eval(
-     args: EvaluationArguments,
- ) -> None:
-     from datasets import Dataset
-     from ragas import evaluate
-     from evalscope.backend.rag_eval import EmbeddingModel, LLM
-     from ragas import RunConfig
-     import importlib
-
-     def dynamic_import(module_name, *function_names):
-         # Dynamically import the specified module
-         module = importlib.import_module(module_name)
-
-         functions = [getattr(module, name) for name in function_names]
-         return functions
-
-     llm = LLM.load(**args.critic_llm)
-     embedding = EmbeddingModel.load(**args.embeddings)
-
-     dataset = Dataset.from_json(args.testset_file)
-
-     runconfig = RunConfig(timeout=30, max_retries=1, max_wait=30, max_workers=1)
-     score = evaluate(
-         dataset,
-         metrics=dynamic_import("ragas.metrics", *args.metrics),
-         llm=llm,
-         embeddings=embedding,
-         run_config=runconfig,
-     )
-     score_df = score.to_pandas()
-     # logger.info(score_df.to_string())
-
-     output_path = args.testset_file.split(".")[0] + "_score.json"
-     score_df.to_json(output_path, indent=4, index=False, orient="records")
-
-     logger.info(f"Eval score saved to {output_path}")
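
For context, the dynamic_import helper above is what maps the metric name strings in EvaluationArguments.metrics onto ragas metric objects. The same pattern as a standalone, runnable sketch:

import importlib

def dynamic_import(module_name, *names):
    # Resolve attributes by name from a module at runtime.
    module = importlib.import_module(module_name)
    return [getattr(module, name) for name in names]

# Equivalent to what rag_eval does with args.metrics:
answer_relevancy, faithfulness = dynamic_import(
    "ragas.metrics", "answer_relevancy", "faithfulness"
)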