evalscope 0.5.5rc1__py3-none-any.whl → 0.6.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (48)
  1. evalscope/backend/__init__.py +0 -3
  2. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
  3. evalscope/backend/rag_eval/__init__.py +4 -0
  4. evalscope/backend/rag_eval/backend_manager.py +80 -0
  5. evalscope/backend/rag_eval/clip_benchmark/__init__.py +2 -0
  6. evalscope/backend/rag_eval/clip_benchmark/arguments.py +34 -0
  7. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +277 -0
  8. evalscope/backend/rag_eval/clip_benchmark/task_template.py +119 -0
  9. evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  10. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +83 -0
  11. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +247 -0
  12. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +170 -0
  13. evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
  14. evalscope/backend/rag_eval/cmteb/arguments.py +61 -0
  15. evalscope/backend/rag_eval/cmteb/base.py +91 -0
  16. evalscope/backend/rag_eval/cmteb/task_template.py +85 -0
  17. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
  18. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
  19. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +61 -0
  20. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
  21. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +150 -0
  22. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
  23. evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
  24. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +70 -0
  25. evalscope/backend/rag_eval/ragas/__init__.py +2 -0
  26. evalscope/backend/rag_eval/ragas/arguments.py +47 -0
  27. evalscope/backend/rag_eval/ragas/metrics/__init__.py +2 -0
  28. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +91 -0
  29. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +99 -0
  30. evalscope/backend/rag_eval/ragas/task_template.py +61 -0
  31. evalscope/backend/rag_eval/ragas/tasks/__init__.py +2 -0
  32. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +263 -0
  33. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +72 -0
  34. evalscope/backend/vlm_eval_kit/backend_manager.py +0 -1
  35. evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
  36. evalscope/evaluator/evaluator.py +1 -0
  37. evalscope/models/api/openai_api.py +2 -2
  38. evalscope/perf/http_client.py +1 -1
  39. evalscope/perf/openai_api.py +2 -0
  40. evalscope/run.py +4 -0
  41. evalscope/utils/logger.py +44 -14
  42. evalscope/utils/task_utils.py +3 -0
  43. evalscope/version.py +2 -2
  44. {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0rc0.dist-info}/METADATA +40 -44
  45. {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0rc0.dist-info}/RECORD +48 -17
  46. {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0rc0.dist-info}/WHEEL +0 -0
  47. {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0rc0.dist-info}/entry_points.txt +0 -0
  48. {evalscope-0.5.5rc1.dist-info → evalscope-0.6.0rc0.dist-info}/top_level.txt +0 -0
evalscope/backend/rag_eval/cmteb/tasks/STS.py
@@ -0,0 +1,302 @@
+ from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+ from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+ class ATEC(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="ATEC",
+         dataset={
+             "path": "C-MTEB/ATEC",
+             "revision": "0f319b1142f28d00e055a6770f3f726ae9b7d865",
+         },
+         description="A Chinese dataset for textual relatedness",
+         reference="https://aclanthology.org/2021.emnlp-main.357",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["validation", "test"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation="""@inproceedings{raghu-etal-2021-end,
+ title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
+ author = "Raghu, Dinesh and
+ Agarwal, Shantanu and
+ Joshi, Sachindra and
+ {Mausam}",
+ editor = "Moens, Marie-Francine and
+ Huang, Xuanjing and
+ Specia, Lucia and
+ Yih, Scott Wen-tau",
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
+ month = nov,
+ year = "2021",
+ address = "Online and Punta Cana, Dominican Republic",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2021.emnlp-main.357",
+ doi = "10.18653/v1/2021.emnlp-main.357",
+ pages = "4348--4366",
+ abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
+ }""",
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
+
+     @property
+     def metadata_dict(self) -> dict[str, str]:
+         metadata_dict = super().metadata_dict
+         metadata_dict["min_score"] = 0
+         metadata_dict["max_score"] = 1
+         return metadata_dict
+
+
+ class BQ(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="BQ",
+         dataset={
+             "path": "C-MTEB/BQ",
+             "revision": "e3dda5e115e487b39ec7e618c0c6a29137052a55",
+         },
+         description="A Chinese dataset for textual relatedness",
+         reference="https://aclanthology.org/2021.emnlp-main.357",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["validation", "test"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+ year={2024},
+ eprint={2309.07597},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2309.07597},
+ }""",
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
+
+     @property
+     def metadata_dict(self) -> dict[str, str]:
+         metadata_dict = super().metadata_dict
+         metadata_dict["min_score"] = 0
+         metadata_dict["max_score"] = 1
+         return metadata_dict
+
+
+ class LCQMC(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="LCQMC",
+         dataset={
+             "path": "C-MTEB/LCQMC",
+             "revision": "17f9b096f80380fce5ed12a9be8be7784b337daf",
+         },
+         description="A Chinese dataset for textual relatedness",
+         reference="https://aclanthology.org/2021.emnlp-main.357",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["test"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+ year={2024},
+ eprint={2309.07597},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2309.07597},
+ }""",
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
+
+     @property
+     def metadata_dict(self) -> dict[str, str]:
+         metadata_dict = super().metadata_dict
+         metadata_dict["min_score"] = 0
+         metadata_dict["max_score"] = 1
+         return metadata_dict
+
+
+ class PAWSX(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="PAWSX",
+         dataset={
+             "path": "C-MTEB/PAWSX",
+             "revision": "9c6a90e430ac22b5779fb019a23e820b11a8b5e1",
+         },
+         description="A Chinese dataset for textual relatedness",
+         reference="https://aclanthology.org/2021.emnlp-main.357",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["test"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+ year={2024},
+ eprint={2309.07597},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2309.07597},
+ }""",
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
+
+     @property
+     def metadata_dict(self) -> dict[str, str]:
+         metadata_dict = super().metadata_dict
+         metadata_dict["min_score"] = 0
+         metadata_dict["max_score"] = 1
+         return metadata_dict
+
+
+ class STSB(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="STSB",
+         dataset={
+             "path": "C-MTEB/STSB",
+             "revision": "0cde68302b3541bb8b3c340dc0644b0b745b3dc0",
+         },
+         description="A Chinese dataset for textual relatedness",
+         reference="https://aclanthology.org/2021.emnlp-main.357",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["validation", "test"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation="""@misc{xiao2024cpackpackagedresourcesadvance,
+ title={C-Pack: Packaged Resources To Advance General Chinese Embedding},
+ author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff and Defu Lian and Jian-Yun Nie},
+ year={2024},
+ eprint={2309.07597},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL},
+ url={https://arxiv.org/abs/2309.07597},
+ }""",
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
+
+     @property
+     def metadata_dict(self) -> dict[str, str]:
+         metadata_dict = super().metadata_dict
+         metadata_dict["min_score"] = 0
+         metadata_dict["max_score"] = 5
+         return metadata_dict
+
+
+ class AFQMC(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="AFQMC",
+         dataset={
+             "path": "C-MTEB/AFQMC",
+             "revision": "b44c3b011063adb25877c13823db83bb193913c4",
+         },
+         description="A Chinese dataset for textual relatedness",
+         reference="https://aclanthology.org/2021.emnlp-main.357",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["validation"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation="""@inproceedings{raghu-etal-2021-end,
+ title = "End-to-End Learning of Flowchart Grounded Task-Oriented Dialogs",
+ author = "Raghu, Dinesh and
+ Agarwal, Shantanu and
+ Joshi, Sachindra and
+ {Mausam}",
+ editor = "Moens, Marie-Francine and
+ Huang, Xuanjing and
+ Specia, Lucia and
+ Yih, Scott Wen-tau",
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
+ month = nov,
+ year = "2021",
+ address = "Online and Punta Cana, Dominican Republic",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2021.emnlp-main.357",
+ doi = "10.18653/v1/2021.emnlp-main.357",
+ pages = "4348--4366",
+ abstract = "We propose a novel problem within end-to-end learning of task oriented dialogs (TOD), in which the dialog system mimics a troubleshooting agent who helps a user by diagnosing their problem (e.g., car not starting). Such dialogs are grounded in domain-specific flowcharts, which the agent is supposed to follow during the conversation. Our task exposes novel technical challenges for neural TOD, such as grounding an utterance to the flowchart without explicit annotation, referring to additional manual pages when user asks a clarification question, and ability to follow unseen flowcharts at test time. We release a dataset (FLODIAL) consisting of 2,738 dialogs grounded on 12 different troubleshooting flowcharts. We also design a neural model, FLONET, which uses a retrieval-augmented generation architecture to train the dialog agent. Our experiments find that FLONET can do zero-shot transfer to unseen flowcharts, and sets a strong baseline for future research.",
+ }""",
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
+
+     @property
+     def metadata_dict(self) -> dict[str, str]:
+         metadata_dict = super().metadata_dict
+         metadata_dict["min_score"] = 0
+         metadata_dict["max_score"] = 1
+         return metadata_dict
+
+
+ class QBQTC(AbsTaskSTS):
+     metadata = TaskMetadata(
+         name="QBQTC",
+         dataset={
+             "path": "C-MTEB/QBQTC",
+             "revision": "790b0510dc52b1553e8c49f3d2afb48c0e5c48b7",
+         },
+         description="",
+         reference="https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset",
+         type="STS",
+         category="s2s",
+         modalities=["text"],
+         eval_splits=["test"],
+         eval_langs=["cmn-Hans"],
+         main_score="cosine_spearman",
+         date=None,
+         domains=None,
+         task_subtypes=None,
+         license=None,
+         annotations_creators=None,
+         dialect=None,
+         sample_creation=None,
+         bibtex_citation=None,
+         descriptive_stats={"n_samples": None, "avg_character_length": None},
+     )
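
Note: the classes above follow mteb's standard AbsTaskSTS pattern, so they can also be run directly through the mteb evaluator. A minimal sketch, assuming a sentence-transformers embedding model; the model name and output folder are illustrative and not taken from this release:

# Illustrative only: run the new ATEC task through mteb's evaluator.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-zh-v1.5")  # any embedding model exposing .encode()
evaluation = MTEB(tasks=[ATEC()])                      # task class added in this diff
evaluation.run(model, output_folder="outputs/cmteb")   # writes per-split scores to disk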
evalscope/backend/rag_eval/cmteb/tasks/__init__.py
@@ -0,0 +1,70 @@
+ from .Classification import *
+ from .Clustering import *
+ from .PairClassification import *
+ from .Reranking import *
+ from .Retrieval import *
+ from .STS import *
+ from .CustomTask import *
+
+
+ CLS_CLASSIFICATION = {
+     "TNews": TNews,
+     "IFlyTek": IFlyTek,
+     "MultilingualSentiment": MultilingualSentiment,
+     "JDReview": JDReview,
+     "OnlineShopping": OnlineShopping,
+     "Waimai": Waimai,
+ }
+
+ CLS_CLUSTERING = {
+     "CLSClusteringS2S": CLSClusteringFastS2S,
+     "CLSClusteringP2P": CLSClusteringFastP2P,
+     "ThuNewsClusteringS2S": ThuNewsClusteringFastS2S,
+     "ThuNewsClusteringP2P": ThuNewsClusteringFastP2P,
+ }
+
+ CLS_PAIR_CLASSIFICATION = {
+     "Ocnli": Ocnli,
+     "Cmnli": Cmnli,
+ }
+
+ CLS_RERANKING = {
+     "T2Reranking": T2Reranking,
+     "MMarcoReranking": MMarcoReranking,
+     "CMedQAv1": CMedQAv1,
+     "CMedQAv2": CMedQAv2,
+ }
+
+ CLS_RETRIEVAL = {
+     "T2Retrieval": T2Retrieval,
+     "MMarcoRetrieval": MMarcoRetrieval,
+     "DuRetrieval": DuRetrieval,
+     "CovidRetrieval": CovidRetrieval,
+     "CmedqaRetrieval": CmedqaRetrieval,
+     "EcomRetrieval": EcomRetrieval,
+     "MedicalRetrieval": MedicalRetrieval,
+     "VideoRetrieval": VideoRetrieval,
+ }
+
+ CLS_STS = {
+     "ATEC": ATEC,
+     "BQ": BQ,
+     "LCQMC": LCQMC,
+     "PAWSX": PAWSX,
+     "STSB": STSB,
+     "AFQMC": AFQMC,
+     "QBQTC": QBQTC,
+ }
+
+ CLS_CUSTOM = {
+     "CustomRetrieval": CustomRetrieval,
+ }
+
+ CLS_DICT = {
+     **CLS_CLASSIFICATION,
+     **CLS_CLUSTERING,
+     **CLS_PAIR_CLASSIFICATION,
+     **CLS_RERANKING,
+     **CLS_RETRIEVAL,
+     **CLS_STS,
+ }
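
The CLS_* maps act as a name-to-class registry for the CMTEB backend. A minimal sketch of how a caller might resolve task names (the selected names are illustrative); note that CLS_CUSTOM is kept separate and is not merged into CLS_DICT in this file:

# Illustrative only: resolve task names from the registry and instantiate them.
selected = ["ATEC", "T2Retrieval", "TNews"]
tasks = [CLS_DICT[name]() for name in selected]
# Custom tasks live in their own map:
custom_task = CLS_CUSTOM["CustomRetrieval"]()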
evalscope/backend/rag_eval/ragas/__init__.py
@@ -0,0 +1,2 @@
+ from evalscope.backend.rag_eval.ragas.arguments import TestsetGenerationArguments, EvaluationArguments
+ from evalscope.backend.rag_eval.ragas.task_template import rag_eval
evalscope/backend/rag_eval/ragas/arguments.py
@@ -0,0 +1,47 @@
+ from dataclasses import dataclass, field
+ from typing import List, Optional, Union, Dict, Any
+
+
+ @dataclass
+ class TestsetGenerationArguments:
+     docs: List[str] = field(default_factory=list)
+     test_size: int = 10
+     output_file: str = 'outputs/testset.json'
+     knowledge_graph: str = 'outputs/knowledge_graph.json'
+     """
+     For local LLM support, you can use the following fields:
+     model_name_or_path: str
+     model_revision: str = "master"
+     template_type: str = "default"
+     generation_config: Optional[Dict]
+
+     For API LLM support, you can use the following fields:
+     model_name="gpt-4o-mini"
+     api_base: str = "",
+     api_key: Optional[str] = None
+     """
+     generator_llm: Dict = field(default_factory=dict)
+     embeddings: Dict = field(default_factory=dict)
+     distribution: str = field(
+         default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1}
+     )
+     # For LLM based evaluation
+     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
+     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
+     # 'french', 'burmese', 'greek', 'italian', 'japanese', 'deutsch', 'kazakh', 'slovak']
+     language: str = 'english'
+
+
+ @dataclass
+ class EvaluationArguments:
+     testset_file: str
+     critic_llm: Dict = field(default_factory=dict)
+     embeddings: Dict = field(default_factory=dict)
+     metrics: List[str] = field(
+         default_factory=lambda: ['answer_relevancy', 'faithfulness']
+     )
+     # For LLM based evaluation
+     # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
+     # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
+     # 'french', 'burmese', 'greek', 'italian', 'japanese', 'deutsch', 'kazakh', 'slovak']
+     language: str = 'english'
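
A minimal sketch of constructing TestsetGenerationArguments; the file paths, model names, and the keys inside generator_llm/embeddings are illustrative and follow the field list in the docstring above:

# Illustrative only: configure testset generation against an OpenAI-compatible endpoint.
gen_args = TestsetGenerationArguments(
    docs=["data/knowledge/intro.md"],          # source documents (hypothetical path)
    test_size=5,
    generator_llm={
        "model_name": "gpt-4o-mini",           # API-served generator LLM
        "api_base": "http://localhost:8000/v1",
        "api_key": "EMPTY",
    },
    embeddings={"model_name_or_path": "AI-ModelScope/bge-large-zh"},  # local embedding model (illustrative)
    language="chinese",                        # prompts are translated to this language
)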
evalscope/backend/rag_eval/ragas/metrics/__init__.py
@@ -0,0 +1,2 @@
+ from .multi_modal_faithfulness import multimodal_faithness, MultiModalFaithfulness
+ from .multi_modal_relevance import multimodal_relevance, MultiModalRelevance
evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py
@@ -0,0 +1,91 @@
+ import typing as t
+ import numpy as np
+ from dataclasses import dataclass, field
+ from ragas.dataset_schema import SingleTurnSample
+ from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
+ from pydantic import BaseModel, Field
+ from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
+
+
+ class FaithfulnessInput(BaseModel):
+     response: str = Field(description="response from AI")
+     retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
+
+     def to_string_list(self):
+         return [
+             "inputs:",
+             self.response,
+             "retrieved_contexts: ",
+         ] + self.retrieved_contexts
+
+
+ class FaithfulnessOutput(BaseModel):
+     faithful: bool = Field(description="boolean indicating if request was faithful")
+
+
+ class MultiModalFaithfulnessPrompt(
+     ImageTextPrompt[FaithfulnessInput, FaithfulnessOutput]
+ ):
+     # refer: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/faithfulness.py
+     instruction = "Please tell if a given piece of information is supported by the visual as well as textual context information. You need to answer with either True or False. Answer True if any of the image(s) and textual context supports the information"
+     input_model = FaithfulnessInput
+     output_model = FaithfulnessOutput
+     examples = [
+         (
+             FaithfulnessInput(
+                 response="Apple pie is generally double-crusted.",
+                 retrieved_contexts=[
+                     "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
+                     "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
+                     "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
+                 ],
+             ),
+             FaithfulnessOutput(faithful=True),
+         ),
+         (
+             FaithfulnessInput(
+                 response="Apple pies tastes bad.",
+                 retrieved_contexts=[
+                     "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
+                     "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
+                     "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
+                 ],
+             ),
+             FaithfulnessOutput(faithful=False),
+         ),
+     ]
+
+
+ @dataclass
+ class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
+     name: str = "faithful_rate"  # type: ignore
+     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+         default_factory=lambda: {
+             MetricType.SINGLE_TURN: {
+                 "response",
+                 "retrieved_contexts",
+             }
+         }
+     )
+     faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()
+
+     async def _ascore(self, row: t.Dict, callbacks) -> float:
+         prompt_input = FaithfulnessInput(
+             response=row["response"], retrieved_contexts=row["retrieved_contexts"]
+         )
+         assert self.llm is not None, "LLM is not set"
+         prompt_response = await self.faithfulness_prompt.generate(
+             data=prompt_input, llm=self.llm, callbacks=callbacks
+         )
+         if prompt_response is None:
+             return np.nan
+         return float(prompt_response.faithful)
+
+     async def _single_turn_ascore(
+         self, sample: SingleTurnSample, callbacks
+     ) -> float:
+         row = sample.to_dict()
+         return await self._ascore(row, callbacks)
+
+
+ multimodal_faithness = MultiModalFaithfulness()
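
A minimal sketch of scoring one sample with the metric above outside of a full ragas run; the judge model is illustrative and is wrapped the same way the task template below wraps its critic LLM:

# Illustrative only: score a single sample with the multimodal faithfulness metric.
import asyncio
from langchain_openai import ChatOpenAI
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper

metric = MultiModalFaithfulness(llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")))
sample = SingleTurnSample(
    response="Apple pie is generally double-crusted.",
    retrieved_contexts=["It is generally double-crusted, with pastry above and below the filling."],
)
print(asyncio.run(metric.single_turn_ascore(sample)))  # 1.0 if judged faithful, else 0.0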
evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py
@@ -0,0 +1,99 @@
+ import typing as t
+ from dataclasses import dataclass, field
+ import numpy as np
+ from ragas.dataset_schema import SingleTurnSample
+ from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
+ from pydantic import BaseModel, Field
+ from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
+
+
+ class RelevanceInput(BaseModel):
+     user_input: str = Field(description="user input")
+     response: str = Field(description="response from AI")
+     retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
+
+     def to_string_list(self):
+         return [
+             f"Question: {self.user_input}",
+             f"Response: {self.response}",
+             "retrieved_contexts: ",
+         ] + self.retrieved_contexts
+
+
+ class RelevanceOutput(BaseModel):
+     relevance: bool = Field(description="boolean indicating if request was relevance")
+
+
+ class MultiModalRelevancePrompt(ImageTextPrompt[RelevanceInput, RelevanceOutput]):
+     # refer https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/relevancy.py
+     instruction = """
+ Your task is to evaluate if the response for the query is in line with the images and textual context information provided.
+ You have two options to answer. Either True / False.
+ Answer - True, if the response for the query is in line with context information otherwise False.
+ """
+     input_model = RelevanceInput
+     output_model = RelevanceOutput
+     examples = [
+         (
+             RelevanceInput(
+                 user_input="What is the primary ingredient in a traditional Margherita pizza?",
+                 response="The primary ingredients in a Margherita pizza are tomatoes, mozzarella cheese, and fresh basil.",
+                 retrieved_contexts=[
+                     "A traditional Margherita pizza consists of a thin crust.",
+                     "The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil.",
+                     "It is one of the simplest and most classic types of pizza.",
+                 ],
+             ),
+             RelevanceOutput(relevance=True),
+         ),
+         (
+             RelevanceInput(
+                 user_input="Who won the Best Actor award at the Oscars in 2021?",
+                 response="The Best Actor award in 2021 was won by Leonardo DiCaprio.",
+                 retrieved_contexts=[
+                     "The 93rd Academy Awards were held in 2021.",
+                     "Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
+                     "The event was unique due to COVID-19 restrictions.",
+                 ],
+             ),
+             RelevanceOutput(relevance=False),
+         ),
+     ]
+
+
+ @dataclass
+ class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
+     name: str = "relevance_rate"  # type: ignore
+     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+         default_factory=lambda: {
+             MetricType.SINGLE_TURN: {
+                 "user_input",
+                 "response",
+                 "retrieved_contexts",
+             }
+         }
+     )
+     relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()
+
+     async def _ascore(self, row: t.Dict, callbacks) -> float:
+         prompt_input = RelevanceInput(
+             user_input=row["user_input"],
+             response=row["response"],
+             retrieved_contexts=row["retrieved_contexts"],
+         )
+         assert self.llm is not None, "LLM is not set"
+         prompt_response = await self.relevance_prompt.generate(
+             data=prompt_input, llm=self.llm, callbacks=callbacks
+         )
+         if prompt_response is None:
+             return np.nan
+         return float(prompt_response.relevance)
+
+     async def _single_turn_ascore(
+         self, sample: SingleTurnSample, callbacks
+     ) -> float:
+         row = sample.to_dict()
+         return await self._ascore(row, callbacks)
+
+
+ multimodal_relevance = MultiModalRelevance()
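
Both multimodal metrics plug into ragas.evaluate the same way the task template below does; a dataset only needs the columns listed in each metric's _required_columns. A rough sketch with illustrative data and judge model:

# Illustrative only: evaluate both multimodal metrics over a tiny dataset.
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

data = Dataset.from_dict({
    "user_input": ["What are the main toppings on a Margherita pizza?"],
    "response": ["Tomatoes, mozzarella cheese and fresh basil."],
    "retrieved_contexts": [["The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil."]],
})
judge = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
result = evaluate(data, metrics=[multimodal_faithness, multimodal_relevance], llm=judge)
print(result.to_pandas())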
evalscope/backend/rag_eval/ragas/task_template.py
@@ -0,0 +1,61 @@
+ import os
+ import asyncio
+ from datasets import Dataset
+ from evalscope.backend.rag_eval import EmbeddingModel, LLM
+ from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts
+ from evalscope.utils.logger import get_logger
+ from .arguments import EvaluationArguments
+
+ logger = get_logger()
+
+
+ def rag_eval(
+     args: EvaluationArguments,
+ ) -> None:
+
+     from ragas import evaluate, RunConfig
+     from ragas.llms import LangchainLLMWrapper
+     import importlib
+
+     def dynamic_import(*function_names):
+         functions = []
+         for name in function_names:
+             module = importlib.import_module('ragas.metrics')
+             functions.append(getattr(module, name)())
+         return functions
+
+     llm = LLM.load(**args.critic_llm)
+     embedding = EmbeddingModel.load(**args.embeddings)
+
+     # load dataset
+     dataset = Dataset.from_json(args.testset_file)
+
+     # load metrics
+     metrics = dynamic_import(*args.metrics)
+     asyncio.run(
+         translate_prompts(
+             prompts=metrics,
+             target_lang=args.language,
+             llm=LangchainLLMWrapper(llm),
+             adapt_instruction=True,
+         )
+     )
+
+     # evaluate
+     runconfig = RunConfig(timeout=600, max_retries=2, max_wait=60, max_workers=1)
+     score = evaluate(
+         dataset,
+         metrics=metrics,
+         llm=llm,
+         embeddings=embedding,
+         run_config=runconfig,
+     )
+     score_df = score.to_pandas()
+     logger.info(score_df)
+
+     output_path = args.testset_file.replace('.json', '_score.json')
+     score_df.to_json(
+         output_path, indent=4, index=False, orient='records', force_ascii=False
+     )
+
+     logger.info(f'Eval score saved to {output_path}')
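
A minimal sketch of calling the template above with the EvaluationArguments from arguments.py; the endpoint, model names, and file path are illustrative:

# Illustrative only: run RAG evaluation over a previously generated testset.
from evalscope.backend.rag_eval.ragas import EvaluationArguments, rag_eval

rag_eval(EvaluationArguments(
    testset_file="outputs/testset.json",
    critic_llm={
        "model_name": "gpt-4o-mini",
        "api_base": "http://localhost:8000/v1",
        "api_key": "EMPTY",
    },
    embeddings={"model_name_or_path": "AI-ModelScope/bge-large-zh"},
    metrics=["faithfulness", "answer_relevancy"],
    language="english",
))
# Scores are written next to the testset, e.g. outputs/testset_score.json.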
evalscope/backend/rag_eval/ragas/tasks/__init__.py
@@ -0,0 +1,2 @@
+ from evalscope.backend.rag_eval.ragas.tasks.testset_generation import generate_testset
+ from evalscope.backend.rag_eval.ragas.tasks.translate_prompt import translate_prompts