evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic; details are available on the registry page.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import json
  import os
+ import requests
  import shutil
  import subprocess
  import time
  import unittest
 
- import json
- import requests
-
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
@@ -1,13 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import json
  import os
+ import requests
  import shutil
  import subprocess
  import time
  import unittest
 
- import json
- import requests
-
  from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
  from evalscope.run import run_task
  from evalscope.summarizer import Summarizer
tests/vlm/test_vlmeval.py CHANGED
@@ -40,8 +40,9 @@ class TestVLMEval(unittest.TestCase):
  }], # model name for VLMEval config
  'nproc': 1,
  'reuse': True,
- 'work_dir': 'outputs'
- }
+ },
+ 'work_dir': 'outputs',
+ 'use_cache': 'outputs/20241216_142838'
  }
 
  logger.info(f'>> Start to run task: {task_cfg}')
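
For readers tracking the config change above: in 0.8.1 the test moves 'work_dir' out of the VLMEvalKit-specific 'eval_config' block and adds a top-level 'use_cache' key pointing at a previous run's output directory. A minimal, hedged sketch of the reshaped dictionary (the backend selector, model and dataset entries are illustrative assumptions; only the keys visible in the hunk are taken from the diff):

# Hedged sketch of the 0.8.1-style task config exercised by this test.
task_cfg = {
    'eval_backend': 'VLMEvalKit',              # assumed backend selector
    'eval_config': {
        'model': [{'name': 'qwen_chat'}],      # hypothetical model entry
        'data': ['SEEDBench_IMG'],             # hypothetical dataset name
        'nproc': 1,
        'reuse': True,
    },
    'work_dir': 'outputs',                     # now a top-level key
    'use_cache': 'outputs/20241216_142838',    # reuse outputs of an earlier run
}
# The test then logs this config and hands it to run_task (from evalscope.run).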
evalscope/backend/rag_eval/ragas/metrics/__init__.py DELETED
@@ -1,2 +0,0 @@
- from .multi_modal_faithfulness import multimodal_faithness, MultiModalFaithfulness
- from .multi_modal_relevance import multimodal_relevance, MultiModalRelevance
evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py DELETED
@@ -1,91 +0,0 @@
- import typing as t
- import numpy as np
- from dataclasses import dataclass, field
- from ragas.dataset_schema import SingleTurnSample
- from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
- from pydantic import BaseModel, Field
- from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
-
-
- class FaithfulnessInput(BaseModel):
-     response: str = Field(description="response from AI")
-     retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
-
-     def to_string_list(self):
-         return [
-             "inputs:",
-             self.response,
-             "retrieved_contexts: ",
-         ] + self.retrieved_contexts
-
-
- class FaithfulnessOutput(BaseModel):
-     faithful: bool = Field(description="boolean indicating if request was faithful")
-
-
- class MultiModalFaithfulnessPrompt(
-     ImageTextPrompt[FaithfulnessInput, FaithfulnessOutput]
- ):
-     # refer: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/faithfulness.py
-     instruction = "Please tell if a given piece of information is supported by the visual as well as textual context information. You need to answer with either True or False. Answer True if any of the image(s) and textual context supports the information"
-     input_model = FaithfulnessInput
-     output_model = FaithfulnessOutput
-     examples = [
-         (
-             FaithfulnessInput(
-                 response="Apple pie is generally double-crusted.",
-                 retrieved_contexts=[
-                     "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
-                     "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
-                     "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
-                 ],
-             ),
-             FaithfulnessOutput(faithful=True),
-         ),
-         (
-             FaithfulnessInput(
-                 response="Apple pies tastes bad.",
-                 retrieved_contexts=[
-                     "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
-                     "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
-                     "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
-                 ],
-             ),
-             FaithfulnessOutput(faithful=False),
-         ),
-     ]
-
-
- @dataclass
- class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
-     name: str = "faithful_rate" # type: ignore
-     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
-         default_factory=lambda: {
-             MetricType.SINGLE_TURN: {
-                 "response",
-                 "retrieved_contexts",
-             }
-         }
-     )
-     faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()
-
-     async def _ascore(self, row: t.Dict, callbacks) -> float:
-         prompt_input = FaithfulnessInput(
-             response=row["response"], retrieved_contexts=row["retrieved_contexts"]
-         )
-         assert self.llm is not None, "LLM is not set"
-         prompt_response = await self.faithfulness_prompt.generate(
-             data=prompt_input, llm=self.llm, callbacks=callbacks
-         )
-         if prompt_response is None:
-             return np.nan
-         return float(prompt_response.faithful)
-
-     async def _single_turn_ascore(
-         self, sample: SingleTurnSample, callbacks
-     ) -> float:
-         row = sample.to_dict()
-         return await self._ascore(row, callbacks)
-
-
- multimodal_faithness = MultiModalFaithfulness()
evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py DELETED
@@ -1,99 +0,0 @@
- import typing as t
- from dataclasses import dataclass, field
- import numpy as np
- from ragas.dataset_schema import SingleTurnSample
- from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
- from pydantic import BaseModel, Field
- from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
-
-
- class RelevanceInput(BaseModel):
-     user_input: str = Field(description="user input")
-     response: str = Field(description="response from AI")
-     retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
-
-     def to_string_list(self):
-         return [
-             f"Question: {self.user_input}",
-             f"Response: {self.response}",
-             "retrieved_contexts: ",
-         ] + self.retrieved_contexts
-
-
- class RelevanceOutput(BaseModel):
-     relevance: bool = Field(description="boolean indicating if request was relevance")
-
-
- class MultiModalRelevancePrompt(ImageTextPrompt[RelevanceInput, RelevanceOutput]):
-     # refer https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/relevancy.py
-     instruction = """
-     Your task is to evaluate if the response for the query is in line with the images and textual context information provided.
-     You have two options to answer. Either True / False.
-     Answer - True, if the response for the query is in line with context information otherwise False.
-     """
-     input_model = RelevanceInput
-     output_model = RelevanceOutput
-     examples = [
-         (
-             RelevanceInput(
-                 user_input="What is the primary ingredient in a traditional Margherita pizza?",
-                 response="The primary ingredients in a Margherita pizza are tomatoes, mozzarella cheese, and fresh basil.",
-                 retrieved_contexts=[
-                     "A traditional Margherita pizza consists of a thin crust.",
-                     "The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil.",
-                     "It is one of the simplest and most classic types of pizza.",
-                 ],
-             ),
-             RelevanceOutput(relevance=True),
-         ),
-         (
-             RelevanceInput(
-                 user_input="Who won the Best Actor award at the Oscars in 2021?",
-                 response="The Best Actor award in 2021 was won by Leonardo DiCaprio.",
-                 retrieved_contexts=[
-                     "The 93rd Academy Awards were held in 2021.",
-                     "Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
-                     "The event was unique due to COVID-19 restrictions.",
-                 ],
-             ),
-             RelevanceOutput(relevance=False),
-         ),
-     ]
-
-
- @dataclass
- class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
-     name: str = "relevance_rate" # type: ignore
-     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
-         default_factory=lambda: {
-             MetricType.SINGLE_TURN: {
-                 "user_input",
-                 "response",
-                 "retrieved_contexts",
-             }
-         }
-     )
-     relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()
-
-     async def _ascore(self, row: t.Dict, callbacks) -> float:
-         prompt_input = RelevanceInput(
-             user_input=row["user_input"],
-             response=row["response"],
-             retrieved_contexts=row["retrieved_contexts"],
-         )
-         assert self.llm is not None, "LLM is not set"
-         prompt_response = await self.relevance_prompt.generate(
-             data=prompt_input, llm=self.llm, callbacks=callbacks
-         )
-         if prompt_response is None:
-             return np.nan
-         return float(prompt_response.relevance)
-
-     async def _single_turn_ascore(
-         self, sample: SingleTurnSample, callbacks
-     ) -> float:
-         row = sample.to_dict()
-         return await self._ascore(row, callbacks)
-
-
- multimodal_relevance = MultiModalRelevance()
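
Both removed modules follow the same ragas SingleTurnMetric pattern. Purely for orientation, a hedged sketch of how the module-level instances being deleted were scored; the sample values and the LLM wrapper are placeholders, while the private scoring path is the one visible in the deleted code above:

# Illustrative sketch only; this module is removed in 0.8.1.
import asyncio
from ragas.dataset_schema import SingleTurnSample
# from evalscope.backend.rag_eval.ragas.metrics import multimodal_relevance  # gone in 0.8.1

sample = SingleTurnSample(
    user_input='What does the chart show?',          # placeholder question
    response='Revenue grew in the third quarter.',   # placeholder answer
    retrieved_contexts=['quarterly_report.png', 'Q3 revenue rose year over year.'],
)
# multimodal_relevance.llm = my_ragas_llm            # required: _ascore asserts self.llm is set
# score = asyncio.run(multimodal_relevance._single_turn_ascore(sample, callbacks=None))
# score is 1.0 or 0.0 from the parsed boolean, or nan when the prompt call returns None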
evalscope/cache.py DELETED
@@ -1,98 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import os
- from typing import Union
-
- import cachetools
- from cachetools import Cache as CachetoolsCache
- from pympler import asizeof
- from datetime import datetime, timedelta
- import pickle
-
- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- DEFAULT_CACHE_MAXSIZE = 1 * 1024 * 1024 * 1024 # 1 GB
- DEFAULT_CACHE_EXPIRE = 60 * 60 * 24 # 1 day (seconds)
- DEFAULT_MEM_CACHE_PATH = os.environ.get('MEM_CACHE_PATH',
-                                         os.path.join(os.path.expanduser(DEFAULT_ROOT_CACHE_DIR),
-                                                      'mem_cache', 'global_cache.pkl'))
-
-
- class Cache:
-
-     # TODO: by xingjun.wxj@alibaba-inc.com
-     # 1. atomic operation for saving cache
-     # 2. consider the distributed env
-
-     @classmethod
-     def lru_cache(cls, maxsize: int = DEFAULT_CACHE_MAXSIZE):
-         return cachetools.LRUCache(maxsize=maxsize, getsizeof=asizeof.asizeof)
-
-     @classmethod
-     def ttl_cache(cls, max_size: float = DEFAULT_CACHE_MAXSIZE, expire: float = DEFAULT_CACHE_EXPIRE):
-         return cachetools.TTLCache(maxsize=max_size,
-                                    ttl=timedelta(seconds=expire),
-                                    timer=datetime.now,
-                                    getsizeof=asizeof.asizeof)
-
-     @classmethod
-     def load(cls, path: str) -> Union[CachetoolsCache, None]:
-         """
-         Load cache from disk. Pickle is used for serialization.
-
-         Args:
-             path: The local path to load the cache.
-
-         Returns:
-             The cache instance loaded from disk. Should be cachetools.Cache or None.
-         """
-         if os.path.exists(path):
-             logger.info(f'** Loading cache from {path} ...')
-             with open(path, 'rb') as f:
-                 return pickle.load(f)
-         else:
-             return None
-
-     @classmethod
-     def save(cls, cache: CachetoolsCache, path: str = DEFAULT_MEM_CACHE_PATH):
-         """
-         Dump memory cache to disk. Pickle is used for serialization.
-
-         Args:
-             cache: The cache instance to be saved.
-             path: The local path to save the cache.
-
-         Returns: None
-         """
-         os.makedirs(os.path.dirname(path), exist_ok=True)
-         with open(path, 'wb') as f:
-             pickle.dump(cache, f)
-         logger.info(f'** Cache saved to {path} !')
-
-
- def init_mem_cache(method: str = 'ttl', cache_file_path: str = DEFAULT_MEM_CACHE_PATH) -> CachetoolsCache:
-     """
-     Initialize memory cache.
-
-     Args:
-         method (str): 'ttl' or 'lru', see https://cachetools.readthedocs.io/en/latest/ for details.
-         cache_file_path (str): The local cache path. Should be a pickle file.
-
-     Returns:
-         The cache instance. Should be cachetools.Cache.
-     """
-     logger.info(f'** Initializing memory cache with method `{method}` ... \n')
-     mem_cache = Cache.load(path=cache_file_path)
-     if mem_cache is None:
-         if method == 'ttl':
-             mem_cache = Cache.ttl_cache()
-         elif method == 'lru':
-             mem_cache = Cache.lru_cache()
-         else:
-             raise ValueError(f'Unknown cache method {method}. Please choose from `ttl` or `lru`.')
-
-     return mem_cache
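
For context on what is dropped here: evalscope/cache.py exposed a small pickle-backed cachetools wrapper. A minimal usage sketch, reconstructed only from the deleted signatures above (this API no longer exists in 0.8.1):

# Sketch of the removed evalscope.cache API; these imports are only valid on 0.7.x.
from evalscope.cache import Cache, init_mem_cache, DEFAULT_MEM_CACHE_PATH

# Load the persisted cache if present, otherwise build a TTL cache (1 GB / 1 day defaults).
mem_cache = init_mem_cache(method='ttl', cache_file_path=DEFAULT_MEM_CACHE_PATH)

mem_cache['model:prompt-hash'] = {'answer': '42'}   # plain cachetools mapping interface
Cache.save(mem_cache, path=DEFAULT_MEM_CACHE_PATH)  # pickle the cache back to disk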