evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. The information is provided for informational purposes only and reflects the changes between the two package versions.
Potentially problematic release: this version of evalscope might be problematic.
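The file-by-file summary below lists each changed path with its added and removed line counts. As a rough, non-authoritative sketch, a comparable file-level listing can be reproduced locally with the Python standard library; the wheel filenames used here are assumptions (the wheels are expected to have been downloaded beforehand, e.g. with `pip download evalscope==0.7.1 --no-deps`).

```python
import zipfile
from pathlib import Path

# Assumption: both wheels were downloaded beforehand, e.g.
#   pip download evalscope==0.7.1 --no-deps -d .
#   pip download evalscope==0.8.0 --no-deps -d .
OLD_WHEEL = Path("evalscope-0.7.1-py3-none-any.whl")
NEW_WHEEL = Path("evalscope-0.8.0-py3-none-any.whl")

def wheel_files(path: Path) -> dict[str, bytes]:
    """Map each archive member name to its raw content (wheels are zip files)."""
    with zipfile.ZipFile(path) as zf:
        return {name: zf.read(name) for name in zf.namelist() if not name.endswith("/")}

old, new = wheel_files(OLD_WHEEL), wheel_files(NEW_WHEEL)

for name in sorted(old.keys() - new.keys()):
    print(f"DELETED  {name}")
for name in sorted(new.keys() - old.keys()):
    print(f"ADDED    {name}")
for name in sorted(old.keys() & new.keys()):
    if old[name] != new[name]:
        print(f"CHANGED  {name}")
```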
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
```diff
@@ -1,13 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
 import os
+import requests
 import shutil
 import subprocess
 import time
 import unittest
 
-import json
-import requests
-
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
```
```diff
@@ -1,13 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
 import os
+import requests
 import shutil
 import subprocess
 import time
 import unittest
 
-import json
-import requests
-
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
 from evalscope.run import run_task
 from evalscope.summarizer import Summarizer
```
evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py
DELETED

```diff
@@ -1,91 +0,0 @@
-import typing as t
-import numpy as np
-from dataclasses import dataclass, field
-from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
-from pydantic import BaseModel, Field
-from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
-
-
-class FaithfulnessInput(BaseModel):
-    response: str = Field(description="response from AI")
-    retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
-
-    def to_string_list(self):
-        return [
-            "inputs:",
-            self.response,
-            "retrieved_contexts: ",
-        ] + self.retrieved_contexts
-
-
-class FaithfulnessOutput(BaseModel):
-    faithful: bool = Field(description="boolean indicating if request was faithful")
-
-
-class MultiModalFaithfulnessPrompt(
-    ImageTextPrompt[FaithfulnessInput, FaithfulnessOutput]
-):
-    # refer: https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/faithfulness.py
-    instruction = "Please tell if a given piece of information is supported by the visual as well as textual context information. You need to answer with either True or False. Answer True if any of the image(s) and textual context supports the information"
-    input_model = FaithfulnessInput
-    output_model = FaithfulnessOutput
-    examples = [
-        (
-            FaithfulnessInput(
-                response="Apple pie is generally double-crusted.",
-                retrieved_contexts=[
-                    "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
-                    "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
-                    "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
-                ],
-            ),
-            FaithfulnessOutput(faithful=True),
-        ),
-        (
-            FaithfulnessInput(
-                response="Apple pies tastes bad.",
-                retrieved_contexts=[
-                    "An apple pie is a fruit pie in which the principal filling ingredient is apples.",
-                    "Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard or cheddar cheese.",
-                    "It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).",
-                ],
-            ),
-            FaithfulnessOutput(faithful=False),
-        ),
-    ]
-
-
-@dataclass
-class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
-    name: str = "faithful_rate"  # type: ignore
-    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
-        default_factory=lambda: {
-            MetricType.SINGLE_TURN: {
-                "response",
-                "retrieved_contexts",
-            }
-        }
-    )
-    faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()
-
-    async def _ascore(self, row: t.Dict, callbacks) -> float:
-        prompt_input = FaithfulnessInput(
-            response=row["response"], retrieved_contexts=row["retrieved_contexts"]
-        )
-        assert self.llm is not None, "LLM is not set"
-        prompt_response = await self.faithfulness_prompt.generate(
-            data=prompt_input, llm=self.llm, callbacks=callbacks
-        )
-        if prompt_response is None:
-            return np.nan
-        return float(prompt_response.faithful)
-
-    async def _single_turn_ascore(
-        self, sample: SingleTurnSample, callbacks
-    ) -> float:
-        row = sample.to_dict()
-        return await self._ascore(row, callbacks)
-
-
-multimodal_faithness = MultiModalFaithfulness()
```
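The deleted module above defined a standalone ragas metric. For reference, a minimal, non-authoritative sketch of how the 0.7.1 metric could be scored on a single sample follows; `my_ragas_llm` is a placeholder (an assumption) for whatever configured ragas LLM wrapper the caller supplies, and the import path is only valid in 0.7.1.

```python
import asyncio
from ragas.dataset_schema import SingleTurnSample
# Import path valid only for evalscope 0.7.1; the module is deleted in 0.8.0.
from evalscope.backend.rag_eval.ragas.metrics.multi_modal_faithfulness import MultiModalFaithfulness

metric = MultiModalFaithfulness()
metric.llm = my_ragas_llm  # assumption: a configured ragas LLM wrapper must be provided here

sample = SingleTurnSample(
    response="Apple pie is generally double-crusted.",
    retrieved_contexts=[
        "It is generally double-crusted, with pastry both above and below the filling.",
    ],
)
# _single_turn_ascore is the coroutine defined in the removed module.
score = asyncio.run(metric._single_turn_ascore(sample, callbacks=None))
print(score)  # 1.0 if judged faithful, 0.0 otherwise (NaN if the LLM returns nothing)
```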
evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py
DELETED

```diff
@@ -1,99 +0,0 @@
-import typing as t
-from dataclasses import dataclass, field
-import numpy as np
-from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricWithLLM, SingleTurnMetric, MetricType
-from pydantic import BaseModel, Field
-from evalscope.backend.rag_eval.ragas.prompts.multi_modal_prompt import ImageTextPrompt
-
-
-class RelevanceInput(BaseModel):
-    user_input: str = Field(description="user input")
-    response: str = Field(description="response from AI")
-    retrieved_contexts: list[str] = Field(description="contexts retrieved from the LLM")
-
-    def to_string_list(self):
-        return [
-            f"Question: {self.user_input}",
-            f"Response: {self.response}",
-            "retrieved_contexts: ",
-        ] + self.retrieved_contexts
-
-
-class RelevanceOutput(BaseModel):
-    relevance: bool = Field(description="boolean indicating if request was relevance")
-
-
-class MultiModalRelevancePrompt(ImageTextPrompt[RelevanceInput, RelevanceOutput]):
-    # refer https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/multi_modal/relevancy.py
-    instruction = """
-Your task is to evaluate if the response for the query is in line with the images and textual context information provided.
-You have two options to answer. Either True / False.
-Answer - True, if the response for the query is in line with context information otherwise False.
-"""
-    input_model = RelevanceInput
-    output_model = RelevanceOutput
-    examples = [
-        (
-            RelevanceInput(
-                user_input="What is the primary ingredient in a traditional Margherita pizza?",
-                response="The primary ingredients in a Margherita pizza are tomatoes, mozzarella cheese, and fresh basil.",
-                retrieved_contexts=[
-                    "A traditional Margherita pizza consists of a thin crust.",
-                    "The main toppings include tomatoes, mozzarella cheese, fresh basil, salt, and olive oil.",
-                    "It is one of the simplest and most classic types of pizza.",
-                ],
-            ),
-            RelevanceOutput(relevance=True),
-        ),
-        (
-            RelevanceInput(
-                user_input="Who won the Best Actor award at the Oscars in 2021?",
-                response="The Best Actor award in 2021 was won by Leonardo DiCaprio.",
-                retrieved_contexts=[
-                    "The 93rd Academy Awards were held in 2021.",
-                    "Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
-                    "The event was unique due to COVID-19 restrictions.",
-                ],
-            ),
-            RelevanceOutput(relevance=False),
-        ),
-    ]
-
-
-@dataclass
-class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
-    name: str = "relevance_rate"  # type: ignore
-    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
-        default_factory=lambda: {
-            MetricType.SINGLE_TURN: {
-                "user_input",
-                "response",
-                "retrieved_contexts",
-            }
-        }
-    )
-    relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()
-
-    async def _ascore(self, row: t.Dict, callbacks) -> float:
-        prompt_input = RelevanceInput(
-            user_input=row["user_input"],
-            response=row["response"],
-            retrieved_contexts=row["retrieved_contexts"],
-        )
-        assert self.llm is not None, "LLM is not set"
-        prompt_response = await self.relevance_prompt.generate(
-            data=prompt_input, llm=self.llm, callbacks=callbacks
-        )
-        if prompt_response is None:
-            return np.nan
-        return float(prompt_response.relevance)
-
-    async def _single_turn_ascore(
-        self, sample: SingleTurnSample, callbacks
-    ) -> float:
-        row = sample.to_dict()
-        return await self._ascore(row, callbacks)
-
-
-multimodal_relevance = MultiModalRelevance()
```
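The removed relevance metric follows the same pattern as the faithfulness metric above; a short, analogous sketch (again assuming a configured ragas LLM wrapper in place of the `my_ragas_llm` placeholder, and an import path that only exists in 0.7.1):

```python
import asyncio
from ragas.dataset_schema import SingleTurnSample
# Import path valid only for evalscope 0.7.1; the module is deleted in 0.8.0.
from evalscope.backend.rag_eval.ragas.metrics.multi_modal_relevance import MultiModalRelevance

metric = MultiModalRelevance()
metric.llm = my_ragas_llm  # placeholder: supply a configured ragas LLM wrapper

sample = SingleTurnSample(
    user_input="Who won the Best Actor award at the Oscars in 2021?",
    response="Anthony Hopkins won the Best Actor award for his role in 'The Father'.",
    retrieved_contexts=["Anthony Hopkins won the Best Actor award for his role in 'The Father'."],
)
score = asyncio.run(metric._single_turn_ascore(sample, callbacks=None))  # 1.0 / 0.0 / NaN
```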
evalscope/cache.py
DELETED

```diff
@@ -1,98 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import os
-from typing import Union
-
-import cachetools
-from cachetools import Cache as CachetoolsCache
-from pympler import asizeof
-from datetime import datetime, timedelta
-import pickle
-
-from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-DEFAULT_CACHE_MAXSIZE = 1 * 1024 * 1024 * 1024  # 1 GB
-DEFAULT_CACHE_EXPIRE = 60 * 60 * 24  # 1 day (seconds)
-DEFAULT_MEM_CACHE_PATH = os.environ.get('MEM_CACHE_PATH',
-                                        os.path.join(os.path.expanduser(DEFAULT_ROOT_CACHE_DIR),
-                                                     'mem_cache', 'global_cache.pkl'))
-
-
-class Cache:
-
-    # TODO: by xingjun.wxj@alibaba-inc.com
-    # 1. atomic operation for saving cache
-    # 2. consider the distributed env
-
-    @classmethod
-    def lru_cache(cls, maxsize: int = DEFAULT_CACHE_MAXSIZE):
-        return cachetools.LRUCache(maxsize=maxsize, getsizeof=asizeof.asizeof)
-
-    @classmethod
-    def ttl_cache(cls, max_size: float = DEFAULT_CACHE_MAXSIZE, expire: float = DEFAULT_CACHE_EXPIRE):
-        return cachetools.TTLCache(maxsize=max_size,
-                                   ttl=timedelta(seconds=expire),
-                                   timer=datetime.now,
-                                   getsizeof=asizeof.asizeof)
-
-    @classmethod
-    def load(cls, path: str) -> Union[CachetoolsCache, None]:
-        """
-        Load cache from disk. Pickle is used for serialization.
-
-        Args:
-            path: The local path to load the cache.
-
-        Returns:
-            The cache instance loaded from disk. Should be cachetools.Cache or None.
-        """
-        if os.path.exists(path):
-            logger.info(f'** Loading cache from {path} ...')
-            with open(path, 'rb') as f:
-                return pickle.load(f)
-        else:
-            return None
-
-    @classmethod
-    def save(cls, cache: CachetoolsCache, path: str = DEFAULT_MEM_CACHE_PATH):
-        """
-        Dump memory cache to disk. Pickle is used for serialization.
-
-        Args:
-            cache: The cache instance to be saved.
-            path: The local path to save the cache.
-
-        Returns: None
-        """
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        with open(path, 'wb') as f:
-            pickle.dump(cache, f)
-        logger.info(f'** Cache saved to {path} !')
-
-
-def init_mem_cache(method: str = 'ttl', cache_file_path: str = DEFAULT_MEM_CACHE_PATH) -> CachetoolsCache:
-    """
-    Initialize memory cache.
-
-    Args:
-        method (str): 'ttl' or 'lru', see https://cachetools.readthedocs.io/en/latest/ for details.
-        cache_file_path (str): The local cache path. Should be a pickle file.
-
-    Returns:
-        The cache instance. Should be cachetools.Cache.
-    """
-    logger.info(f'** Initializing memory cache with method `{method}` ... \n')
-    mem_cache = Cache.load(path=cache_file_path)
-    if mem_cache is None:
-        if method == 'ttl':
-            mem_cache = Cache.ttl_cache()
-        elif method == 'lru':
-            mem_cache = Cache.lru_cache()
-        else:
-            raise ValueError(f'Unknown cache method {method}. Please choose from `ttl` or `lru`.')
-
-    return mem_cache
```
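For context, the removed module exposed a small pickle-backed in-memory cache built on cachetools. A minimal sketch of the 0.7.1 usage follows; the cache path and the cache key are illustrative, not taken from the package.

```python
# Valid against evalscope 0.7.1 only; evalscope/cache.py is removed in 0.8.0.
from evalscope.cache import Cache, init_mem_cache

cache_path = '/tmp/evalscope_mem_cache/global_cache.pkl'  # illustrative path

# Load an existing pickle if present, otherwise create a TTL cache
# (defaults from the module: 1 GB max size, 1 day expiry).
mem_cache = init_mem_cache(method='ttl', cache_file_path=cache_path)

mem_cache['model_answer:example:0'] = {'text': 'cached result'}  # behaves like a cachetools.TTLCache
Cache.save(mem_cache, path=cache_path)   # pickle the cache back to disk
restored = Cache.load(path=cache_path)   # returns the cache instance, or None if the file is missing
```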