evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of evalscope as potentially problematic.
- evalscope/api/benchmark/__init__.py +1 -1
- evalscope/api/benchmark/adapters/__init__.py +2 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +35 -0
- evalscope/api/benchmark/meta.py +6 -0
- evalscope/api/dataset/dataset.py +6 -6
- evalscope/api/dataset/loader.py +2 -1
- evalscope/api/evaluator/cache.py +24 -1
- evalscope/api/evaluator/state.py +12 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +47 -2
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +0 -1
- evalscope/api/model/generate_config.py +1 -3
- evalscope/api/model/model.py +4 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/single_model.py +3 -3
- evalscope/app/utils/data_utils.py +7 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/arguments.py +2 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
- evalscope/benchmarks/bfcl/generation.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
- evalscope/benchmarks/tau_bench/generation.py +1 -1
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +72 -13
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +6 -4
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +20 -0
- evalscope/models/openai_compatible.py +3 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +7 -4
- evalscope/perf/benchmark.py +2 -0
- evalscope/perf/utils/benchmark_util.py +8 -5
- evalscope/perf/utils/local_server.py +3 -0
- evalscope/report/__init__.py +0 -1
- evalscope/report/generator.py +8 -87
- evalscope/run.py +9 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +42 -1
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +23 -6
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
- tests/benchmark/test_eval.py +30 -31
- tests/benchmark/test_image_edit.py +65 -0
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +83 -43
- tests/cli/test_collection.py +8 -5
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -3
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/aigc/__init__.py +0 -1
- /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/test_t2i.py +0 -0

evalscope/backend/opencompass/backend_manager.py

@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
             datasets: list, the datasets.
             models: list, the models.
             work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-            dry_run (Optional): bool, the dry-run flag. Default to False.
             debug (Optional): bool, the debug flag. Default to False.
             reuse (Optional): str, reuse previous outputs & results. Default to None.
             generation_kwargs (Optional): dict, the generation config. Default to {}.

@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
             cmd_str = f'python -m run_oc ' \
                       f'--models {" ".join(self.args.models)} ' \
                       f'--datasets {" ".join(self.args.datasets)} ' \
-                      f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                       f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'

         elif cmd_mode == CmdMode.SCRIPT:

evalscope/backend/rag_eval/utils/embedding.py

@@ -164,6 +164,13 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
             automodel_args=self.model_kwargs,
         )
+        self.tokenizer = self.model.tokenizer
+        # set pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+            self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
         self.supported_encode_params = get_supported_params(self.model.predict)

     def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:

@@ -189,6 +196,7 @@ class APIEmbeddingModel(BaseModel):
         self.openai_api_base = kwargs.get('api_base')
         self.openai_api_key = kwargs.get('api_key')
         self.dimensions = kwargs.get('dimensions')
+        self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
         self.framework = ['API']

         self.model = OpenAIEmbeddings(

@@ -196,7 +204,7 @@ class APIEmbeddingModel(BaseModel):
             openai_api_base=self.openai_api_base,
             openai_api_key=self.openai_api_key,
             dimensions=self.dimensions,
-            check_embedding_ctx_length=
+            check_embedding_ctx_length=self.check_embedding_ctx_length,
         )

         super().__init__(model_name_or_path=self.model_name, **kwargs)
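
A note on the CrossEncoderModel hunk above: it applies the common pad-token fallback for checkpoints that ship without one, reusing the EOS token so batched padding works. A minimal sketch of the same pattern on a plain Hugging Face tokenizer and model, using an arbitrary example checkpoint (illustrative only, not evalscope's wrapper):

# Illustrative only: the pad-token fallback from the CrossEncoderModel hunk,
# applied to a plain transformers tokenizer/model pair.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = 'cross-encoder/ms-marco-MiniLM-L-6-v2'  # example checkpoint, not from the diff
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

# Some checkpoints define no pad token; fall back to EOS so padded batches encode cleanly.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if getattr(model.config, 'pad_token_id', None) is None:
    model.config.pad_token_id = tokenizer.eos_token_id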

evalscope/benchmarks/bfcl/bfcl_adapter.py

@@ -1,4 +1,3 @@
-import importlib
 import json
 import re
 import traceback

@@ -12,6 +11,7 @@ from evalscope.api.metric import Score
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
+from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -67,11 +67,7 @@ class BFCLAdapter(DefaultDataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-
-        if spec is None:
-            raise ImportError(
-                '`bfcl_eval` not found, please install it with `pip install bfcl-eval==2025.6.16` before evaluating.'
-            )
+        check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True)

         self.category_map = SUBJECT_MAPPING
         self.reformat_subset = True
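
The BFCLAdapter hunks above swap the inline importlib check for a shared check_import helper from evalscope.utils.import_utils (extended in this release, +23 -1). The helper's body is not shown in this diff; a minimal sketch of what such a guard typically looks like, assuming only the signature visible at the call site (the real evalscope helper may differ):

import importlib.util


def check_import(module: str, package: str = None, raise_error: bool = False) -> bool:
    """Sketch of an import guard; not the actual evalscope implementation."""
    if importlib.util.find_spec(module) is not None:
        return True
    hint = f'`{module}` not found, please install it with `pip install {package or module}`.'
    if raise_error:
        raise ImportError(hint)
    print(hint)  # the real helper presumably routes this through evalscope's logger
    return False


# Mirrors the call added in BFCLAdapter.__init__:
# check_import('bfcl_eval', package='bfcl-eval==2025.6.16', raise_error=True)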

evalscope/benchmarks/bfcl/generation.py

@@ -78,7 +78,7 @@ def generate_turn(model: Model, row: dict[str, Any]):
         if isinstance(message, str):
             result = message
         else:
-            result = message.
+            result = message.text

         logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')
         current_responses.append(result)

@@ -186,7 +186,7 @@ def generate_turn_with_tools(model: Model, row: dict[str, Any]):
             logger.error(f'Error converting tool calls to function call strings: {e}')
             tool_call_strs = None
         else:
-            model_responses = [message.
+            model_responses = [message.text]
             tool_call_strs = None

         current_responses.extend(model_responses)
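
Both generation.py hunks above read the assistant message through a text accessor rather than a raw attribute, which lines up with the additions to evalscope/api/messages/chat_message.py in this release. The property itself is not shown in this diff; a hedged sketch of what such an accessor over mixed string/content-part messages could look like (simplified stand-in types, not the actual evalscope classes):

from dataclasses import dataclass
from typing import List, Union


@dataclass
class ContentText:
    text: str


@dataclass
class ContentImage:
    image: str


@dataclass
class ChatMessage:
    content: Union[str, List[object]]

    @property
    def text(self) -> str:
        # Join only the text parts; non-text content (e.g. images) is skipped.
        if isinstance(self.content, str):
            return self.content
        return ''.join(part.text for part in self.content if isinstance(part, ContentText))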

evalscope/benchmarks/ceval/ceval_adapter.py

@@ -1,10 +1,9 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-from functools import partial
 from typing import Any, Dict

 from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter
-from evalscope.api.dataset import
+from evalscope.api.dataset import Sample
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
 from evalscope.utils.logger import get_logger

evalscope/benchmarks/data_collection/data_collection_adapter.py

@@ -6,9 +6,7 @@ from typing import Any, Dict, List
 from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter
 from evalscope.api.dataset import DatasetDict, LocalDataLoader, Sample
 from evalscope.api.evaluator import TaskState
-from evalscope.api.metric import Score
 from evalscope.api.metric.scorer import AggScore, SampleScore
-from evalscope.api.model.model import Model
 from evalscope.api.registry import get_benchmark, register_benchmark
 from evalscope.config import TaskConfig
 from evalscope.constants import DataCollection, Tags

@@ -23,7 +21,11 @@ logger = get_logger()
     BenchmarkMeta(
         name=DataCollection.NAME,
         dataset_id='', # dataset_id need to be set
-        description='Data collection'
+        description='Custom Data collection, mixing multiple evaluation datasets for '
+        'a unified evaluation, aiming to use less data to achieve a more comprehensive '
+        'assessment of the model\'s capabilities. '
+        '[Usage Reference](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/collection/index.html)',
+        tags=[Tags.CUSTOM],
         metric_list=['acc'],
         eval_split='test',
         prompt_template='',

@@ -55,9 +57,10 @@ class DataCollectionAdapter(DefaultDataAdapter):
             data_id_or_path=dataset_path,
             split=self.eval_split,
             sample_fields=self.record_to_sample,
-            subset=
+            subset='test', # NOTE: using hardcoded test subset
             limit=self.limit,
-            repeats=self.repeats
+            repeats=self.repeats,
+            shuffle=self.shuffle,
         ).load()

         test_dataset = DatasetDict({self.default_subset: dataset})

@@ -95,7 +98,6 @@ class DataCollectionAdapter(DefaultDataAdapter):

         # load dataset args
         dataset_args = copy.deepcopy(self._task_config.dataset_args)
-        common_args = dataset_args.get(DataCollection.NAME, {})

         # Iterate through each sample in the dataset
         dataset = self.test_dataset[self.default_subset]

@@ -108,7 +110,6 @@ class DataCollectionAdapter(DefaultDataAdapter):

             # update dataset args
             cur_dataset_args = dataset_args.get(dataset_name, {})
-            cur_dataset_args.update(common_args)

             # Initialize dataset adapter
             if dataset_name not in self.dataset_adapters:

@@ -141,19 +142,22 @@ class DataCollectionAdapter(DefaultDataAdapter):
         data = []
         for sample_score in sample_scores:
             collection_info = sample_score.sample_metadata[DataCollection.INFO]
-
-
-
-
-
-
-
-
-
-
-
-
+            main_score = sample_score.score.main_value
+            main_metric = sample_score.score.main_score_name
+
+            # use main score
+            data.append(
+                dict(
+                    task_type=collection_info['task_type'],
+                    categories=tuple(collection_info['categories']),
+                    dataset_name=collection_info['dataset_name'],
+                    subset_name=collection_info['subset_name'],
+                    tags=collection_info['tags'],
+                    sample_id=sample_score.sample_id,
+                    metric=main_metric,
+                    score=main_score
                 )
+            )

         df = pd.DataFrame(data)
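
The last DataCollectionAdapter hunk now emits one row per sample, keyed by the score's main metric and value, before handing the rows to pandas. Purely as an illustration of what that DataFrame supports downstream (hypothetical rows and dataset names; not the adapter's actual report code):

import pandas as pd

rows = [
    dict(task_type='math', categories=('reasoning',), dataset_name='gsm8k',
         subset_name='main', tags=['Math'], sample_id=1, metric='acc', score=1.0),
    dict(task_type='math', categories=('reasoning',), dataset_name='gsm8k',
         subset_name='main', tags=['Math'], sample_id=2, metric='acc', score=0.0),
]

df = pd.DataFrame(rows)
# Average the main score per dataset/subset/metric, the kind of roll-up a report needs.
summary = df.groupby(['dataset_name', 'subset_name', 'metric'])['score'].mean().reset_index()
print(summary)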

evalscope/benchmarks/general_arena/general_arena_adapter.py

@@ -75,7 +75,11 @@ class GeneralArenaAdapter(DefaultDataAdapter):
         dataset_dict = {}
         for subset_name, samples in datasets.items():
             dataset = DictDataLoader(
-                dict_list=samples,
+                dict_list=samples,
+                limit=self.limit,
+                shuffle=self.shuffle,
+                repeats=self.repeats,
+                sample_fields=self.record_to_sample
             ).load()
             dataset_dict[subset_name] = dataset

evalscope/benchmarks/ifeval/instructions_util.py

@@ -14,7 +14,6 @@
 """Utility library of instructions."""

 import functools
-import immutabledict
 import nltk
 import os
 import random

@@ -1551,7 +1550,7 @@ WORD_LIST = [
 ] # pylint: disable=line-too-long

 # ISO 639-1 codes to language names.
-LANGUAGE_CODES = immutabledict.immutabledict({
+LANGUAGE_CODES = {
     'en': 'English',
     'es': 'Spanish',
     'pt': 'Portuguese',

@@ -1582,7 +1581,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
     'pa': 'Punjabi',
     'ml': 'Malayalam',
     'fi': 'Finnish',
-})
+}

 _ALPHABETS = '([A-Za-z])'
 _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'

evalscope/benchmarks/image_edit/gedit/gedit_adapter.py (new file)

@@ -0,0 +1,138 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import copy
+import os
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, ImageEditAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessage, ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import FileConstants, Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'background_change', 'color_alter', 'material_alter', 'motion_change', 'ps_human', 'style_change', 'subject-add',
+    'subject-remove', 'subject-replace', 'text_change', 'tone_transfer'
+]
+
+LANGUAGE_LIST = ['en', 'cn']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='gedit',
+        pretty_name='GEdit-Bench',
+        dataset_id='stepfun-ai/GEdit-Bench',
+        description='GEdit-Bench Image Editing Benchmark, grounded in real-world '
+        'usages is developed to support more authentic and '
+        'comprehensive evaluation of image editing models.',
+        tags=[Tags.IMAGE_EDITING],
+        subset_list=SUBSET_LIST,
+        metric_list=['Semantic Consistency', 'Perceptual Similarity'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={'language': f'# language of the instruction, choose from {LANGUAGE_LIST}, default to `en`'}
+    )
+)
+class GEditAdapter(ImageEditAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.language = self.extra_params.get('language', 'en')
+        if self.language not in LANGUAGE_LIST:
+            logger.warning(f"Invalid language '{self.language}', fallback to 'en'")
+            self.language = 'en'
+        self.reformat_subset = True
+        self._use_llm_judge = True
+
+        self.load_prompt()
+
+    def load_prompt(self):
+        from . import vie_prompts
+
+        self.context = vie_prompts._context_no_delimit
+        self.SC_prompt = '\n'.join([
+            self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC
+        ])
+        self.PQ_prompt = '\n'.join([self.context, vie_prompts._prompts_0shot_rule_PQ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        record = copy.deepcopy(record)
+
+        # Process instruction and image
+        instruction = record['instruction']
+        image_bytes = record['input_image']['bytes']
+        input_image = bytes_to_base64(image_bytes, format='png', add_header=True)
+        record['input_image'] = input_image
+        record[FileConstants.ID] = record['key']
+        del record['input_image_raw']
+
+        text_content = ContentText(text=instruction)
+        image_content = ContentImage(image=input_image)
+
+        messages: List[ChatMessage] = [
+            ChatMessageUser(content=[text_content, image_content]),
+        ]
+
+        return Sample(input=messages, subset_key=record['task_type'], metadata=record)
+
+    def sample_filter(self, sample: Sample) -> bool:
+        language = sample.metadata.get('instruction_language', 'en')
+        return super().sample_filter(sample) and language == self.language
+
+    def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state: TaskState) -> Score:
+        import math
+
+        from .utils import mllm_output_to_dict
+
+        metadata = task_state.metadata
+        text_prompt = metadata['instruction']
+        input_image = metadata['input_image'] # base64 image
+        edited_image = metadata[FileConstants.IMAGE_PATH] # local image path
+        _SC_prompt = self.SC_prompt.replace('<instruction>', text_prompt)
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=edited_image,
+            prediction=edited_image,
+        )
+
+        # Build prompts
+        SC_prompt_final = [
+            ChatMessageUser(
+                content=[
+                    ContentImage(image=input_image),
+                    ContentImage(image=edited_image),
+                    ContentText(text=_SC_prompt)
+                ]
+            )
+        ]
+        PQ_prompt_final = [
+            ChatMessageUser(content=[ContentImage(image=edited_image),
+                                     ContentText(text=self.PQ_prompt)])
+        ]
+
+        guess_if_cannot_parse = True
+        result_SC = self.llm_judge.judge(messages=SC_prompt_final)
+        result_PQ = self.llm_judge.judge(messages=PQ_prompt_final)
+        SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse)
+        PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse)
+
+        SC_score = min(SC_dict['score'])
+        PQ_score = min(PQ_dict['score'])
+        O_score = math.sqrt(SC_score * PQ_score)
+
+        score.value = {'Semantic Consistency': SC_score, 'Perceptual Quality': PQ_score, 'Overall': O_score}
+        score.main_score_name = 'Overall'
+        score.metadata = {
+            'SC_dict': SC_dict,
+            'PQ_dict': PQ_dict,
+        }
+        return score