evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/metrics/metric.py +51 -0
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +52 -2
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/metrics/metric.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import json
|
|
1
2
|
from collections import defaultdict
|
|
2
3
|
from typing import List
|
|
3
4
|
|
|
@@ -100,6 +101,56 @@ class MultiChoiceAcc(Metric):
|
|
|
100
101
|
return res
|
|
101
102
|
|
|
102
103
|
|
|
104
|
+
@register_metric(name='anls')
class ANLS(Metric):
    """Average Normalized Levenshtein Similarity (ANLS) metric.

    For every (prediction, reference) pair the score is ``1 - d`` where ``d``
    is the smallest normalized edit distance between the prediction and any
    of the reference answers. Scores below ``thresh_hold`` are reported as 0.0.
    """

    def __init__(self, thresh_hold=0.5):
        # Per-sample similarities strictly below this value are zeroed out.
        self.thresh_hold = thresh_hold

    @staticmethod
    def _parse_reference(reference):
        """Return the candidate answers carried by ``reference`` as a list of strings.

        ``reference`` may be a JSON-encoded list of answers, a JSON-encoded
        string, a plain (non-JSON) answer string, or an already decoded
        list/tuple of answers.
        """
        if isinstance(reference, (list, tuple)):
            candidates = list(reference)
        else:
            try:
                decoded = json.loads(reference)
            except (json.JSONDecodeError, TypeError):
                # Not JSON (or not a string at all): treat it as one literal answer.
                decoded = reference

            if isinstance(decoded, list):
                candidates = decoded
            elif isinstance(decoded, str):
                candidates = [decoded]
            else:
                # The text happened to be valid JSON of another type (e.g. the
                # answer "2019" decodes to an int). It is still a single plain
                # answer, so keep the reference as written instead of failing.
                candidates = [reference]

        # Answers are compared as text; coerce stray non-string items.
        return [ans if isinstance(ans, str) else str(ans) for ans in candidates]

    def apply(self, predictions, references):
        """
        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
        This implementation is adapted from
        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py

        Args:
            references (List[str]): List of correct answers. Each answer can be a string of json.
            predictions (List[str]): List of predicted answers.

        Returns:
            List[float]: One score per (prediction, reference) pair, in input order.
                Pairs beyond the shorter of the two inputs are ignored (``zip``).
        """
        from .metrics import levenshtein_distance

        res = []
        for prediction, reference in zip(predictions, references):
            # Lower-case and collapse whitespace once per prediction.
            det_answer = ' '.join(prediction.strip().lower().split())

            # Normalized distance to each acceptable answer.
            values = []
            for ans in self._parse_reference(reference):
                gt_answer = ' '.join(ans.strip().lower().split())
                dist = levenshtein_distance(gt_answer, det_answer)
                # The normalizer uses the raw (un-normalized) string lengths;
                # kept unchanged so existing scores do not shift.
                # NOTE(review): confirm this matches the upstream script.
                length = max(len(ans.upper()), len(prediction.upper()))
                values.append(0.0 if length == 0 else float(dist) / float(length))

            # Best match wins; no candidate answers at all scores 0.0.
            question_result = 1 - min(values) if values else 0.0
            if question_result < self.thresh_hold:
                question_result = 0.0
            res.append(question_result)
        return res
|
|
152
|
+
|
|
153
|
+
|
|
103
154
|
# ##################
|
|
104
155
|
# T2I Metrics ######
|
|
105
156
|
####################
|
evalscope/metrics/metrics.py
CHANGED
|
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
|
|
|
467
467
|
num_samples_it = iter(num_samples)
|
|
468
468
|
|
|
469
469
|
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def levenshtein_distance(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.

    Args:
        s1: First sequence (typically a ``str``); must support ``len`` and iteration.
        s2: Second sequence of the same kind.

    Returns:
        int: The edit distance; 0 when both sequences are equal.
    """
    # Keep s1 as the shorter sequence so the working row is as small as possible.
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    # Turning an empty sequence into s2 takes exactly len(s2) insertions.
    if len(s1) == 0:
        return len(s2)

    # previous_row[i] = distance between s1[:i] and the prefix of s2 consumed so far.
    previous_row = list(range(len(s1) + 1))
    for i2, c2 in enumerate(s2):
        current_row = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                # Matching elements cost nothing: carry the diagonal value over.
                current_row.append(previous_row[i1])
            else:
                # One edit on top of the cheapest of substitute / delete / insert.
                current_row.append(1 + min(previous_row[i1], previous_row[i1 + 1], current_row[-1]))
        previous_row = current_row
    return previous_row[-1]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
|
|
|
30
30
|
SequenceClassifierOutput,
|
|
31
31
|
TokenClassifierOutput,
|
|
32
32
|
)
|
|
33
|
-
from transformers.modeling_utils import
|
|
34
|
-
PreTrainedModel,
|
|
35
|
-
apply_chunking_to_forward,
|
|
36
|
-
find_pruneable_heads_and_indices,
|
|
37
|
-
prune_linear_layer,
|
|
38
|
-
)
|
|
33
|
+
from transformers.modeling_utils import PreTrainedModel
|
|
39
34
|
from transformers.models.bert.configuration_bert import BertConfig
|
|
35
|
+
from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
|
|
40
36
|
from transformers.utils import logging
|
|
41
37
|
from typing import Any, Dict, Optional, Tuple
|
|
42
38
|
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py
CHANGED
|
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
|
|
|
14
14
|
BaseModelOutputWithPastAndCrossAttentions,
|
|
15
15
|
BaseModelOutputWithPoolingAndCrossAttentions,
|
|
16
16
|
)
|
|
17
|
-
from transformers.modeling_utils import
|
|
18
|
-
PreTrainedModel,
|
|
19
|
-
apply_chunking_to_forward,
|
|
20
|
-
find_pruneable_heads_and_indices,
|
|
21
|
-
prune_linear_layer,
|
|
22
|
-
)
|
|
17
|
+
from transformers.modeling_utils import PreTrainedModel
|
|
23
18
|
from transformers.models.bert.configuration_bert import BertConfig
|
|
19
|
+
from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
|
|
24
20
|
from transformers.utils import logging
|
|
25
21
|
from typing import Tuple
|
|
26
22
|
|
|
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
|
|
|
31
31
|
SequenceClassifierOutput,
|
|
32
32
|
TokenClassifierOutput,
|
|
33
33
|
)
|
|
34
|
-
from transformers.modeling_utils import
|
|
35
|
-
PreTrainedModel,
|
|
36
|
-
apply_chunking_to_forward,
|
|
37
|
-
find_pruneable_heads_and_indices,
|
|
38
|
-
prune_linear_layer,
|
|
39
|
-
)
|
|
34
|
+
from transformers.modeling_utils import PreTrainedModel
|
|
40
35
|
from transformers.models.bert.configuration_bert import BertConfig
|
|
36
|
+
from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
|
|
41
37
|
from transformers.utils import logging
|
|
42
38
|
from typing import Optional, Tuple
|
|
43
39
|
|
evalscope/report/__init__.py
CHANGED
|
@@ -4,7 +4,13 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
from evalscope.utils.import_utils import _LazyModule
|
|
5
5
|
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
|
-
from .combinator import
|
|
7
|
+
from .combinator import (
|
|
8
|
+
gen_table,
|
|
9
|
+
get_data_frame,
|
|
10
|
+
get_report_list,
|
|
11
|
+
unweighted_average_from_subsets,
|
|
12
|
+
weighted_average_from_subsets,
|
|
13
|
+
)
|
|
8
14
|
from .generator import ReportGenerator
|
|
9
15
|
from .report import Category, Report, ReportKey, Subset
|
|
10
16
|
|
|
@@ -14,6 +20,8 @@ else:
|
|
|
14
20
|
'gen_table',
|
|
15
21
|
'get_data_frame',
|
|
16
22
|
'get_report_list',
|
|
23
|
+
'weighted_average_from_subsets',
|
|
24
|
+
'unweighted_average_from_subsets',
|
|
17
25
|
],
|
|
18
26
|
'generator': [
|
|
19
27
|
'ReportGenerator',
|
evalscope/report/combinator.py
CHANGED
|
@@ -4,9 +4,9 @@ import glob
|
|
|
4
4
|
import os
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from tabulate import tabulate
|
|
7
|
-
from typing import List, Tuple
|
|
7
|
+
from typing import Dict, List, Tuple, Union
|
|
8
8
|
|
|
9
|
-
from evalscope.report.report import Report
|
|
9
|
+
from evalscope.report.report import Report, Subset
|
|
10
10
|
from evalscope.utils.logger import get_logger
|
|
11
11
|
|
|
12
12
|
logger = get_logger()
|
|
@@ -86,3 +86,53 @@ def gen_table(
|
|
|
86
86
|
add_overall_metric=add_overall_metric
|
|
87
87
|
)
|
|
88
88
|
return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def weighted_average_from_subsets(
    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
) -> Subset:
    """Combine several subsets into one, weighting each score by its ``num``.

    Names in ``subset_names`` that are missing from ``subset_dict`` are skipped.

    Args:
        subset_names (List[str]): Names of the subsets to combine.
        subset_dict (Dict[str, Subset]): Lookup table from subset name to Subset.
        new_name (str): Name given to the returned Subset.

    Returns:
        Subset: Carries the num-weighted mean score (0 when the combined
            ``num`` is not positive) and the summed ``num``.
    """
    selected = [subset_dict[key] for key in subset_names if key in subset_dict]

    score_sum = 0
    sample_sum = 0
    for item in selected:
        score_sum += item.score * item.num
        sample_sum += item.num

    if sample_sum > 0:
        average = score_sum / sample_sum
    else:
        average = 0
    return Subset(name=new_name, score=average, num=sample_sum)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def unweighted_average_from_subsets(
    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
) -> Subset:
    """Combine several subsets into one using a plain mean of their scores.

    Names in ``subset_names`` that are missing from ``subset_dict`` are skipped.

    Args:
        subset_names (List[str]): Names of the subsets to combine.
        subset_dict (Dict[str, Subset]): Lookup table from subset name to Subset.
        new_name (str): Name given to the returned Subset.

    Returns:
        Subset: Carries the arithmetic mean of the found scores (0 when none
            were found) and the summed ``num`` of those subsets.
    """
    selected = [subset_dict[key] for key in subset_names if key in subset_dict]
    scores = [item.score for item in selected]

    sample_sum = 0
    for item in selected:
        sample_sum += item.num

    if not scores:
        average = 0
    else:
        average = sum(scores) / len(scores)
    return Subset(name=new_name, score=average, num=sample_sum)
|
evalscope/utils/json_schema.py
CHANGED
|
@@ -59,18 +59,20 @@ class JSONSchema(BaseModel):
|
|
|
59
59
|
required: Optional[List[str]] = Field(default=None)
|
|
60
60
|
"""Required fields for object parameters."""
|
|
61
61
|
|
|
62
|
-
@field_validator('type')
|
|
63
|
-
def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
|
|
64
|
-
return python_type_to_json_type(v)
|
|
65
|
-
|
|
66
62
|
@model_validator(mode='before')
|
|
67
63
|
def convert_type_before_validation(cls, values):
|
|
68
64
|
values = deepcopy(values)
|
|
69
65
|
|
|
70
66
|
def recursive_convert_type(obj):
|
|
71
67
|
if isinstance(obj, dict):
|
|
72
|
-
|
|
73
|
-
|
|
68
|
+
# Convert 'type' field if it's a string
|
|
69
|
+
if 'type' in obj and isinstance(obj['type'], str):
|
|
70
|
+
try:
|
|
71
|
+
obj['type'] = python_type_to_json_type(obj['type'])
|
|
72
|
+
except ValueError:
|
|
73
|
+
# If conversion fails, leave it as is
|
|
74
|
+
pass
|
|
75
|
+
# Recursively process nested structures
|
|
74
76
|
for k, v in obj.items():
|
|
75
77
|
obj[k] = recursive_convert_type(v)
|
|
76
78
|
elif isinstance(obj, list):
|
evalscope/utils/multi_choices.py
CHANGED
|
@@ -81,12 +81,27 @@ def answer_options(choices: Choices) -> str:
|
|
|
81
81
|
return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
|
|
82
82
|
|
|
83
83
|
|
|
84
|
+
def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
    """
    Build the comma-separated list of answer letters for `choices`, e.g.:

    ["choice 1", "choice 2", "choice 3"] ->
        "A,B,C"

    A plain list is wrapped in `Choices` first; one letter is produced per choice.
    """
    wrapped = Choices(choices) if isinstance(choices, list) else choices
    letters = (f'{answer_character(position)}' for position in range(len(wrapped)))
    return ','.join(letters)
|
|
97
|
+
|
|
98
|
+
|
|
84
99
|
def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
|
|
85
100
|
if isinstance(choices, list):
|
|
86
101
|
choices = Choices(choices)
|
|
87
102
|
|
|
88
103
|
choices_text = answer_options(choices)
|
|
89
|
-
letters =
|
|
104
|
+
letters = format_letter_choices(choices)
|
|
90
105
|
if not fewshot:
|
|
91
106
|
return template.format(
|
|
92
107
|
choices=choices_text,
|
evalscope/version.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
|
-
Home-page: https://github.com/modelscope/evalscope
|
|
6
5
|
Author: ModelScope team
|
|
7
6
|
Author-email: contact@modelscope.cn
|
|
8
7
|
License: Apache License 2.0
|
|
8
|
+
Project-URL: Homepage, https://github.com/modelscope/evalscope
|
|
9
9
|
Keywords: python,llm,evaluation
|
|
10
10
|
Classifier: Development Status :: 4 - Beta
|
|
11
11
|
Classifier: Operating System :: OS Independent
|
|
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
17
18
|
Requires-Python: >=3.9
|
|
18
19
|
Description-Content-Type: text/markdown
|
|
19
20
|
License-File: LICENSE
|
|
@@ -56,35 +57,6 @@ Requires-Dist: peft>=0.17; extra == "aigc"
|
|
|
56
57
|
Requires-Dist: torch; extra == "aigc"
|
|
57
58
|
Requires-Dist: torchvision; extra == "aigc"
|
|
58
59
|
Provides-Extra: all
|
|
59
|
-
Requires-Dist: colorlog; extra == "all"
|
|
60
|
-
Requires-Dist: datasets==3.6.0; extra == "all"
|
|
61
|
-
Requires-Dist: docstring-parser; extra == "all"
|
|
62
|
-
Requires-Dist: dotenv; extra == "all"
|
|
63
|
-
Requires-Dist: jieba; extra == "all"
|
|
64
|
-
Requires-Dist: jsonlines; extra == "all"
|
|
65
|
-
Requires-Dist: langdetect; extra == "all"
|
|
66
|
-
Requires-Dist: latex2sympy2-extended[antlr4_9_3]; extra == "all"
|
|
67
|
-
Requires-Dist: matplotlib; extra == "all"
|
|
68
|
-
Requires-Dist: modelscope[framework]>=1.27; extra == "all"
|
|
69
|
-
Requires-Dist: nltk>=3.9; extra == "all"
|
|
70
|
-
Requires-Dist: openai; extra == "all"
|
|
71
|
-
Requires-Dist: overrides; extra == "all"
|
|
72
|
-
Requires-Dist: pandas; extra == "all"
|
|
73
|
-
Requires-Dist: pillow; extra == "all"
|
|
74
|
-
Requires-Dist: pydantic; extra == "all"
|
|
75
|
-
Requires-Dist: pyyaml>=5.1; extra == "all"
|
|
76
|
-
Requires-Dist: requests; extra == "all"
|
|
77
|
-
Requires-Dist: rich; extra == "all"
|
|
78
|
-
Requires-Dist: rouge-chinese; extra == "all"
|
|
79
|
-
Requires-Dist: rouge-score>=0.1.0; extra == "all"
|
|
80
|
-
Requires-Dist: sacrebleu; extra == "all"
|
|
81
|
-
Requires-Dist: scikit-learn; extra == "all"
|
|
82
|
-
Requires-Dist: seaborn; extra == "all"
|
|
83
|
-
Requires-Dist: sympy; extra == "all"
|
|
84
|
-
Requires-Dist: tabulate; extra == "all"
|
|
85
|
-
Requires-Dist: tqdm; extra == "all"
|
|
86
|
-
Requires-Dist: transformers>=4.33; extra == "all"
|
|
87
|
-
Requires-Dist: word2number; extra == "all"
|
|
88
60
|
Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
|
|
89
61
|
Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
|
|
90
62
|
Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
|
|
@@ -99,6 +71,7 @@ Requires-Dist: aiohttp; extra == "all"
|
|
|
99
71
|
Requires-Dist: fastapi; extra == "all"
|
|
100
72
|
Requires-Dist: jinja2; extra == "all"
|
|
101
73
|
Requires-Dist: numpy; extra == "all"
|
|
74
|
+
Requires-Dist: rich; extra == "all"
|
|
102
75
|
Requires-Dist: sse-starlette; extra == "all"
|
|
103
76
|
Requires-Dist: transformers; extra == "all"
|
|
104
77
|
Requires-Dist: uvicorn; extra == "all"
|
|
@@ -266,7 +239,8 @@ Please scan the QR code below to join our community groups:
|
|
|
266
239
|
> **Version 1.0 Refactoring**
|
|
267
240
|
>
|
|
268
241
|
> Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
|
|
269
|
-
|
|
242
|
+
- 🔥 **[2025.10.14]** Added support for OCRBench, OCRBench-v2, DocVQA, InfoVQA, ChartQA, and BLINK multimodal image-text evaluation benchmarks.
|
|
243
|
+
- 🔥 **[2025.09.22]** Code evaluation benchmarks (HumanEval, LiveCodeBench) now support running in a sandbox environment. To use this feature, please install [ms-enclave](https://github.com/modelscope/ms-enclave) first.
|
|
270
244
|
- 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
|
|
271
245
|
- 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
|
|
272
246
|
- 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
|
|
@@ -4,14 +4,14 @@ evalscope/config.py,sha256=S2N11-AxQkT7lVffpjXdtpT4QpnSP6th-c8I-501mwM,11507
|
|
|
4
4
|
evalscope/constants.py,sha256=W3E4Jp-x6qxvPOYtU9bNlzlERFvSAA_3F007apIwUlU,3601
|
|
5
5
|
evalscope/run.py,sha256=A9_7pR3FiA-It46A3Mqk7ce6fQy548p0ux2QUugj2hI,6531
|
|
6
6
|
evalscope/summarizer.py,sha256=HUDJ1zKi22uNst3AUfX67Z0sHzeZy-4S8sYyvxJnBzc,5901
|
|
7
|
-
evalscope/version.py,sha256=
|
|
7
|
+
evalscope/version.py,sha256=hqGJMtjd3F6yPJucqhuYtXuGYSumthFmroHsUTY761Y,118
|
|
8
8
|
evalscope/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
evalscope/api/registry.py,sha256=Qk0KMGDbt-iI0-OfoJZbOtxt76qreAVWh36HOoQAKM4,5448
|
|
10
10
|
evalscope/api/benchmark/__init__.py,sha256=9xcTxpcQ6HhZ0QDwEIZhAT5IjybzaJ60VGLcmaFE5dU,188
|
|
11
11
|
evalscope/api/benchmark/benchmark.py,sha256=gqAM81SeGb_Q0rA6Q-LFpnNkOUiwOj43aRWECtCxAOE,10832
|
|
12
12
|
evalscope/api/benchmark/meta.py,sha256=N4u8NQjkjIw-xaf6KFnb6C8JDKB0DLbsXyXblDqIpvE,4304
|
|
13
13
|
evalscope/api/benchmark/adapters/__init__.py,sha256=uLt_GiU4s-_6Rjgmr4OUTtE7dvEX-ZIQ403fd6oNuxA,264
|
|
14
|
-
evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=
|
|
14
|
+
evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=RWDweSmXKGv5hPPjeV4VF76gbKqYJEsab_lQYGUM2PA,28785
|
|
15
15
|
evalscope/api/benchmark/adapters/image_edit_adapter.py,sha256=06V-_A8RKuMNYMt7-vaXn2qBa9LIZgfFO_6PUuhAkh0,3052
|
|
16
16
|
evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=auqLNvF50Or9bo3LOmQLXHfFaTTCTqvQzZog3glInng,3062
|
|
17
17
|
evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=jO64hwjQexIv-MTyHH0Ffp_6p--9TKufOmX_U39mAnE,6385
|
|
@@ -50,15 +50,15 @@ evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,69
|
|
|
50
50
|
evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
|
|
51
51
|
evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
|
|
52
52
|
evalscope/app/ui/app_ui.py,sha256=wLrQ4VM7BnzvaYmPAk8NH9t5BaWooHFJcgmAOOd2I1w,2032
|
|
53
|
-
evalscope/app/ui/multi_model.py,sha256=
|
|
53
|
+
evalscope/app/ui/multi_model.py,sha256=mvMgpgiJGRrNRtReFcD_PiLatq-81zp65Vb3JYUP3PE,15356
|
|
54
54
|
evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
|
|
55
|
-
evalscope/app/ui/single_model.py,sha256=
|
|
55
|
+
evalscope/app/ui/single_model.py,sha256=zFt1uDYrcgNJ7e_YLigrs6IXT3jyGMVn-7rv4CHAZvE,9741
|
|
56
56
|
evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
|
|
57
|
-
evalscope/app/utils/data_utils.py,sha256=
|
|
57
|
+
evalscope/app/utils/data_utils.py,sha256=GYOfkh0NoueeX3od-L852Q9C9SSkEFlW_40wjPa5b9w,7470
|
|
58
58
|
evalscope/app/utils/env_utils.py,sha256=2pmz4uNun-XNP6TqM6Oe576XopweEClhBaIdWO--kd0,382
|
|
59
59
|
evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
|
|
60
60
|
evalscope/app/utils/text_utils.py,sha256=-K-hRPMZ29Yqjhzd-391gPaD4B4wUuIg71PfbLnGJ38,3754
|
|
61
|
-
evalscope/app/utils/visualization.py,sha256=
|
|
61
|
+
evalscope/app/utils/visualization.py,sha256=lycwcr-kFT2FKVw6iWMh3iD_n4dqpWVzhXMLDnkN8QY,3563
|
|
62
62
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
64
64
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -109,7 +109,7 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
109
109
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
|
|
110
110
|
evalscope/benchmarks/__init__.py,sha256=WHR4ej9Tqa2N9CyIaUWXS8EnHZtcujaNeg9hf8GT31Y,1182
|
|
111
111
|
evalscope/benchmarks/ai2d/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
|
-
evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=
|
|
112
|
+
evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=qnQT2E0ZG8g4noOafu-QvBOKm-zEJ5X08QHw3ekNa4w,2473
|
|
113
113
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
114
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=HTlriHoHzlm1Rf3KAiGRLs8sx6Gyf6s7RGtOjk_hGS4,1767
|
|
115
115
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=ZOE_6Zhg1MatWJSu2Zq372nKUODYtNFZimS1MJRFz5A,1591
|
|
@@ -152,10 +152,15 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
152
152
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
153
153
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
154
154
|
evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
|
-
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=
|
|
155
|
+
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ZmwGylqXCAcpJ8glQmj7HkDa8OqE9KODiHvWelTGLIo,17033
|
|
156
156
|
evalscope/benchmarks/bfcl/generation.py,sha256=c6lNjo-VTSUrVg-pqyPSucrbCKBOdBSyN0aR5AAtE4A,8701
|
|
157
|
+
evalscope/benchmarks/blink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
158
|
+
evalscope/benchmarks/blink/blink_adapter.py,sha256=ocQKsDGwnUAg2si2p7tqIGeH3PKPqTSByjbt7ceraRo,2642
|
|
157
159
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
158
160
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=4FLPgY-UtqINafnNxfOsE9AwS6GFXFCUGOBI-4EZUGk,8503
|
|
161
|
+
evalscope/benchmarks/chartqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
162
|
+
evalscope/benchmarks/chartqa/chartqa_adapter.py,sha256=DA1kthMUvn4_GUfdRfuR-au3RkhE3WKPnR_f8nlhd4c,2813
|
|
163
|
+
evalscope/benchmarks/chartqa/utils.py,sha256=Ta9ZUMpIqzrAszju7_WOMBAlilH1Tx6TCheVpjrZJJI,1672
|
|
159
164
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
160
165
|
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=OWzRlSGswV24V-heLqqo7GQzpJp01TZ0DhFHq0iUP9A,8238
|
|
161
166
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -167,6 +172,8 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=eetF21dN5
|
|
|
167
172
|
evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
168
173
|
evalscope/benchmarks/docmath/docmath_adapter.py,sha256=-mel6hA-x_e7fV0uOHdX5BpoQEVyQ5VqwIwEqSNDpnc,4623
|
|
169
174
|
evalscope/benchmarks/docmath/utils.py,sha256=d6Yjoa5q91kjr1SdVPVBndzDaUzMlO_GfEqMtUXXr0s,7707
|
|
175
|
+
evalscope/benchmarks/docvqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
176
|
+
evalscope/benchmarks/docvqa/docvqa_adapter.py,sha256=xGaayycILYoLd8r6wLLppDbU6Z1FdafbYFyjLHaftAA,2882
|
|
170
177
|
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
178
|
evalscope/benchmarks/drop/drop_adapter.py,sha256=Jbbr5O_Y5LI_vT_RskRQVKxGkiIraX_uXP7fYaZ5eZs,9995
|
|
172
179
|
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
@@ -175,7 +182,7 @@ evalscope/benchmarks/frames/frames_adapter.py,sha256=w1kRya7w5omt95HHE6AzbzYVhyT
|
|
|
175
182
|
evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
|
|
176
183
|
evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
177
184
|
evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=DzJaokqZwR2L8HDiahss8EbQ3vcsMXkzkMghxU-uAOo,21639
|
|
178
|
-
evalscope/benchmarks/general_arena/utils.py,sha256=
|
|
185
|
+
evalscope/benchmarks/general_arena/utils.py,sha256=p6pZfvdNCMOU_vWHm_DYU57Sa2WTDdFOkVBubblCRN4,6912
|
|
179
186
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
180
187
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=7VKg_EzXkRvoWpR7h8qB4sVVb1eZHCGcPk-X_NMS5tE,2062
|
|
181
188
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -191,7 +198,7 @@ evalscope/benchmarks/healthbench/utils.py,sha256=M8SnOEhlqXWm03CFE6CAtbMiu6MqdGg
|
|
|
191
198
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
192
199
|
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=tAe63NfV5ljUm1f4RTSFxWOVKBUhk3Cc0EGzF5uYLK4,2041
|
|
193
200
|
evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
194
|
-
evalscope/benchmarks/hle/hle_adapter.py,sha256=
|
|
201
|
+
evalscope/benchmarks/hle/hle_adapter.py,sha256=kJP7bzIDbr82GKi0FTy2zf_j1UWNBfuXYzokYJ-S9WE,6410
|
|
195
202
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
196
203
|
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=uLs3UHSALS3YHt0qzBismrIqdEUgbEalQbjC0CU7ym4,4085
|
|
197
204
|
evalscope/benchmarks/humaneval/utils.py,sha256=rPnc_JuSjNg9aV7UMUwsLrDlm-ufj64GNIBCWBeuRcM,6517
|
|
@@ -206,6 +213,8 @@ evalscope/benchmarks/image_edit/gedit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
|
|
|
206
213
|
evalscope/benchmarks/image_edit/gedit/gedit_adapter.py,sha256=a6hhRbnGCvMEMsbnSbczjXd4vHfMVEnFfP459FCF_Mc,5250
|
|
207
214
|
evalscope/benchmarks/image_edit/gedit/utils.py,sha256=UN0z9Dafs8d8lEXqxin321d8smiS3H9p3gyLkZFPFNg,14735
|
|
208
215
|
evalscope/benchmarks/image_edit/gedit/vie_prompts.py,sha256=qVXWQyVUwZxEasDjVmYBk30_JI4gnvHacMOmMsA4wcI,22056
|
|
216
|
+
evalscope/benchmarks/infovqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
217
|
+
evalscope/benchmarks/infovqa/infovqa_adapter.py,sha256=3m_EvfRZ5ItHkz-3mVlsF_NnPS7NH1-EXwUW-s4VMxA,2617
|
|
209
218
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
210
219
|
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=mNHA_Fuj_gAdOEoR7oChnGmErf1czqwnk8Zk-jRhBys,1304
|
|
211
220
|
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -226,7 +235,7 @@ evalscope/benchmarks/math_vista/math_vista_adapter.py,sha256=Mu9BpH0rDNM0yMrGws4
|
|
|
226
235
|
evalscope/benchmarks/minerva_math/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
227
236
|
evalscope/benchmarks/minerva_math/minerva_math_adapter.py,sha256=jyT9_D4w8PTtLBN3Kn10_CnssH_mPuRNnn9rek_zUEs,1655
|
|
228
237
|
evalscope/benchmarks/mm_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
229
|
-
evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=
|
|
238
|
+
evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=ht2DVt_zEBJp4jvGy3myHHgdUUP9eff2O5BpIc9Fv74,4376
|
|
230
239
|
evalscope/benchmarks/mm_star/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
231
240
|
evalscope/benchmarks/mm_star/mm_star_adapter.py,sha256=oamLv6U2-JAK5mdVLkUgYxkOahxQkQYMRKAyu_xPAUE,2818
|
|
232
241
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -236,7 +245,7 @@ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=GtIyUubUg6Q6Ydh1Adj0-32
|
|
|
236
245
|
evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
237
246
|
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=m_37OIFrJB4ZIvtbDJ_m9P9mA2QtrNjGfbbVo15awJg,7402
|
|
238
247
|
evalscope/benchmarks/mmmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
239
|
-
evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=
|
|
248
|
+
evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=WrykWq8n61CVrQ4XQhI3iEySgErHdZyng3udOL-Pddk,6054
|
|
240
249
|
evalscope/benchmarks/mmmu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
241
250
|
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=banPS1nDt9bQ95urKbSZnR-hBTw23eL9MSrHt_0ZLp0,4725
|
|
242
251
|
evalscope/benchmarks/multi_if/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -248,6 +257,21 @@ evalscope/benchmarks/musr/musr_adapter.py,sha256=kx6bckj7Nijl4Wysuj-mKYdy0hIRDJh
|
|
|
248
257
|
evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
249
258
|
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=GYaswWPwYI3aV5HSpcuBTgW9-HDtf2xzNZg0WrsI0Yo,17033
|
|
250
259
|
evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
|
|
260
|
+
evalscope/benchmarks/ocr_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
261
|
+
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py,sha256=gkQb7g0-Lf5Sjemqs5kqogCLGFJI6YQv8-vGI1EbyLE,4392
|
|
262
|
+
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py,sha256=cBpRDJvI9f6vKRD4wTPv-8ThGddR3EhVobgjQQUAYlE,2606
|
|
263
|
+
evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py,sha256=31bL0V32Fq7prF1WoVjXmrmMdhg0qNcoiOaKykKOrZM,36528
|
|
264
|
+
evalscope/benchmarks/ocr_bench_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
265
|
+
evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py,sha256=QGY4R75UxDafIwSaOEPPuCaX3Z8BGoZVvcc6OWbeO9w,7976
|
|
266
|
+
evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py,sha256=d1nU7LNwubBd_1rIe7i67hOVcJx5IUXkqVeqt1CQzak,1624
|
|
267
|
+
evalscope/benchmarks/ocr_bench_v2/parallel.py,sha256=Q54wFSSRBp-kG2MhW4eOoXE1W9g-SDVhN8JuphDERsE,2029
|
|
268
|
+
evalscope/benchmarks/ocr_bench_v2/spotting_metric.py,sha256=nftLaTOKEmqvSWr-c20f9hyyvNnd-Hg3E46KwqmkjLc,6149
|
|
269
|
+
evalscope/benchmarks/ocr_bench_v2/utils.py,sha256=z9DSh2m1yvM3vsvxvqdHuPgRFxgdmEnzuNIuO7PAV3s,15914
|
|
270
|
+
evalscope/benchmarks/ocr_bench_v2/vqa_metric.py,sha256=XkAiXk1uE7lsWQQXvjnHXZMsga8B9FVyq5qG8ghePK4,8980
|
|
271
|
+
evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
272
|
+
evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt,sha256=QO0K9z1ethy_lgs9vaxGN1u5DnPFsssp8z62Cni24iw,1424
|
|
273
|
+
evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py,sha256=qCuqDtsCfxAiQHYLNdHU7BQ9kLIZ9iyfmRxtIrGOBck,20349
|
|
274
|
+
evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py,sha256=7HzM1PEw8wNOhmQOsZe582Y2rr4u66Q3JKVvvMasntE,19565
|
|
251
275
|
evalscope/benchmarks/olympiad_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
252
276
|
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py,sha256=zePVmGjmyuwCWVb4h1PIQKAIFqBehwRwO2WOD0KX_ik,6565
|
|
253
277
|
evalscope/benchmarks/olympiad_bench/utils.py,sha256=w7vEZcT3vCVq8_DSMgAjZPpVFVHStJPJYsPkrs-yOFM,21412
|
|
@@ -302,8 +326,8 @@ evalscope/filters/selection.py,sha256=yiJu2JjXDH_lgfEtB9umkGcA3zpo3zvnyoq2mKrXbn
|
|
|
302
326
|
evalscope/metrics/__init__.py,sha256=1giVHESSjn98uBiAvYm5uLsmRQwmf9NHPSt7OT_QJss,1615
|
|
303
327
|
evalscope/metrics/llm_judge.py,sha256=XukhH9PQtIZAcbjJlOmOD9ye3ngRv_IGKKJE9jhheOE,8653
|
|
304
328
|
evalscope/metrics/math_parser.py,sha256=BMfautQtNNiF9f2DIEfO6SXSn_GYhzaddAjGWG10MJA,17257
|
|
305
|
-
evalscope/metrics/metric.py,sha256=
|
|
306
|
-
evalscope/metrics/metrics.py,sha256=
|
|
329
|
+
evalscope/metrics/metric.py,sha256=KNp_DNi9Ntq4my5G7La7AlP2Vj1p6hIgOheAh-4go5Q,12861
|
|
330
|
+
evalscope/metrics/metrics.py,sha256=Y7TQ6MYaGE32EntTz-18CmQqYMpo1rQSvUiSwzBgpaQ,14599
|
|
307
331
|
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
308
332
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
309
333
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
|
|
@@ -381,9 +405,9 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py,sh
|
|
|
381
405
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py,sha256=OOr1JD9kTlUGXZNG5b3kvkUaNz7QTmhaGoHhIKL69qo,7613
|
|
382
406
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py,sha256=Ns7oM4KpKxWZTo8Lefe4EDFw-jzp5633zAArcWjoVZA,9772
|
|
383
407
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py,sha256=KIF5tsiE7a5dbDfa-IKwzuzMUpuEAQPrm1nWFFtAeoI,20032
|
|
384
|
-
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py,sha256=
|
|
408
|
+
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py,sha256=85ZvV2gKSnsbP5941PeJ-JJ4t8_lOYQe1EOxrHlIbNI,52728
|
|
385
409
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py,sha256=o5ykt3Q_WQlNmyxjQaS2-KPLGq1xqLZixNYam_Bs6NA,18701
|
|
386
|
-
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py,sha256=
|
|
410
|
+
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py,sha256=NPDpIRxjiroafZk5Z2uA9bC8Bi-yXY7um5HXxThF7N0,46857
|
|
387
411
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
388
412
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py,sha256=s7EkhtrIJ0LPUuLBArws8N23R1MoIoNaYUjwsbUqRkY,7994
|
|
389
413
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py,sha256=FnUyxxazEVaP69pAq9cig3j-mcX37BX-unPj0SVKUJI,3805
|
|
@@ -403,7 +427,7 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/bl
|
|
|
403
427
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py,sha256=TOAI-KaUrtKjR1GNU_WwNXNpb9gGT-KX2FYe3muv_e0,4275
|
|
404
428
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py,sha256=-DprR09KYuwNEzEbhPvFRI3MR4_VdPMUGLPN6sL9Ym8,14625
|
|
405
429
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py,sha256=S68U0DxWYGDmreRbH5yLDHBNN9PsczY9H0Uik0hO-ds,13872
|
|
406
|
-
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py,sha256=
|
|
430
|
+
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py,sha256=zv_WyHi67hvgHQ4DkZ8a4UoPcgrADKayqVtiIq-p3V4,36695
|
|
407
431
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py,sha256=p67DDiFS-676z0z8jPj6NwXwNjEsqTXaXCh3g2UiDno,840
|
|
408
432
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
|
|
409
433
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
|
|
@@ -448,8 +472,8 @@ evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3S
|
|
|
448
472
|
evalscope/perf/utils/local_server.py,sha256=_lSPlNEnOmPA_DtREgPS_vj2w_7D8PPSpypXbb0YfJM,4880
|
|
449
473
|
evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
|
|
450
474
|
evalscope/perf/utils/rich_display.py,sha256=AQmXv1EuA1-IGgco-Jy1NLOmTKv4eBFH2K4QS8OoGVo,8206
|
|
451
|
-
evalscope/report/__init__.py,sha256=
|
|
452
|
-
evalscope/report/combinator.py,sha256=
|
|
475
|
+
evalscope/report/__init__.py,sha256=xS6eeTgsPdIlIOhzUn-ND77uV34vMVug4PmXHmYAxwM,1080
|
|
476
|
+
evalscope/report/combinator.py,sha256=F7KOClXVh56-XEw3Sb5uxwA6L8ZlH_P4-MOlm3Yp_Cg,5020
|
|
453
477
|
evalscope/report/generator.py,sha256=t2R3WGa4SowTRUPOgITtyTR4QDiJ6i3FH__byDKZU8Y,4959
|
|
454
478
|
evalscope/report/report.py,sha256=lEBD_E_RJiydFTaGFNLIMTFxNrqv8QcLZb_iuUg5HB0,8479
|
|
455
479
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -494,41 +518,14 @@ evalscope/utils/deprecation_utils.py,sha256=aDv3HFNcJFZ7rxNgALQP0-ITO8L23HC_RX-C
|
|
|
494
518
|
evalscope/utils/function_utils.py,sha256=E-AIzx_PKrZDGl1cBvlvqNvMa8yM2WUJ2wh73PNBXrQ,1887
|
|
495
519
|
evalscope/utils/import_utils.py,sha256=S0WQ3gt4zpwJHjGcyC-604pWWExg3JV7f3wzoOH-tuo,5794
|
|
496
520
|
evalscope/utils/io_utils.py,sha256=79F0p7dFxA84tIVSL_C4piJgeQQtVUfb2R_Xcd8v_cE,11615
|
|
497
|
-
evalscope/utils/json_schema.py,sha256=
|
|
521
|
+
evalscope/utils/json_schema.py,sha256=GVP1m6g4mBrsFmOWOOVnmvl2joOz8gTlGEytLv5qy7s,8451
|
|
498
522
|
evalscope/utils/logger.py,sha256=roFk4Su4aJwsF0s-uYc5-tABnghwYPX3gpkA5QUGzK8,5675
|
|
499
523
|
evalscope/utils/model_utils.py,sha256=mdtYoHhUdfpxUtnS52XZjNdO3uSK4yeIBHT3aDU7s-A,2455
|
|
500
|
-
evalscope/utils/multi_choices.py,sha256=
|
|
524
|
+
evalscope/utils/multi_choices.py,sha256=0UJbgr5eXNgitPC79JLcyUU-OXg9BlM-mVk-fWtUSno,9881
|
|
501
525
|
evalscope/utils/url_utils.py,sha256=9HcFt9uZNbOJR3ADUFQ_dBFKziHV6H66Df7HYs1M4Po,1757
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
tests/benchmark/test_image_edit.py,sha256=z3z7psMRFynpVgUAFoH--ieeGXzb9cHkrq3tT_sCZo8,2165
|
|
509
|
-
tests/benchmark/test_sandbox.py,sha256=bHyX8ammdn7EsEbN80cIzDNhQZlJD3Ssoj9l4efF7rI,2968
|
|
510
|
-
tests/benchmark/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
|
|
511
|
-
tests/benchmark/test_vlm.py,sha256=gn0ledf_yPY1IhCyCtiqT_dTVPUVZ3NVPr9yzsC_UZQ,4501
|
|
512
|
-
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
513
|
-
tests/cli/test_all.py,sha256=1omOXC1lBphBLm0hTf5HNstlF_bwi16dYyr00gvaCTM,7301
|
|
514
|
-
tests/cli/test_collection.py,sha256=lGz3YUS_0gM6_HjQLe26OfBAkHOPOEDWMO-UyP58GN8,4455
|
|
515
|
-
tests/cli/test_custom.py,sha256=9z_N7Re712xI62TqVSTBdzB_iFFEUb55wcWIcGvJb84,9254
|
|
516
|
-
tests/cli/test_reasoning.py,sha256=rU181LLoKbFCpNPFCIZULxEgsJ2PYswel2pP2EsjEmo,2696
|
|
517
|
-
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
518
|
-
tests/perf/test_perf.py,sha256=ugYNEyU32ctryPFa_6fr8aQYxfHJMymdKnKKEHM9Ajc,6174
|
|
519
|
-
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
520
|
-
tests/rag/test_clip_benchmark.py,sha256=qpSLgmHMGcYTnxP7AI__y-ii5_tu_fCSht6p3TBetkA,2650
|
|
521
|
-
tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
|
|
522
|
-
tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
|
|
523
|
-
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
524
|
-
tests/swift/test_run_swift_eval.py,sha256=YbIhYNoI4kAB-ox-OXAKUifLIXTFqP-xGZicrAgK_V0,5784
|
|
525
|
-
tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4VlwL03atI,4934
|
|
526
|
-
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
|
|
527
|
-
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
528
|
-
tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
|
|
529
|
-
evalscope-1.0.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
530
|
-
evalscope-1.0.2.dist-info/METADATA,sha256=vZciS7qNosSJOdwyRSxsCyVqvw8hyqKS84yKjlbxwzw,40305
|
|
531
|
-
evalscope-1.0.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
532
|
-
evalscope-1.0.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
533
|
-
evalscope-1.0.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
534
|
-
evalscope-1.0.2.dist-info/RECORD,,
|
|
526
|
+
evalscope-1.1.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
527
|
+
evalscope-1.1.0.dist-info/METADATA,sha256=pap4NeCTqw7bec2KqYboFj25zabm1m5rwoiqukX8EO4,39544
|
|
528
|
+
evalscope-1.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
529
|
+
evalscope-1.1.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
530
|
+
evalscope-1.1.0.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
|
|
531
|
+
evalscope-1.1.0.dist-info/RECORD,,
|