evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (87)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  2. evalscope/app/ui/multi_model.py +6 -1
  3. evalscope/app/ui/single_model.py +8 -2
  4. evalscope/app/utils/data_utils.py +3 -2
  5. evalscope/app/utils/visualization.py +2 -2
  6. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  7. evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
  8. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  9. evalscope/benchmarks/chartqa/__init__.py +0 -0
  10. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  11. evalscope/benchmarks/chartqa/utils.py +38 -0
  12. evalscope/benchmarks/docvqa/__init__.py +0 -0
  13. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  14. evalscope/benchmarks/general_arena/utils.py +2 -1
  15. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  16. evalscope/benchmarks/infovqa/__init__.py +0 -0
  17. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  18. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  19. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  20. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  21. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  22. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  23. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  24. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  25. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  26. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  27. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  28. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  29. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  30. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  31. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  32. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  33. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  34. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  35. evalscope/metrics/metric.py +51 -0
  36. evalscope/metrics/metrics.py +16 -0
  37. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  38. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  39. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  40. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  41. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  42. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  43. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  44. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  46. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  48. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  49. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  50. evalscope/report/__init__.py +9 -1
  51. evalscope/report/combinator.py +52 -2
  52. evalscope/utils/json_schema.py +8 -6
  53. evalscope/utils/multi_choices.py +16 -1
  54. evalscope/version.py +2 -2
  55. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
  56. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
  57. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  58. tests/__init__.py +0 -1
  59. tests/benchmark/__init__.py +0 -1
  60. tests/benchmark/test_eval.py +0 -429
  61. tests/benchmark/test_image_edit.py +0 -65
  62. tests/benchmark/test_sandbox.py +0 -81
  63. tests/benchmark/test_t2i.py +0 -142
  64. tests/benchmark/test_vlm.py +0 -137
  65. tests/cli/__init__.py +0 -1
  66. tests/cli/test_all.py +0 -269
  67. tests/cli/test_collection.py +0 -99
  68. tests/cli/test_custom.py +0 -268
  69. tests/cli/test_reasoning.py +0 -81
  70. tests/common.py +0 -73
  71. tests/perf/__init__.py +0 -1
  72. tests/perf/test_perf.py +0 -206
  73. tests/rag/test_clip_benchmark.py +0 -87
  74. tests/rag/test_mteb.py +0 -213
  75. tests/rag/test_ragas.py +0 -128
  76. tests/swift/__init__.py +0 -1
  77. tests/swift/test_run_swift_eval.py +0 -146
  78. tests/swift/test_run_swift_vlm_eval.py +0 -128
  79. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  80. tests/test_run_all.py +0 -12
  81. tests/utils.py +0 -13
  82. tests/vlm/__init__.py +0 -1
  83. tests/vlm/test_vlmeval.py +0 -102
  84. {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
  85. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  86. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  87. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
@@ -1,3 +1,4 @@
1
+ import json
1
2
  from collections import defaultdict
2
3
  from typing import List
3
4
 
@@ -100,6 +101,56 @@ class MultiChoiceAcc(Metric):
100
101
  return res
101
102
 
102
103
 
104
+ @register_metric(name='anls')
105
+ class ANLS(Metric):
106
+
107
+ def __init__(self, thresh_hold=0.5):
108
+ self.thresh_hold = thresh_hold
109
+
110
+ def apply(self, predictions, references):
111
+ """
112
+ Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
113
+ This implementation is adapted from
114
+ https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
115
+
116
+ Args:
117
+ references (List[str]): List of correct answers. Each answer can be a string of json.
118
+ predictions (List[str]): List of predicted answers.
119
+ """
120
+ from .metrics import levenshtein_distance
121
+
122
+ res = []
123
+ # Unwrap predictions if it's a nested list
124
+ for prediction, reference in zip(predictions, references):
125
+ # Parse the reference which is a json string
126
+ try:
127
+ answer = json.loads(reference)
128
+ except json.JSONDecodeError:
129
+ answer = reference
130
+ if isinstance(answer, str):
131
+ answer = [answer]
132
+ assert isinstance(answer, list), 'The reference answer should be a list of answers.'
133
+
134
+ # Calculate ANLS for each reference answer
135
+ values = []
136
+ for ans in answer:
137
+ # preprocess both the answers - gt and prediction
138
+ gt_answer = ' '.join(ans.strip().lower().split())
139
+ det_answer = ' '.join(prediction.strip().lower().split())
140
+
141
+ dist = levenshtein_distance(gt_answer, det_answer)
142
+ length = max(len(ans.upper()), len(prediction.upper()))
143
+ values.append(0.0 if length == 0 else float(dist) / float(length))
144
+
145
+ question_result = 0.0
146
+ if values:
147
+ question_result = 1 - min(values)
148
+ if question_result < self.thresh_hold:
149
+ question_result = 0.0
150
+ res.append(question_result)
151
+ return res
152
+
153
+
103
154
  # ##################
104
155
  # T2I Metrics ######
105
156
  ####################
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
467
467
  num_samples_it = iter(num_samples)
468
468
 
469
469
  return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
470
+
471
+
472
+ def levenshtein_distance(s1, s2):
473
+ if len(s1) > len(s2):
474
+ s1, s2 = s2, s1
475
+
476
+ distances = range(len(s1) + 1)
477
+ for i2, c2 in enumerate(s2):
478
+ distances_ = [i2 + 1]
479
+ for i1, c1 in enumerate(s1):
480
+ if c1 == c2:
481
+ distances_.append(distances[i1])
482
+ else:
483
+ distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
484
+ distances = distances_
485
+ return distances[-1]
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
30
30
  SequenceClassifierOutput,
31
31
  TokenClassifierOutput,
32
32
  )
33
- from transformers.modeling_utils import (
34
- PreTrainedModel,
35
- apply_chunking_to_forward,
36
- find_pruneable_heads_and_indices,
37
- prune_linear_layer,
38
- )
33
+ from transformers.modeling_utils import PreTrainedModel
39
34
  from transformers.models.bert.configuration_bert import BertConfig
35
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
40
36
  from transformers.utils import logging
41
37
  from typing import Any, Dict, Optional, Tuple
42
38
 
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
14
14
  BaseModelOutputWithPastAndCrossAttentions,
15
15
  BaseModelOutputWithPoolingAndCrossAttentions,
16
16
  )
17
- from transformers.modeling_utils import (
18
- PreTrainedModel,
19
- apply_chunking_to_forward,
20
- find_pruneable_heads_and_indices,
21
- prune_linear_layer,
22
- )
17
+ from transformers.modeling_utils import PreTrainedModel
23
18
  from transformers.models.bert.configuration_bert import BertConfig
19
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
24
20
  from transformers.utils import logging
25
21
  from typing import Tuple
26
22
 
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
31
31
  SequenceClassifierOutput,
32
32
  TokenClassifierOutput,
33
33
  )
34
- from transformers.modeling_utils import (
35
- PreTrainedModel,
36
- apply_chunking_to_forward,
37
- find_pruneable_heads_and_indices,
38
- prune_linear_layer,
39
- )
34
+ from transformers.modeling_utils import PreTrainedModel
40
35
  from transformers.models.bert.configuration_bert import BertConfig
36
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
41
37
  from transformers.utils import logging
42
38
  from typing import Optional, Tuple
43
39
 
@@ -4,7 +4,13 @@ from typing import TYPE_CHECKING
4
4
  from evalscope.utils.import_utils import _LazyModule
5
5
 
6
6
  if TYPE_CHECKING:
7
- from .combinator import gen_table, get_data_frame, get_report_list
7
+ from .combinator import (
8
+ gen_table,
9
+ get_data_frame,
10
+ get_report_list,
11
+ unweighted_average_from_subsets,
12
+ weighted_average_from_subsets,
13
+ )
8
14
  from .generator import ReportGenerator
9
15
  from .report import Category, Report, ReportKey, Subset
10
16
 
@@ -14,6 +20,8 @@ else:
14
20
  'gen_table',
15
21
  'get_data_frame',
16
22
  'get_report_list',
23
+ 'weighted_average_from_subsets',
24
+ 'unweighted_average_from_subsets',
17
25
  ],
18
26
  'generator': [
19
27
  'ReportGenerator',
@@ -4,9 +4,9 @@ import glob
4
4
  import os
5
5
  import pandas as pd
6
6
  from tabulate import tabulate
7
- from typing import List, Tuple
7
+ from typing import Dict, List, Tuple, Union
8
8
 
9
- from evalscope.report.report import Report
9
+ from evalscope.report.report import Report, Subset
10
10
  from evalscope.utils.logger import get_logger
11
11
 
12
12
  logger = get_logger()
@@ -86,3 +86,53 @@ def gen_table(
86
86
  add_overall_metric=add_overall_metric
87
87
  )
88
88
  return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
89
+
90
+
91
+ def weighted_average_from_subsets(
92
+ subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
93
+ ) -> Subset:
94
+ """Calculate weighted average for given subsets.
95
+
96
+ Args:
97
+ subset_names (List[str]): List of subset names to include in the average.
98
+ subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
99
+ new_name (str): Name for the resulting Subset object.
100
+
101
+ Returns:
102
+ Subset: A new Subset object with weighted average score
103
+ """
104
+ total_score = 0
105
+ total_count = 0
106
+ for name in subset_names:
107
+ if name in subset_dict:
108
+ subset = subset_dict[name]
109
+ total_score += subset.score * subset.num
110
+ total_count += subset.num
111
+
112
+ weighted_avg = total_score / total_count if total_count > 0 else 0
113
+ return Subset(name=new_name, score=weighted_avg, num=total_count)
114
+
115
+
116
+ def unweighted_average_from_subsets(
117
+ subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
118
+ ) -> Subset:
119
+ """Calculate unweighted average for given subsets.
120
+
121
+ Args:
122
+ subset_names (List[str]): List of subset names to include in the average.
123
+ subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
124
+ new_name (str): Name for the resulting Subset object.
125
+
126
+ Returns:
127
+ Subset: A new Subset object with unweighted average score
128
+ """
129
+ scores = []
130
+ total_count = 0
131
+ for name in subset_names:
132
+ if name in subset_dict:
133
+ subset = subset_dict[name]
134
+ scores.append(subset.score)
135
+ total_count += subset.num
136
+
137
+ unweighted_avg = sum(scores) / len(scores) if scores else 0
138
+ return Subset(name=new_name, score=unweighted_avg, num=total_count)
@@ -59,18 +59,20 @@ class JSONSchema(BaseModel):
59
59
  required: Optional[List[str]] = Field(default=None)
60
60
  """Required fields for object parameters."""
61
61
 
62
- @field_validator('type')
63
- def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
64
- return python_type_to_json_type(v)
65
-
66
62
  @model_validator(mode='before')
67
63
  def convert_type_before_validation(cls, values):
68
64
  values = deepcopy(values)
69
65
 
70
66
  def recursive_convert_type(obj):
71
67
  if isinstance(obj, dict):
72
- if 'type' in obj:
73
- obj['type'] = python_type_to_json_type(obj['type'])
68
+ # Convert 'type' field if it's a string
69
+ if 'type' in obj and isinstance(obj['type'], str):
70
+ try:
71
+ obj['type'] = python_type_to_json_type(obj['type'])
72
+ except ValueError:
73
+ # If conversion fails, leave it as is
74
+ pass
75
+ # Recursively process nested structures
74
76
  for k, v in obj.items():
75
77
  obj[k] = recursive_convert_type(v)
76
78
  elif isinstance(obj, list):
@@ -81,12 +81,27 @@ def answer_options(choices: Choices) -> str:
81
81
  return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])
82
82
 
83
83
 
84
+ def format_letter_choices(choices: Union[Choices, List[str]]) -> str:
85
+ """
86
+ Returns the `choices` formatted as a letter list, e.g.:
87
+
88
+ ["choice 1", "choice 2", "choice 3"] ->
89
+ "A,B,C"
90
+ """
91
+ if isinstance(choices, list):
92
+ choices = Choices(choices)
93
+
94
+ indexes = list(range(len(choices)))
95
+
96
+ return ','.join([f'{answer_character(i)}' for i in indexes])
97
+
98
+
84
99
  def prompt(question: str, choices: Union[Choices, List[str]], template: str, fewshot: Optional[str] = None) -> str:
85
100
  if isinstance(choices, list):
86
101
  choices = Choices(choices)
87
102
 
88
103
  choices_text = answer_options(choices)
89
- letters = ','.join(answer_character(i) for i in range(len(choices)))
104
+ letters = format_letter_choices(choices)
90
105
  if not fewshot:
91
106
  return template.format(
92
107
  choices=choices_text,
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '1.0.2'
4
- __release_datetime__ = '2025-09-23 18:00:00'
3
+ __version__ = '1.1.0'
4
+ __release_datetime__ = '2025-10-14 14:00:00'
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 1.0.2
3
+ Version: 1.1.0
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
- Home-page: https://github.com/modelscope/evalscope
6
5
  Author: ModelScope team
7
6
  Author-email: contact@modelscope.cn
8
7
  License: Apache License 2.0
8
+ Project-URL: Homepage, https://github.com/modelscope/evalscope
9
9
  Keywords: python,llm,evaluation
10
10
  Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Operating System :: OS Independent
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.9
14
14
  Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: License :: OSI Approved :: Apache Software License
17
18
  Requires-Python: >=3.9
18
19
  Description-Content-Type: text/markdown
19
20
  License-File: LICENSE
@@ -56,35 +57,6 @@ Requires-Dist: peft>=0.17; extra == "aigc"
56
57
  Requires-Dist: torch; extra == "aigc"
57
58
  Requires-Dist: torchvision; extra == "aigc"
58
59
  Provides-Extra: all
59
- Requires-Dist: colorlog; extra == "all"
60
- Requires-Dist: datasets==3.6.0; extra == "all"
61
- Requires-Dist: docstring-parser; extra == "all"
62
- Requires-Dist: dotenv; extra == "all"
63
- Requires-Dist: jieba; extra == "all"
64
- Requires-Dist: jsonlines; extra == "all"
65
- Requires-Dist: langdetect; extra == "all"
66
- Requires-Dist: latex2sympy2-extended[antlr4_9_3]; extra == "all"
67
- Requires-Dist: matplotlib; extra == "all"
68
- Requires-Dist: modelscope[framework]>=1.27; extra == "all"
69
- Requires-Dist: nltk>=3.9; extra == "all"
70
- Requires-Dist: openai; extra == "all"
71
- Requires-Dist: overrides; extra == "all"
72
- Requires-Dist: pandas; extra == "all"
73
- Requires-Dist: pillow; extra == "all"
74
- Requires-Dist: pydantic; extra == "all"
75
- Requires-Dist: pyyaml>=5.1; extra == "all"
76
- Requires-Dist: requests; extra == "all"
77
- Requires-Dist: rich; extra == "all"
78
- Requires-Dist: rouge-chinese; extra == "all"
79
- Requires-Dist: rouge-score>=0.1.0; extra == "all"
80
- Requires-Dist: sacrebleu; extra == "all"
81
- Requires-Dist: scikit-learn; extra == "all"
82
- Requires-Dist: seaborn; extra == "all"
83
- Requires-Dist: sympy; extra == "all"
84
- Requires-Dist: tabulate; extra == "all"
85
- Requires-Dist: tqdm; extra == "all"
86
- Requires-Dist: transformers>=4.33; extra == "all"
87
- Requires-Dist: word2number; extra == "all"
88
60
  Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
89
61
  Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
90
62
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
@@ -99,6 +71,7 @@ Requires-Dist: aiohttp; extra == "all"
99
71
  Requires-Dist: fastapi; extra == "all"
100
72
  Requires-Dist: jinja2; extra == "all"
101
73
  Requires-Dist: numpy; extra == "all"
74
+ Requires-Dist: rich; extra == "all"
102
75
  Requires-Dist: sse-starlette; extra == "all"
103
76
  Requires-Dist: transformers; extra == "all"
104
77
  Requires-Dist: uvicorn; extra == "all"
@@ -266,7 +239,8 @@ Please scan the QR code below to join our community groups:
266
239
  > **Version 1.0 Refactoring**
267
240
  >
268
241
  > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
269
-
242
+ - 🔥 **[2025.10.14]** Added support for OCRBench, OCRBench-v2, DocVQA, InfoVQA, ChartQA, and BLINK multimodal image-text evaluation benchmarks.
243
+ - 🔥 **[2025.09.22]** Code evaluation benchmarks (HumanEval, LiveCodeBench) now support running in a sandbox environment. To use this feature, please install [ms-enclave](https://github.com/modelscope/ms-enclave) first.
270
244
  - 🔥 **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
271
245
  - 🔥 **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
272
246
  - 🔥 **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
@@ -4,14 +4,14 @@ evalscope/config.py,sha256=S2N11-AxQkT7lVffpjXdtpT4QpnSP6th-c8I-501mwM,11507
4
4
  evalscope/constants.py,sha256=W3E4Jp-x6qxvPOYtU9bNlzlERFvSAA_3F007apIwUlU,3601
5
5
  evalscope/run.py,sha256=A9_7pR3FiA-It46A3Mqk7ce6fQy548p0ux2QUugj2hI,6531
6
6
  evalscope/summarizer.py,sha256=HUDJ1zKi22uNst3AUfX67Z0sHzeZy-4S8sYyvxJnBzc,5901
7
- evalscope/version.py,sha256=H_zHGJkiB6equdW6Jo4F_hhdLYKZqriowav05O5_CeY,118
7
+ evalscope/version.py,sha256=hqGJMtjd3F6yPJucqhuYtXuGYSumthFmroHsUTY761Y,118
8
8
  evalscope/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  evalscope/api/registry.py,sha256=Qk0KMGDbt-iI0-OfoJZbOtxt76qreAVWh36HOoQAKM4,5448
10
10
  evalscope/api/benchmark/__init__.py,sha256=9xcTxpcQ6HhZ0QDwEIZhAT5IjybzaJ60VGLcmaFE5dU,188
11
11
  evalscope/api/benchmark/benchmark.py,sha256=gqAM81SeGb_Q0rA6Q-LFpnNkOUiwOj43aRWECtCxAOE,10832
12
12
  evalscope/api/benchmark/meta.py,sha256=N4u8NQjkjIw-xaf6KFnb6C8JDKB0DLbsXyXblDqIpvE,4304
13
13
  evalscope/api/benchmark/adapters/__init__.py,sha256=uLt_GiU4s-_6Rjgmr4OUTtE7dvEX-ZIQ403fd6oNuxA,264
14
- evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=WS4Pm0pk51Se196Ho31FmOqGyOajTtUGbbjWD9U7UwU,28064
14
+ evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=RWDweSmXKGv5hPPjeV4VF76gbKqYJEsab_lQYGUM2PA,28785
15
15
  evalscope/api/benchmark/adapters/image_edit_adapter.py,sha256=06V-_A8RKuMNYMt7-vaXn2qBa9LIZgfFO_6PUuhAkh0,3052
16
16
  evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=auqLNvF50Or9bo3LOmQLXHfFaTTCTqvQzZog3glInng,3062
17
17
  evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=jO64hwjQexIv-MTyHH0Ffp_6p--9TKufOmX_U39mAnE,6385
@@ -50,15 +50,15 @@ evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,69
50
50
  evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
51
51
  evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
52
52
  evalscope/app/ui/app_ui.py,sha256=wLrQ4VM7BnzvaYmPAk8NH9t5BaWooHFJcgmAOOd2I1w,2032
53
- evalscope/app/ui/multi_model.py,sha256=fO8z-ZFucWtgaKmuQ50AkUp4BoYOFqOkxeTBUUAK0bM,15122
53
+ evalscope/app/ui/multi_model.py,sha256=mvMgpgiJGRrNRtReFcD_PiLatq-81zp65Vb3JYUP3PE,15356
54
54
  evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
55
- evalscope/app/ui/single_model.py,sha256=1rgYrJOO75fJG2pa74tzEocO_91jXOAKFQAUViBcYFk,9459
55
+ evalscope/app/ui/single_model.py,sha256=zFt1uDYrcgNJ7e_YLigrs6IXT3jyGMVn-7rv4CHAZvE,9741
56
56
  evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
57
- evalscope/app/utils/data_utils.py,sha256=m7Z0Us_josUFseI8VJpIp8QaYeLnu91E2HCZ8WSB07E,7396
57
+ evalscope/app/utils/data_utils.py,sha256=GYOfkh0NoueeX3od-L852Q9C9SSkEFlW_40wjPa5b9w,7470
58
58
  evalscope/app/utils/env_utils.py,sha256=2pmz4uNun-XNP6TqM6Oe576XopweEClhBaIdWO--kd0,382
59
59
  evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
60
60
  evalscope/app/utils/text_utils.py,sha256=-K-hRPMZ29Yqjhzd-391gPaD4B4wUuIg71PfbLnGJ38,3754
61
- evalscope/app/utils/visualization.py,sha256=dwEXbGfY7vFysnL0HmrHS2BEWaJkg-dZ9ayDlRhdvv4,3559
61
+ evalscope/app/utils/visualization.py,sha256=lycwcr-kFT2FKVw6iWMh3iD_n4dqpWVzhXMLDnkN8QY,3563
62
62
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
64
64
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -109,7 +109,7 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
109
109
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
110
110
  evalscope/benchmarks/__init__.py,sha256=WHR4ej9Tqa2N9CyIaUWXS8EnHZtcujaNeg9hf8GT31Y,1182
111
111
  evalscope/benchmarks/ai2d/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
- evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=3GBNV4cNv9bBLJRdG_uA9qNhuN6qAEutHl8d-rsFpFU,2018
112
+ evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=qnQT2E0ZG8g4noOafu-QvBOKm-zEJ5X08QHw3ekNa4w,2473
113
113
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
114
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=HTlriHoHzlm1Rf3KAiGRLs8sx6Gyf6s7RGtOjk_hGS4,1767
115
115
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=ZOE_6Zhg1MatWJSu2Zq372nKUODYtNFZimS1MJRFz5A,1591
@@ -152,10 +152,15 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
152
152
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
153
153
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
154
154
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=WzpL7XWDdx-EvbLluIOiMlADTO42CYs0IwQFvIfhTI0,18402
155
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ZmwGylqXCAcpJ8glQmj7HkDa8OqE9KODiHvWelTGLIo,17033
156
156
  evalscope/benchmarks/bfcl/generation.py,sha256=c6lNjo-VTSUrVg-pqyPSucrbCKBOdBSyN0aR5AAtE4A,8701
157
+ evalscope/benchmarks/blink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
+ evalscope/benchmarks/blink/blink_adapter.py,sha256=ocQKsDGwnUAg2si2p7tqIGeH3PKPqTSByjbt7ceraRo,2642
157
159
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
158
160
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=4FLPgY-UtqINafnNxfOsE9AwS6GFXFCUGOBI-4EZUGk,8503
161
+ evalscope/benchmarks/chartqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
+ evalscope/benchmarks/chartqa/chartqa_adapter.py,sha256=DA1kthMUvn4_GUfdRfuR-au3RkhE3WKPnR_f8nlhd4c,2813
163
+ evalscope/benchmarks/chartqa/utils.py,sha256=Ta9ZUMpIqzrAszju7_WOMBAlilH1Tx6TCheVpjrZJJI,1672
159
164
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
160
165
  evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=OWzRlSGswV24V-heLqqo7GQzpJp01TZ0DhFHq0iUP9A,8238
161
166
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -167,6 +172,8 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=eetF21dN5
167
172
  evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
173
  evalscope/benchmarks/docmath/docmath_adapter.py,sha256=-mel6hA-x_e7fV0uOHdX5BpoQEVyQ5VqwIwEqSNDpnc,4623
169
174
  evalscope/benchmarks/docmath/utils.py,sha256=d6Yjoa5q91kjr1SdVPVBndzDaUzMlO_GfEqMtUXXr0s,7707
175
+ evalscope/benchmarks/docvqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
+ evalscope/benchmarks/docvqa/docvqa_adapter.py,sha256=xGaayycILYoLd8r6wLLppDbU6Z1FdafbYFyjLHaftAA,2882
170
177
  evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
178
  evalscope/benchmarks/drop/drop_adapter.py,sha256=Jbbr5O_Y5LI_vT_RskRQVKxGkiIraX_uXP7fYaZ5eZs,9995
172
179
  evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
@@ -175,7 +182,7 @@ evalscope/benchmarks/frames/frames_adapter.py,sha256=w1kRya7w5omt95HHE6AzbzYVhyT
175
182
  evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
176
183
  evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
177
184
  evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=DzJaokqZwR2L8HDiahss8EbQ3vcsMXkzkMghxU-uAOo,21639
178
- evalscope/benchmarks/general_arena/utils.py,sha256=zS4l1RKwvl0Z9Mk7kth9WVQGHTgE_aNDZa_XNy9tGyM,6874
185
+ evalscope/benchmarks/general_arena/utils.py,sha256=p6pZfvdNCMOU_vWHm_DYU57Sa2WTDdFOkVBubblCRN4,6912
179
186
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
180
187
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=7VKg_EzXkRvoWpR7h8qB4sVVb1eZHCGcPk-X_NMS5tE,2062
181
188
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -191,7 +198,7 @@ evalscope/benchmarks/healthbench/utils.py,sha256=M8SnOEhlqXWm03CFE6CAtbMiu6MqdGg
191
198
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
192
199
  evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=tAe63NfV5ljUm1f4RTSFxWOVKBUhk3Cc0EGzF5uYLK4,2041
193
200
  evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
194
- evalscope/benchmarks/hle/hle_adapter.py,sha256=4YVmETL9mEiLxF4vWRjePLyFaxelax6nOaqoAH5ZxmU,6389
201
+ evalscope/benchmarks/hle/hle_adapter.py,sha256=kJP7bzIDbr82GKi0FTy2zf_j1UWNBfuXYzokYJ-S9WE,6410
195
202
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
196
203
  evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=uLs3UHSALS3YHt0qzBismrIqdEUgbEalQbjC0CU7ym4,4085
197
204
  evalscope/benchmarks/humaneval/utils.py,sha256=rPnc_JuSjNg9aV7UMUwsLrDlm-ufj64GNIBCWBeuRcM,6517
@@ -206,6 +213,8 @@ evalscope/benchmarks/image_edit/gedit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
206
213
  evalscope/benchmarks/image_edit/gedit/gedit_adapter.py,sha256=a6hhRbnGCvMEMsbnSbczjXd4vHfMVEnFfP459FCF_Mc,5250
207
214
  evalscope/benchmarks/image_edit/gedit/utils.py,sha256=UN0z9Dafs8d8lEXqxin321d8smiS3H9p3gyLkZFPFNg,14735
208
215
  evalscope/benchmarks/image_edit/gedit/vie_prompts.py,sha256=qVXWQyVUwZxEasDjVmYBk30_JI4gnvHacMOmMsA4wcI,22056
216
+ evalscope/benchmarks/infovqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
217
+ evalscope/benchmarks/infovqa/infovqa_adapter.py,sha256=3m_EvfRZ5ItHkz-3mVlsF_NnPS7NH1-EXwUW-s4VMxA,2617
209
218
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
210
219
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=mNHA_Fuj_gAdOEoR7oChnGmErf1czqwnk8Zk-jRhBys,1304
211
220
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -226,7 +235,7 @@ evalscope/benchmarks/math_vista/math_vista_adapter.py,sha256=Mu9BpH0rDNM0yMrGws4
226
235
  evalscope/benchmarks/minerva_math/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
227
236
  evalscope/benchmarks/minerva_math/minerva_math_adapter.py,sha256=jyT9_D4w8PTtLBN3Kn10_CnssH_mPuRNnn9rek_zUEs,1655
228
237
  evalscope/benchmarks/mm_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
229
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=py0DakGQX1JE2rqYjYN9w_-H0DtQ-YqG5k2s_UzbxxU,4372
238
+ evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=ht2DVt_zEBJp4jvGy3myHHgdUUP9eff2O5BpIc9Fv74,4376
230
239
  evalscope/benchmarks/mm_star/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
231
240
  evalscope/benchmarks/mm_star/mm_star_adapter.py,sha256=oamLv6U2-JAK5mdVLkUgYxkOahxQkQYMRKAyu_xPAUE,2818
232
241
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -236,7 +245,7 @@ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=GtIyUubUg6Q6Ydh1Adj0-32
236
245
  evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
237
246
  evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=m_37OIFrJB4ZIvtbDJ_m9P9mA2QtrNjGfbbVo15awJg,7402
238
247
  evalscope/benchmarks/mmmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
239
- evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=C7UM6HvomcA_Srf7771S0CaUvifBX63i161XaacraGQ,6038
248
+ evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=WrykWq8n61CVrQ4XQhI3iEySgErHdZyng3udOL-Pddk,6054
240
249
  evalscope/benchmarks/mmmu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
241
250
  evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=banPS1nDt9bQ95urKbSZnR-hBTw23eL9MSrHt_0ZLp0,4725
242
251
  evalscope/benchmarks/multi_if/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -248,6 +257,21 @@ evalscope/benchmarks/musr/musr_adapter.py,sha256=kx6bckj7Nijl4Wysuj-mKYdy0hIRDJh
248
257
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
249
258
  evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=GYaswWPwYI3aV5HSpcuBTgW9-HDtf2xzNZg0WrsI0Yo,17033
250
259
  evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
260
+ evalscope/benchmarks/ocr_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
261
+ evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py,sha256=gkQb7g0-Lf5Sjemqs5kqogCLGFJI6YQv8-vGI1EbyLE,4392
262
+ evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py,sha256=cBpRDJvI9f6vKRD4wTPv-8ThGddR3EhVobgjQQUAYlE,2606
263
+ evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py,sha256=31bL0V32Fq7prF1WoVjXmrmMdhg0qNcoiOaKykKOrZM,36528
264
+ evalscope/benchmarks/ocr_bench_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
265
+ evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py,sha256=QGY4R75UxDafIwSaOEPPuCaX3Z8BGoZVvcc6OWbeO9w,7976
266
+ evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py,sha256=d1nU7LNwubBd_1rIe7i67hOVcJx5IUXkqVeqt1CQzak,1624
267
+ evalscope/benchmarks/ocr_bench_v2/parallel.py,sha256=Q54wFSSRBp-kG2MhW4eOoXE1W9g-SDVhN8JuphDERsE,2029
268
+ evalscope/benchmarks/ocr_bench_v2/spotting_metric.py,sha256=nftLaTOKEmqvSWr-c20f9hyyvNnd-Hg3E46KwqmkjLc,6149
269
+ evalscope/benchmarks/ocr_bench_v2/utils.py,sha256=z9DSh2m1yvM3vsvxvqdHuPgRFxgdmEnzuNIuO7PAV3s,15914
270
+ evalscope/benchmarks/ocr_bench_v2/vqa_metric.py,sha256=XkAiXk1uE7lsWQQXvjnHXZMsga8B9FVyq5qG8ghePK4,8980
271
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
272
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt,sha256=QO0K9z1ethy_lgs9vaxGN1u5DnPFsssp8z62Cni24iw,1424
273
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py,sha256=qCuqDtsCfxAiQHYLNdHU7BQ9kLIZ9iyfmRxtIrGOBck,20349
274
+ evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py,sha256=7HzM1PEw8wNOhmQOsZe582Y2rr4u66Q3JKVvvMasntE,19565
251
275
  evalscope/benchmarks/olympiad_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
252
276
  evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py,sha256=zePVmGjmyuwCWVb4h1PIQKAIFqBehwRwO2WOD0KX_ik,6565
253
277
  evalscope/benchmarks/olympiad_bench/utils.py,sha256=w7vEZcT3vCVq8_DSMgAjZPpVFVHStJPJYsPkrs-yOFM,21412
@@ -302,8 +326,8 @@ evalscope/filters/selection.py,sha256=yiJu2JjXDH_lgfEtB9umkGcA3zpo3zvnyoq2mKrXbn
302
326
  evalscope/metrics/__init__.py,sha256=1giVHESSjn98uBiAvYm5uLsmRQwmf9NHPSt7OT_QJss,1615
303
327
  evalscope/metrics/llm_judge.py,sha256=XukhH9PQtIZAcbjJlOmOD9ye3ngRv_IGKKJE9jhheOE,8653
304
328
  evalscope/metrics/math_parser.py,sha256=BMfautQtNNiF9f2DIEfO6SXSn_GYhzaddAjGWG10MJA,17257
305
- evalscope/metrics/metric.py,sha256=CabKKEbw_DptyH1ZQju7WzjB47fWUKdOhFB1ROpUC-4,10871
306
- evalscope/metrics/metrics.py,sha256=VxAggzEfaLKxWcXyuve8QbEBwV2W71udVyt0gynzGec,14134
329
+ evalscope/metrics/metric.py,sha256=KNp_DNi9Ntq4my5G7La7AlP2Vj1p6hIgOheAh-4go5Q,12861
330
+ evalscope/metrics/metrics.py,sha256=Y7TQ6MYaGE32EntTz-18CmQqYMpo1rQSvUiSwzBgpaQ,14599
307
331
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
308
332
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
309
333
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
@@ -381,9 +405,9 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py,sh
381
405
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py,sha256=OOr1JD9kTlUGXZNG5b3kvkUaNz7QTmhaGoHhIKL69qo,7613
382
406
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py,sha256=Ns7oM4KpKxWZTo8Lefe4EDFw-jzp5633zAArcWjoVZA,9772
383
407
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py,sha256=KIF5tsiE7a5dbDfa-IKwzuzMUpuEAQPrm1nWFFtAeoI,20032
384
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py,sha256=uhaehowhTqRhQtq_dVCgF-9Iu4yU19AMxx2sJimYwlA,52711
408
+ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py,sha256=85ZvV2gKSnsbP5941PeJ-JJ4t8_lOYQe1EOxrHlIbNI,52728
385
409
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py,sha256=o5ykt3Q_WQlNmyxjQaS2-KPLGq1xqLZixNYam_Bs6NA,18701
386
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py,sha256=aBKdQQS7cHMPgYqIknCdHCZ7j2_QLACPn_jU_njiMIs,46840
410
+ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py,sha256=NPDpIRxjiroafZk5Z2uA9bC8Bi-yXY7um5HXxThF7N0,46857
387
411
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
388
412
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py,sha256=s7EkhtrIJ0LPUuLBArws8N23R1MoIoNaYUjwsbUqRkY,7994
389
413
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py,sha256=FnUyxxazEVaP69pAq9cig3j-mcX37BX-unPj0SVKUJI,3805
@@ -403,7 +427,7 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/bl
403
427
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py,sha256=TOAI-KaUrtKjR1GNU_WwNXNpb9gGT-KX2FYe3muv_e0,4275
404
428
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py,sha256=-DprR09KYuwNEzEbhPvFRI3MR4_VdPMUGLPN6sL9Ym8,14625
405
429
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py,sha256=S68U0DxWYGDmreRbH5yLDHBNN9PsczY9H0Uik0hO-ds,13872
406
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py,sha256=i1XlJe_PTSmiPkZKIhUXC_lc0-z2ewNYo4W1DvZQxjY,36678
430
+ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py,sha256=zv_WyHi67hvgHQ4DkZ8a4UoPcgrADKayqVtiIq-p3V4,36695
407
431
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py,sha256=p67DDiFS-676z0z8jPj6NwXwNjEsqTXaXCh3g2UiDno,840
408
432
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
409
433
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
@@ -448,8 +472,8 @@ evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3S
448
472
  evalscope/perf/utils/local_server.py,sha256=_lSPlNEnOmPA_DtREgPS_vj2w_7D8PPSpypXbb0YfJM,4880
449
473
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
450
474
  evalscope/perf/utils/rich_display.py,sha256=AQmXv1EuA1-IGgco-Jy1NLOmTKv4eBFH2K4QS8OoGVo,8206
451
- evalscope/report/__init__.py,sha256=DTigCg9fkU_zGNDqIaZy3CWYbrlvODvCxCTVqSx6ano,875
452
- evalscope/report/combinator.py,sha256=Xzlhs7kwfI6cgs7rngxhvsur0bCJkrM0tAy6isq2VME,3235
475
+ evalscope/report/__init__.py,sha256=xS6eeTgsPdIlIOhzUn-ND77uV34vMVug4PmXHmYAxwM,1080
476
+ evalscope/report/combinator.py,sha256=F7KOClXVh56-XEw3Sb5uxwA6L8ZlH_P4-MOlm3Yp_Cg,5020
453
477
  evalscope/report/generator.py,sha256=t2R3WGa4SowTRUPOgITtyTR4QDiJ6i3FH__byDKZU8Y,4959
454
478
  evalscope/report/report.py,sha256=lEBD_E_RJiydFTaGFNLIMTFxNrqv8QcLZb_iuUg5HB0,8479
455
479
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -494,41 +518,14 @@ evalscope/utils/deprecation_utils.py,sha256=aDv3HFNcJFZ7rxNgALQP0-ITO8L23HC_RX-C
494
518
  evalscope/utils/function_utils.py,sha256=E-AIzx_PKrZDGl1cBvlvqNvMa8yM2WUJ2wh73PNBXrQ,1887
495
519
  evalscope/utils/import_utils.py,sha256=S0WQ3gt4zpwJHjGcyC-604pWWExg3JV7f3wzoOH-tuo,5794
496
520
  evalscope/utils/io_utils.py,sha256=79F0p7dFxA84tIVSL_C4piJgeQQtVUfb2R_Xcd8v_cE,11615
497
- evalscope/utils/json_schema.py,sha256=ZExvQA-SI6SxWBx_hCmuQ2RRqwGKuywy4sTotvd2hH0,8288
521
+ evalscope/utils/json_schema.py,sha256=GVP1m6g4mBrsFmOWOOVnmvl2joOz8gTlGEytLv5qy7s,8451
498
522
  evalscope/utils/logger.py,sha256=roFk4Su4aJwsF0s-uYc5-tABnghwYPX3gpkA5QUGzK8,5675
499
523
  evalscope/utils/model_utils.py,sha256=mdtYoHhUdfpxUtnS52XZjNdO3uSK4yeIBHT3aDU7s-A,2455
500
- evalscope/utils/multi_choices.py,sha256=OxBER7amWpoRY0Z-o39rDmCNK6wpr1HQm9mMHpWLgp0,9524
524
+ evalscope/utils/multi_choices.py,sha256=0UJbgr5eXNgitPC79JLcyUU-OXg9BlM-mVk-fWtUSno,9881
501
525
  evalscope/utils/url_utils.py,sha256=9HcFt9uZNbOJR3ADUFQ_dBFKziHV6H66Df7HYs1M4Po,1757
502
- tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
503
- tests/common.py,sha256=BB136KcGaEfdWqMwApa48K0CTSGmOCUZ0FYDqpfYnAA,2423
504
- tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
505
- tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
506
- tests/benchmark/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
507
- tests/benchmark/test_eval.py,sha256=vSAvhiCKxHpjHdGhZn8l0qzPSiG1ZZafz_M06B_a8_Y,13827
508
- tests/benchmark/test_image_edit.py,sha256=z3z7psMRFynpVgUAFoH--ieeGXzb9cHkrq3tT_sCZo8,2165
509
- tests/benchmark/test_sandbox.py,sha256=bHyX8ammdn7EsEbN80cIzDNhQZlJD3Ssoj9l4efF7rI,2968
510
- tests/benchmark/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
511
- tests/benchmark/test_vlm.py,sha256=gn0ledf_yPY1IhCyCtiqT_dTVPUVZ3NVPr9yzsC_UZQ,4501
512
- tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
513
- tests/cli/test_all.py,sha256=1omOXC1lBphBLm0hTf5HNstlF_bwi16dYyr00gvaCTM,7301
514
- tests/cli/test_collection.py,sha256=lGz3YUS_0gM6_HjQLe26OfBAkHOPOEDWMO-UyP58GN8,4455
515
- tests/cli/test_custom.py,sha256=9z_N7Re712xI62TqVSTBdzB_iFFEUb55wcWIcGvJb84,9254
516
- tests/cli/test_reasoning.py,sha256=rU181LLoKbFCpNPFCIZULxEgsJ2PYswel2pP2EsjEmo,2696
517
- tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
518
- tests/perf/test_perf.py,sha256=ugYNEyU32ctryPFa_6fr8aQYxfHJMymdKnKKEHM9Ajc,6174
519
- tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
520
- tests/rag/test_clip_benchmark.py,sha256=qpSLgmHMGcYTnxP7AI__y-ii5_tu_fCSht6p3TBetkA,2650
521
- tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
522
- tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
523
- tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
524
- tests/swift/test_run_swift_eval.py,sha256=YbIhYNoI4kAB-ox-OXAKUifLIXTFqP-xGZicrAgK_V0,5784
525
- tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4VlwL03atI,4934
526
- tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
527
- tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
528
- tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
529
- evalscope-1.0.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
530
- evalscope-1.0.2.dist-info/METADATA,sha256=vZciS7qNosSJOdwyRSxsCyVqvw8hyqKS84yKjlbxwzw,40305
531
- evalscope-1.0.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
532
- evalscope-1.0.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
533
- evalscope-1.0.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
534
- evalscope-1.0.2.dist-info/RECORD,,
526
+ evalscope-1.1.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
527
+ evalscope-1.1.0.dist-info/METADATA,sha256=pap4NeCTqw7bec2KqYboFj25zabm1m5rwoiqukX8EO4,39544
528
+ evalscope-1.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
529
+ evalscope-1.1.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
530
+ evalscope-1.1.0.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
531
+ evalscope-1.1.0.dist-info/RECORD,,