evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0

evalscope/perf/utils/local_server.py CHANGED
@@ -2,7 +2,6 @@ import os
  import subprocess
  import uvicorn
  from contextlib import asynccontextmanager
- from dataclasses import dataclass
  from fastapi import FastAPI
  from fastapi.middleware.cors import CORSMiddleware
  from sse_starlette.sse import EventSourceResponse
@@ -15,49 +14,6 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()


- @dataclass
- class ServerSentEvent(object):
-
-     def __init__(self, data='', event=None, id=None, retry=None):
-         self.data = data
-         self.event = event
-         self.id = id
-         self.retry = retry
-
-     @classmethod
-     def decode(cls, line):
-         """Decode line to ServerSentEvent
-
-
-         Args:
-             line (str): The line.
-
-         Return:
-             ServerSentEvent (obj:`ServerSentEvent`): The ServerSentEvent object.
-
-         """
-         if not line:
-             return None
-         sse_msg = cls()
-         # format data:xxx
-         field_type, _, field_value = line.partition(':')
-         if field_value.startswith(' '):  # compatible with openai api
-             field_value = field_value[1:]
-         if field_type == 'event':
-             sse_msg.event = field_value
-         elif field_type == 'data':
-             field_value = field_value.rstrip()
-             sse_msg.data = field_value
-         elif field_type == 'id':
-             sse_msg.id = field_value
-         elif field_type == 'retry':
-             sse_msg.retry = field_value
-         else:
-             pass
-
-         return sse_msg
-
-
  @asynccontextmanager
  async def lifespan(app: FastAPI):
      yield

evalscope/perf/utils/log_utils.py CHANGED
@@ -15,29 +15,42 @@ def init_wandb(args: Arguments) -> None:
          raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
      os.environ['WANDB_SILENT'] = 'true'
      os.environ['WANDB_DIR'] = args.outputs_dir
-
-     wandb.login(key=args.wandb_api_key)
      current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
      name = args.name if args.name else f'{args.model_id}_{current_time}'
-     wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+     # Remove sensitive information from logging config
+     logging_config = args.to_dict()
+     logging_config.pop('api_key', None)
+     logging_config.pop('wandb_api_key', None)
+
+     if args.wandb_api_key is not None:
+         wandb.login(key=args.wandb_api_key)
+     wandb.init(project='perf_benchmark', name=name, config=logging_config)


  def init_swanlab(args: Arguments) -> None:
+     """
+     Initialize SwanLab for logging.
+     """
      import datetime
      try:
          import swanlab
      except ImportError:
          raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
      os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
-     if not args.swanlab_api_key == 'local':
-         swanlab.login(api_key=args.swanlab_api_key)
      current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
      name = args.name if args.name else f'{args.model_id}_{current_time}'
      swanlab.config.update({'framework': '📏evalscope'})
+
+     # Remove sensitive information from logging config
+     logging_config = args.to_dict()
+     logging_config.pop('api_key', None)
+     logging_config.pop('swanlab_api_key', None)
+
      init_kwargs = {
          'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
          'name': name,
-         'config': args.to_dict(),
+         'config': logging_config,
          'mode': 'local' if args.swanlab_api_key == 'local' else None
      }

@@ -45,4 +58,6 @@ def init_swanlab(args: Arguments) -> None:
      if workspace:
          init_kwargs['workspace'] = workspace

+     if isinstance(args.swanlab_api_key, str) and not args.swanlab_api_key == 'local':
+         swanlab.login(api_key=args.swanlab_api_key)
      swanlab.init(**init_kwargs)

evalscope/report/__init__.py CHANGED
@@ -4,9 +4,15 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule

  if TYPE_CHECKING:
-     from .combinator import gen_table, get_data_frame, get_report_list
+     from .combinator import (
+         gen_table,
+         get_data_frame,
+         get_report_list,
+         unweighted_average_from_subsets,
+         weighted_average_from_subsets,
+     )
      from .generator import ReportGenerator
-     from .report import Category, Report, ReportKey, Subset
+     from .report import Category, Metric, Report, ReportKey, Subset

  else:
      _import_structure = {
@@ -14,6 +20,8 @@ else:
              'gen_table',
              'get_data_frame',
              'get_report_list',
+             'weighted_average_from_subsets',
+             'unweighted_average_from_subsets',
          ],
          'generator': [
              'ReportGenerator',
@@ -23,6 +31,7 @@ else:
              'Report',
              'ReportKey',
              'Subset',
+             'Metric',
          ],
      }


evalscope/report/combinator.py CHANGED
@@ -4,9 +4,9 @@ import glob
  import os
  import pandas as pd
  from tabulate import tabulate
- from typing import List, Tuple
+ from typing import Dict, List, Tuple, Union

- from evalscope.report.report import Report
+ from evalscope.report.report import Report, Subset
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -86,3 +86,53 @@ def gen_table(
          add_overall_metric=add_overall_metric
      )
      return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
+ def weighted_average_from_subsets(
+     subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+ ) -> Subset:
+     """Calculate weighted average for given subsets.
+
+     Args:
+         subset_names (List[str]): List of subset names to include in the average.
+         subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+         new_name (str): Name for the resulting Subset object.
+
+     Returns:
+         Subset: A new Subset object with weighted average score
+     """
+     total_score = 0
+     total_count = 0
+     for name in subset_names:
+         if name in subset_dict:
+             subset = subset_dict[name]
+             total_score += subset.score * subset.num
+             total_count += subset.num
+
+     weighted_avg = total_score / total_count if total_count > 0 else 0
+     return Subset(name=new_name, score=weighted_avg, num=total_count)
+
+
+ def unweighted_average_from_subsets(
+     subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+ ) -> Subset:
+     """Calculate unweighted average for given subsets.
+
+     Args:
+         subset_names (List[str]): List of subset names to include in the average.
+         subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+         new_name (str): Name for the resulting Subset object.
+
+     Returns:
+         Subset: A new Subset object with unweighted average score
+     """
+     scores = []
+     total_count = 0
+     for name in subset_names:
+         if name in subset_dict:
+             subset = subset_dict[name]
+             scores.append(subset.score)
+             total_count += subset.num
+
+     unweighted_avg = sum(scores) / len(scores) if scores else 0
+     return Subset(name=new_name, score=unweighted_avg, num=total_count)
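
A minimal usage sketch for the two aggregation helpers added above, assuming only what the hunks show (Subset takes name, score and num, and both helpers are re-exported from evalscope.report per the __init__.py change); the subset names and numbers here are hypothetical:

from evalscope.report import Subset, unweighted_average_from_subsets, weighted_average_from_subsets

# Hypothetical per-subset results, keyed by subset name.
subset_dict = {
    'easy': Subset(name='easy', score=0.90, num=100),
    'hard': Subset(name='hard', score=0.50, num=300),
}

# Weighted by sample count: (0.90 * 100 + 0.50 * 300) / 400 = 0.60
weighted = weighted_average_from_subsets(['easy', 'hard'], subset_dict, new_name='overall_weighted')

# Plain mean of subset scores: (0.90 + 0.50) / 2 = 0.70
unweighted = unweighted_average_from_subsets(['easy', 'hard'], subset_dict, new_name='overall_mean')

print(weighted.score, weighted.num)      # 0.6 400
print(unweighted.score, unweighted.num)  # 0.7 400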

evalscope/run.py CHANGED
@@ -38,6 +38,7 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      if task_cfg.eval_backend != EvalBackend.NATIVE:
          result = run_non_native_backend(task_cfg, outputs)
      else:
+         logger.info('Running with native backend')
          result = evaluate_model(task_cfg, outputs)

      logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
@@ -94,12 +95,15 @@ def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> d
  def get_backend_manager_class(eval_backend: EvalBackend):
      """Get the backend manager class based on the evaluation backend."""
      if eval_backend == EvalBackend.OPEN_COMPASS:
+         logger.info('Using OpenCompassBackendManager')
          from evalscope.backend.opencompass import OpenCompassBackendManager
          return OpenCompassBackendManager
      elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+         logger.info('Using VLMEvalKitBackendManager')
          from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
          return VLMEvalKitBackendManager
      elif eval_backend == EvalBackend.RAG_EVAL:
+         logger.info('Using RAGEvalBackendManager')
          from evalscope.backend.rag_eval import RAGEvalBackendManager
          return RAGEvalBackendManager
      elif eval_backend == EvalBackend.THIRD_PARTY:

evalscope/utils/function_utils.py CHANGED
@@ -1,7 +1,50 @@
+ import asyncio
  import threading
  import time
+ from concurrent.futures import ThreadPoolExecutor, wait
  from contextlib import contextmanager
  from functools import wraps
+ from tqdm import tqdm
+ from typing import Any, Awaitable, Callable, List, Optional, Sequence, TypeVar, Union
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ T = TypeVar('T')
+ R = TypeVar('R')
+
+ # Global lock to safely create per-instance locks in decorators
+ _THREAD_SAFE_GLOBAL_LOCK = threading.RLock()
+
+
+ def thread_safe(func: Callable[..., T]) -> Callable[..., T]:
+     """Thread-safe decorator.
+     - If decorating a bound method, uses a per-instance, per-method lock.
+     - If decorating a function, uses a function-scoped lock.
+     """
+     func_lock = threading.RLock()
+     lock_attr_name = f'__lock_{func.__name__}'
+
+     @wraps(func)
+     def wrapper(*args, **kwargs):
+         # Prefer per-instance lock if the first arg looks like 'self'
+         if args and hasattr(args[0], '__dict__'):
+             self_obj = args[0]
+             lock = getattr(self_obj, lock_attr_name, None)
+             if lock is None:
+                 with _THREAD_SAFE_GLOBAL_LOCK:
+                     lock = getattr(self_obj, lock_attr_name, None)
+                     if lock is None:
+                         lock = threading.RLock()
+                         setattr(self_obj, lock_attr_name, lock)
+         else:
+             lock = func_lock
+
+         with lock:
+             return func(*args, **kwargs)
+
+     return wrapper


  def run_once(func):
@@ -19,18 +62,6 @@ def run_once(func):
      return wrapper


- def thread_safe(func):
-     """Thread-safe decorator for functions that need to be executed in a thread-safe manner."""
-     lock = threading.RLock()
-
-     @wraps(func)
-     def wrapper(*args, **kwargs):
-         with lock:
-             return func(*args, **kwargs)
-
-     return wrapper
-
-
  def retry_func(retries=3, sleep_interval=0):
      """A decorator that retries a function call up to `retries` times if an exception occurs."""

@@ -68,3 +99,155 @@ def retry_context(retries=3, sleep_interval=0):
              if attempt == retries - 1:  # Last attempt
                  break
      raise last_exception
+
+
+ class AsyncioLoopRunner:
+     """Singleton background asyncio loop runner for sync→async bridging."""
+     _instance: Optional['AsyncioLoopRunner'] = None
+     _inst_lock = threading.Lock()
+
+     def __init__(self) -> None:
+         self._loop: Optional[asyncio.AbstractEventLoop] = None
+         self._thread: Optional[threading.Thread] = None
+         self._start_loop()
+
+     def _start_loop(self) -> None:
+         loop = asyncio.new_event_loop()
+         self._loop = loop
+
+         def run_loop() -> None:
+             asyncio.set_event_loop(loop)
+             loop.run_forever()
+
+         self._thread = threading.Thread(target=run_loop, daemon=True, name='AsyncioLoopRunner')
+         self._thread.start()
+
+     @classmethod
+     def instance(cls) -> 'AsyncioLoopRunner':
+         if cls._instance is not None:
+             return cls._instance
+         with cls._inst_lock:
+             if cls._instance is None:
+                 cls._instance = AsyncioLoopRunner()
+             return cls._instance
+
+     @classmethod
+     def run(cls, coro: Awaitable[T], timeout: Optional[float] = None) -> T:
+         """Submit a coroutine to the background loop and wait for result."""
+         inst = cls.instance()
+         fut = asyncio.run_coroutine_threadsafe(coro, inst._loop)
+         return fut.result(timeout=timeout)
+
+     @property
+     def loop(self) -> Optional[asyncio.AbstractEventLoop]:
+         """Access the underlying event loop (read-only use)."""
+         return self._loop
+
+     def stop(self, join_timeout: float = 5.0) -> None:
+         """Optional shutdown of the background loop (generally not needed)."""
+         if not self._loop:
+             return
+         self._loop.call_soon_threadsafe(self._loop.stop)
+         if self._thread:
+             self._thread.join(timeout=join_timeout)
+
+
+ def run_in_threads_with_progress(
+     items: Sequence[T],
+     worker: Callable[[T], R],
+     *,
+     desc: str,
+     max_workers: int,
+     heartbeat_sec: int,
+     on_result: Optional[Callable[[T, R], None]] = None,
+     on_error: Optional[Callable[[T, Exception], None]] = None,
+ ) -> List[R]:
+     """
+     Execute a collection of tasks concurrently with a ThreadPoolExecutor while
+     displaying a tqdm progress bar and emitting periodic heartbeat logs.
+
+     Key behaviors:
+     - Concurrency: Uses up to `min(len(items), max_workers)` threads.
+     - Progress: A tqdm bar advances when each task finishes (success or failure).
+     - Heartbeat: If no tasks finish within `heartbeat_sec`, a status line is logged.
+     - Ordering: Results are appended in completion order (not the original order).
+     - Error handling:
+       * If `on_error` is provided, it is called for each failed item; execution continues
+         unless `on_error` itself raises.
+       * If `on_error` is None, the first exception is raised immediately and stops processing.
+     - Callbacks:
+       * `on_result(item, result)` is called after a successful result is obtained.
+       * Both callbacks run in the main thread (not worker threads).
+
+     Args:
+         items: A sequence of items (inputs) to process. Converted to a list internally.
+         worker: A callable executed in threads to process a single item and return a result.
+         desc: A short text shown as the tqdm progress bar description.
+         max_workers: Upper bound on the number of concurrent threads.
+         heartbeat_sec: Interval (in seconds) to wait before emitting a heartbeat log if
+             no tasks complete in that window.
+         on_result: Optional callback invoked as on_result(item, result) after success.
+         on_error: Optional callback invoked as on_error(item, exception) on failure. If omitted,
+             the exception is propagated and the function terminates early.
+
+     Returns:
+         A list of results collected as tasks complete (completion order).
+         If some tasks fail and `on_error` is provided (and does not re-raise), those failures
+         are skipped and not included in the returned results.
+
+     Raises:
+         Exception: Propagates the first task exception if `on_error` is not provided, or if
+             `on_error` re-raises.
+
+     Notes:
+         - The function is blocking until all tasks complete or an exception is propagated.
+         - Use `on_error` to implement "best-effort" processing where failures are logged
+           and the rest continue.
+     """
+     # Defensive copy to avoid consuming a generator multiple times and to compute pool size.
+     pending_items: List[T] = list(items)
+     if not pending_items:
+         return []
+
+     results: List[R] = []
+
+     # Bound the pool by actual workload size for efficiency.
+     with ThreadPoolExecutor(max_workers=min(len(pending_items), max_workers)) as executor:
+         # Submit all tasks up-front and map futures back to their originating item.
+         future_to_item = {executor.submit(worker, item): item for item in pending_items}
+
+         # Progress bar reflects total number of submitted tasks; updated per finished future.
+         with tqdm(total=len(pending_items), desc=desc, mininterval=1, dynamic_ncols=True) as pbar:
+             # Track unfinished futures and poll with a timeout to enable heartbeat logs.
+             pending = set(future_to_item.keys())
+             while pending:
+                 # Wait with timeout to detect stalls and emit heartbeats proactively.
+                 done, not_done = wait(pending, timeout=heartbeat_sec)
+                 if not done:
+                     # Heartbeat when nothing has completed within the window.
+                     logger.info(f'{desc} still processing... pending={len(not_done)}')
+                     continue
+
+                 # Consume completed futures.
+                 for future in done:
+                     item = future_to_item[future]
+                     try:
+                         res = future.result()
+                         results.append(res)
+                         # Invoke success callback in caller thread (not in worker).
+                         if on_result is not None:
+                             on_result(item, res)
+                     except Exception as exc:
+                         # Delegate failure handling to on_error if provided; otherwise bubble up.
+                         if on_error is not None:
+                             on_error(item, exc)
+                         else:
+                             raise
+                     finally:
+                         # Always advance progress for completed futures (success or failure).
+                         pbar.update(1)
+
+                 # Continue polling remaining futures.
+                 pending = not_done
+
+     return results
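
A minimal sketch of how the two new concurrency helpers might be called, based only on the signatures and docstrings in the hunk above; the worker function and the numbers are hypothetical:

import asyncio
from evalscope.utils.function_utils import AsyncioLoopRunner, run_in_threads_with_progress

def score_item(item: int) -> int:
    # Hypothetical worker; any per-item, I/O-bound call fits here.
    return item * item

failures = []
results = run_in_threads_with_progress(
    items=list(range(10)),
    worker=score_item,
    desc='scoring',
    max_workers=4,
    heartbeat_sec=30,
    on_error=lambda item, exc: failures.append((item, exc)),  # best-effort: record and continue
)
print(sorted(results))  # results arrive in completion order, so sort if order matters

async def fetch_answer() -> str:
    await asyncio.sleep(0.1)
    return 'ok'

# Sync-to-async bridging: run a coroutine on the shared background loop.
print(AsyncioLoopRunner.run(fetch_answer(), timeout=5))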

evalscope/utils/io_utils.py CHANGED
@@ -12,6 +12,7 @@ import yaml
  from datetime import datetime
  from io import BytesIO
  from PIL import Image
+ from typing import Tuple

  from evalscope.constants import DumpMode
  from evalscope.utils.logger import get_logger
@@ -173,6 +174,24 @@ def csv_to_list(csv_file) -> list:
      return res_list


+ def tsv_to_list(tsv_file) -> list:
+     """
+     Read tsv file to list.
+
+     Args:
+         tsv_file: tsv file path.
+
+     Returns:
+         list: list of lines. Each line is a dict.
+     """
+     res_list = []
+     with open(tsv_file, 'r', encoding='utf-8') as f:
+         reader = csv.DictReader(f, delimiter='\t')
+         for row in reader:
+             res_list.append(row)
+     return res_list
+
+
  def csv_to_jsonl(csv_file, jsonl_file):
      """
      Convert csv file to jsonl file.
@@ -420,3 +439,58 @@ def convert_normal_types(obj):
          return tuple(convert_normal_types(item) for item in obj)
      else:
          return obj
+
+
+ def compress_image_to_limit(image_bytes: bytes, max_bytes: int = 10_000_000) -> Tuple[bytes, str]:
+     """
+     Ensure image bytes are under max_bytes by re-encoding to JPEG with quality reduction
+     and optional downscaling. Returns (processed_bytes, format_str).
+     If the original bytes are already below the limit, returns them as PNG.
+     """
+     if len(image_bytes) <= max_bytes:
+         return image_bytes, 'png'
+
+     try:
+         img = Image.open(BytesIO(image_bytes))
+     except Exception as exc:
+         logger.warning(f'Failed to open image bytes with PIL, sending original image; may exceed API limit: {exc}')
+         return image_bytes, 'png'
+
+     # Convert to RGB for JPEG if needed
+     if img.mode not in ('RGB', 'L'):
+         img = img.convert('RGB')
+
+     def encode_jpeg(source: Image.Image, quality: int) -> bytes:
+         buf = BytesIO()
+         source.save(buf, format='JPEG', quality=quality, optimize=True, progressive=True)
+         return buf.getvalue()
+
+     # Start with moderate quality and reduce
+     quality: int = 85
+     out: bytes = encode_jpeg(img, quality)
+     quality_floor: int = 40
+
+     while len(out) > max_bytes and quality > quality_floor:
+         quality -= 10
+         out = encode_jpeg(img, quality)
+
+     # If still too large, progressively downscale
+     min_side_floor: int = 256
+     scale: float = 0.9
+     while len(out) > max_bytes and min(img.size) > min_side_floor:
+         new_w = max(min_side_floor, int(img.width * scale))
+         new_h = max(min_side_floor, int(img.height * scale))
+         if (new_w, new_h) == img.size:
+             break
+         img = img.resize((new_w, new_h), Image.LANCZOS)
+         out = encode_jpeg(img, quality)
+
+     if len(out) > max_bytes:
+         logger.warning(f'Image remains above limit after compression: size={len(out)} bytes (limit={max_bytes}).')
+     else:
+         logger.info(
+             f'Compressed image from {len(image_bytes)} to {len(out)} bytes; '
+             f'quality={quality}, size={img.width}x{img.height}.'
+         )
+
+     return out, 'jpeg'
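
A minimal sketch of calling the new compress_image_to_limit helper before sending an image to an API with a payload cap, based on the signature above; the file name and the base64/data-URL packaging are hypothetical, not how evalscope itself wires it up:

import base64
from evalscope.utils.io_utils import compress_image_to_limit

with open('sample.png', 'rb') as f:  # hypothetical local image
    raw = f.read()

# Returns the original bytes tagged 'png' if already small enough,
# otherwise JPEG-re-encoded (and possibly downscaled) bytes tagged 'jpeg'.
data, fmt = compress_image_to_limit(raw, max_bytes=10_000_000)
data_url = f'data:image/{fmt};base64,{base64.b64encode(data).decode()}'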

evalscope/utils/json_schema.py CHANGED
@@ -59,18 +59,20 @@ class JSONSchema(BaseModel):
      required: Optional[List[str]] = Field(default=None)
      """Required fields for object parameters."""

-     @field_validator('type')
-     def validate_type(cls, v: Optional[str]) -> Optional[JSONType]:
-         return python_type_to_json_type(v)
-
      @model_validator(mode='before')
      def convert_type_before_validation(cls, values):
          values = deepcopy(values)

          def recursive_convert_type(obj):
              if isinstance(obj, dict):
-                 if 'type' in obj:
-                     obj['type'] = python_type_to_json_type(obj['type'])
+                 # Convert 'type' field if it's a string
+                 if 'type' in obj and isinstance(obj['type'], str):
+                     try:
+                         obj['type'] = python_type_to_json_type(obj['type'])
+                     except ValueError:
+                         # If conversion fails, leave it as is
+                         pass
+                 # Recursively process nested structures
                  for k, v in obj.items():
                      obj[k] = recursive_convert_type(v)
              elif isinstance(obj, list):

evalscope/utils/logger.py CHANGED
@@ -53,16 +53,16 @@ def get_logger(
      name: Optional[str] = None,
      log_level: int = DEFAULT_LEVEL,
      file_mode: str = 'w',
-     force=False
+     force: bool = False,
  ):
      """Get logging logger

      Args:
-         log_file: Log filename, if specified, file handler will be added to
-             logger
-         log_level: Logging level.
-         file_mode: Specifies the mode to open the file, if filename is
-             specified (if filemode is unspecified, it defaults to 'w').
+         log_file: Log filename. If specified, a file handler will be added to the logger.
+         name: Logical component name. Used to derive the logger name.
+         log_level: Logging level to set.
+         file_mode: Mode to open the file when log_file is provided (default 'w').
+         force: If True, reconfigure the existing logger (levels, formatters, handlers).
      """

      if name:
@@ -77,7 +77,7 @@ def get_logger(
      logger.setLevel(log_level)
      for handler in logger.handlers:
          handler.setLevel(log_level)
-         # 区分不同类型的 handler,使用相应的格式化器
+         # Select formatter by handler type
          if isinstance(handler, logging.FileHandler):
              handler.setFormatter(
                  plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
@@ -86,6 +86,7 @@ def get_logger(
              handler.setFormatter(
                  color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
              )
+     # Ensure file handler points to current log_file (replace if needed)
      add_file_handler_if_needed(logger, log_file, file_mode, log_level)
      return logger

@@ -137,23 +138,54 @@ def configure_logging(debug: bool, log_file: Optional[str] = None):
          get_logger(log_level=logging.DEBUG, force=True)


- def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
-     for handler in logger.handlers:
-         if isinstance(handler, logging.FileHandler):
-             return
+ def add_file_handler_if_needed(
+     logger: logging.Logger,
+     log_file: Optional[str],
+     file_mode: str,
+     log_level: int,
+ ) -> None:
+     """Ensure logger has a FileHandler targeting log_file.
+     - If no FileHandler exists, add one.
+     - If a FileHandler exists but points to a different file, replace it.
+     """
+     if log_file is None:
+         return

+     # Only worker-0 writes files
      if iutil.find_spec('torch') is not None:
          from modelscope.utils.torch_utils import is_master
-
          is_worker0 = is_master()
      else:
          is_worker0 = True

-     if is_worker0 and log_file is not None:
-         file_handler = logging.FileHandler(log_file, file_mode)
-         file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
-         file_handler.setLevel(log_level)
-         logger.addHandler(file_handler)
+     if not is_worker0:
+         return
+
+     target_path = os.path.abspath(log_file)
+     existing_file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
+
+     # If there is a FileHandler already pointing to the target file, nothing to do.
+     for fh in existing_file_handlers:
+         try:
+             if os.path.abspath(getattr(fh, 'baseFilename', '')) == target_path:
+                 return
+         except Exception:
+             # If any issue retrieving baseFilename, fall through to replacement
+             pass
+
+     # Replace all existing FileHandlers with the new one
+     for fh in existing_file_handlers:
+         try:
+             logger.removeHandler(fh)
+             fh.flush()
+             fh.close()
+         except Exception:
+             pass
+
+     file_handler = logging.FileHandler(target_path, file_mode)
+     file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+     file_handler.setLevel(log_level)
+     logger.addHandler(file_handler)


  def warn_once(logger: Logger, message: str) -> None:
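
A minimal sketch of the reconfiguration path these logger changes enable, using keyword arguments taken from the updated signature and docstring; the logger name and log path are hypothetical:

import logging
from evalscope.utils.logger import get_logger

logger = get_logger(name='demo')  # console handlers only
logger.info('hello')

# force=True re-applies levels/formatters; with log_file set, the reworked
# add_file_handler_if_needed swaps in a FileHandler pointing at that file.
logger = get_logger(name='demo', log_file='run.log', log_level=logging.DEBUG, force=True)
logger.debug('also written to run.log')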