evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/run.py
CHANGED
@@ -13,9 +13,6 @@ from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.model_utils import seed_everything
 
-if TYPE_CHECKING:
-    from evalscope.models import LocalModel
-
 logger = get_logger()
 
 
@@ -109,27 +106,42 @@ def get_backend_manager_class(eval_backend: EvalBackend):
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
 
 
-def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
+def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
-    from evalscope.
+    from evalscope.api.evaluator import Evaluator
+    from evalscope.api.model import get_model_with_task_config
+    from evalscope.api.registry import get_benchmark
+    from evalscope.evaluator import DefaultEvaluator
     from evalscope.report import gen_table
 
     # Initialize evaluator
     eval_results = {}
-
-
-    for
-
+    # Initialize model
+    model = get_model_with_task_config(task_config=task_config)
+    # Initialize evaluators for each dataset
+    evaluators: List[Evaluator] = []
+    for dataset_name in task_config.datasets:
+        # Create evaluator for each dataset
+        benchmark = get_benchmark(dataset_name, task_config)
+        evaluator = DefaultEvaluator(
+            task_config=task_config,
+            model=model,
+            benchmark=benchmark,
+            outputs=outputs,
+        )
         evaluators.append(evaluator)
 
+        # Update task_config.dataset_args with benchmark metadata
+        task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
     # dump task_cfg to outputs.configs_dir after creating evaluators
-
-    logger.info(
+    task_config.dump_yaml(outputs.configs_dir)
+    logger.info(task_config)
 
     # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
-        eval_results[evaluator.
+        eval_results[evaluator.benchmark.name] = res_dict
 
     # Make overall report
     try:
@@ -139,11 +151,11 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.error('Failed to generate report table.')
 
     # Clean up
-    if
+    if model is not None:
         import gc
         import torch
 
-        del
+        del model
         del evaluators
         torch.cuda.empty_cache()
         gc.collect()
@@ -151,36 +163,6 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
-    """Create an evaluator object for the specified dataset."""
-    from evalscope.benchmarks import Benchmark, BenchmarkMeta
-    from evalscope.evaluator import Evaluator
-    from evalscope.models import initialize_model_adapter
-
-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    if dataset_name == DataCollection.NAME:
-        # EvaluatorCollection is a collection of evaluators
-        from evalscope.collections import EvaluatorCollection
-        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
-
-    # Initialize data adapter first to update config
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-    # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
-
-    # update task_cfg.dataset_args
-    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-    return Evaluator(
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        outputs=outputs,
-        task_cfg=task_cfg,
-    )
-
-
 def main():
     from evalscope.arguments import parse_args
     args = parse_args()
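Usage note (not part of the diff): the reworked evaluate_model is driven entirely by a TaskConfig rather than per-dataset evaluator factories. A minimal sketch, assuming the top-level TaskConfig/run_task entry points keep their current names; the model id and dataset names are placeholders:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k', 'arc'],           # each name is resolved via get_benchmark()
    limit=5,                             # evaluate only a few samples
)
run_task(task_cfg)  # builds one model plus one DefaultEvaluator per dataset, as in evaluate_model above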
evalscope/summarizer.py
CHANGED
@@ -80,7 +80,7 @@ class Summarizer:
 
                 summary_file_path = summary_files[0]
                 # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}  # noqa: E501
-                summary_res: List[dict] = csv_to_list(
+                summary_res: List[dict] = csv_to_list(summary_file_path)
                 final_res_list.extend(summary_res)
             elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
evalscope/utils/__init__.py
CHANGED
@@ -7,9 +7,22 @@ from .import_utils import _LazyModule
 if TYPE_CHECKING:
     from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
     from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
     from .import_utils import get_module_path, is_module_installed
-    from .io_utils import (
-
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
     from .logger import configure_logging, get_logger
     from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
 
@@ -31,6 +44,10 @@ else:
             'is_module_installed',
             'get_module_path',
         ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
         'io_utils': [
             'OutputsStructure',
             'csv_to_list',
@@ -44,6 +61,8 @@ else:
             'jsonl_to_list',
             'gen_hash',
             'get_valid_list',
+            'safe_filename',
+            'thread_safe',
         ],
         'deprecation_utils': [
             'deprecated',
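For illustration (not part of the diff): the lazy-import table above only changes what evalscope.utils re-exports, so the new helpers can be imported either from the package or from their defining modules. A quick sanity check, assuming the _LazyModule mechanism is otherwise unchanged:

from evalscope.utils import run_once, safe_filename, thread_safe
from evalscope.utils.function_utils import run_once as run_once_direct

assert run_once is run_once_direct  # the lazy table resolves to the same function object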
evalscope/utils/chat_service.py
CHANGED
@@ -204,7 +204,8 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
evalscope/utils/deprecation_utils.py
CHANGED
@@ -1,5 +1,6 @@
 import functools
 import inspect
+import os
 from typing import Callable, Optional
 
 from .logger import get_logger
@@ -22,7 +23,7 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             # Get the file name where the function is defined
-            file_name = inspect.getfile(func)
+            file_name = os.path.basename(inspect.getfile(func))
 
             # Construct the warning message
             warning_parts = [
@@ -40,3 +41,13 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         return wrapper
 
     return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
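Usage sketch (not part of the diff): the keyword names follow the signature visible in the hunk headers above; old_tokenize is a made-up function used only for illustration:

from evalscope.utils.deprecation_utils import deprecated, deprecated_warning
from evalscope.utils.logger import get_logger

logger = get_logger()


@deprecated(since='0.17.1', remove_in='1.0.0', alternative='new_tokenize')
def old_tokenize(text):  # hypothetical function, for illustration only
    return text.split()


old_tokenize('hello world')  # emits a deprecation warning that now reports only the base file name
deprecated_warning(logger, 'old_tokenize is deprecated, use new_tokenize')  # logs 'Deprecated: ...'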
evalscope/utils/function_utils.py
ADDED
@@ -0,0 +1,29 @@
+import threading
+from functools import wraps
+
+
+def run_once(func):
+    """Decorator to ensure a function is only run once."""
+    has_run = False
+    result = None
+
+    def wrapper(*args, **kwargs):
+        nonlocal has_run, result
+        if not has_run:
+            result = func(*args, **kwargs)
+            has_run = True
+        return result
+
+    return wrapper
+
+
+def thread_safe(func):
+    """Thread-safe decorator for functions that need to be executed in a thread-safe manner."""
+    lock = threading.RLock()
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        with lock:
+            return func(*args, **kwargs)
+
+    return wrapper
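Usage sketch (not part of the diff): both decorators are self-contained, so their behaviour follows directly from the code above:

from evalscope.utils.function_utils import run_once, thread_safe


@run_once
def load_config():
    print('loading config')  # printed only on the first call
    return {'seed': 42}


@thread_safe
def record(results, item):  # illustrative helper; the RLock serialises concurrent calls
    results.append(item)


cfg_a = load_config()  # prints 'loading config'
cfg_b = load_config()  # silent; the cached result is returned
assert cfg_a is cfg_b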
evalscope/utils/io_utils.py
CHANGED
@@ -5,6 +5,8 @@ import json
 import jsonlines as jsonl
 import os
 import re
+import string
+import unicodedata
 import yaml
 from io import BytesIO
 from PIL import Image
@@ -33,7 +35,7 @@ class OutputsStructure:
             'configs_dir': None
         }
 
-    def _get_dir(self, attr_name, dir_name):
+    def _get_dir(self, attr_name, dir_name) -> str:
         if self._dirs[attr_name] is None:
             dir_path = os.path.join(self.outputs_dir, dir_name)
             if self.is_make:
@@ -72,10 +74,20 @@ def jsonl_to_list(jsonl_file):
     Returns:
         list: list of lines. Each line is a dict.
     """
-
-
-
-
+    try:
+        res_list = []
+        with jsonl.open(jsonl_file, mode='r') as reader:
+            for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+                res_list.append(line)
+    except Exception:
+        # Fallback to reading line by line
+        res_list = []
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():  # Skip empty lines
+                    res_list.append(json.loads(line.strip()))
+    if not res_list:
+        logger.warning(f'No data found in {jsonl_file}.')
     return res_list
 
 
@@ -272,7 +284,90 @@ def get_valid_list(input_list, candidate_list):
 
 
 def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    """
+    Convert a PIL Image to a base64 encoded string.
+
+    Args:
+        image (Image.Image): The PIL Image to convert.
+        format (str): The format to save the image in. Default is 'JPEG'.
+    Returns:
+        str: Base64 encoded string of the image.
+    """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
     return img_str
+
+
+def safe_filename(s: str, max_length: int = 255) -> str:
+    """
+    Convert a string into a safe filename by removing or replacing unsafe characters.
+
+    Args:
+        s (str): The input string to convert
+        max_length (int): Maximum length of the resulting filename (default 255)
+
+    Returns:
+        str: A safe filename string
+
+    Examples:
+        >>> safe_filename("Hello/World?.txt")
+        'Hello_World.txt'
+    """
+    # normalize unicode characters
+    s = unicodedata.normalize('NFKD', s)
+    s = s.encode('ASCII', 'ignore').decode('ASCII')
+
+    # remove or replace unsafe characters
+    # Keep only alphanumeric characters, dots, dashes, and underscores
+    safe_chars = string.ascii_letters + string.digits + '.-_'
+    s = ''.join(c if c in safe_chars else '_' for c in s)
+
+    # remove consecutive underscores
+    s = re.sub(r'_+', '_', s)
+
+    # remove leading/trailing periods and underscores
+    s = s.strip('._')
+
+    # handle empty string case
+    if not s:
+        s = 'untitled'
+
+    # handle starting with a period (hidden files)
+    if s.startswith('.'):
+        s = '_' + s
+
+    # enforce length limit
+    if len(s) > max_length:
+        # If we need to truncate, preserve the file extension if present
+        name, ext = os.path.splitext(s)
+        ext_len = len(ext)
+        if ext_len > 0:
+            max_name_length = max_length - ext_len
+            s = name[:max_name_length] + ext
+        else:
+            s = s[:max_length]
+
+    return s
+
+
+def convert_numpy_types(obj):
+    """Recursively convert numpy types to native Python types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, np.bool_):
+        return bool(obj)
+    elif isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_numpy_types(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_numpy_types(item) for item in obj)
+    else:
+        return obj
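Behaviour sketch (not part of the diff): the two new helpers are pure functions, so the outputs below follow directly from the code above:

import json

import numpy as np

from evalscope.utils.io_utils import convert_numpy_types, safe_filename

print(safe_filename('gsm8k: run #1'))  # -> 'gsm8k_run_1' (unsafe characters collapsed to single underscores)

scores = {'acc': np.float64(0.85), 'n': np.int64(3), 'mask': np.array([True, False])}
print(json.dumps(convert_numpy_types(scores)))  # numpy scalars and arrays become JSON-serialisable Python types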
evalscope/utils/json_schema.py
ADDED
@@ -0,0 +1,208 @@
+import types
+import typing
+from copy import deepcopy
+from dataclasses import is_dataclass
+from datetime import date, datetime, time
+from enum import EnumMeta
+from pydantic import BaseModel, Field
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    get_args,
+    get_origin,
+    get_type_hints,
+    is_typeddict,
+)
+
+JSONType = Literal['string', 'integer', 'number', 'boolean', 'array', 'object', 'null']
+"""Valid types within JSON schema."""
+
+
+class JSONSchema(BaseModel):
+    """JSON Schema for type."""
+
+    type: Optional[JSONType] = Field(default=None)
+    """JSON type of tool parameter."""
+
+    format: Optional[str] = Field(default=None)
+    """Format of the parameter (e.g. date-time)."""
+
+    description: Optional[str] = Field(default=None)
+    """Parameter description."""
+
+    default: Any = Field(default=None)
+    """Default value for parameter."""
+
+    enum: Optional[List[Any]] = Field(default=None)
+    """Valid values for enum parameters."""
+
+    items: Optional['JSONSchema'] = Field(default=None)
+    """Valid type for array parameters."""
+
+    properties: Optional[Dict[str, 'JSONSchema']] = Field(default=None)
+    """Valid fields for object parametrs."""
+
+    additionalProperties: Optional[Union['JSONSchema', bool]] = Field(default=None)
+    """Are additional properties allowed?"""
+
+    anyOf: Optional[List['JSONSchema']] = Field(default=None)
+    """Valid types for union parameters."""
+
+    required: Optional[List[str]] = Field(default=None)
+    """Required fields for object parameters."""
+
+
+def json_schema(t: Type[Any]) -> JSONSchema:
+    """Provide a JSON Schema for the specified type.
+
+    Schemas can be automatically inferred for a wide variety of
+    Python class types including Pydantic BaseModel, dataclasses,
+    and typed dicts.
+
+    Args:
+        t: Python type
+
+    Returns:
+        JSON Schema for type.
+    """
+    origin = get_origin(t)
+    args = get_args(t)
+
+    if origin is None:
+        if t is int:
+            return JSONSchema(type='integer')
+        elif t is float:
+            return JSONSchema(type='number')
+        elif t is str:
+            return JSONSchema(type='string')
+        elif t is bool:
+            return JSONSchema(type='boolean')
+        elif t is datetime:
+            return JSONSchema(type='string', format='date-time')
+        elif t is date:
+            return JSONSchema(type='string', format='date')
+        elif t is time:
+            return JSONSchema(type='string', format='time')
+        elif t is list or t is set:
+            return JSONSchema(type='array', items=JSONSchema())
+        elif t is dict:
+            return JSONSchema(type='object', additionalProperties=JSONSchema())
+        elif (is_dataclass(t) or is_typeddict(t) or (isinstance(t, type) and issubclass(t, BaseModel))):
+            return cls_json_schema(t)
+        elif isinstance(t, EnumMeta):
+            return JSONSchema(enum=[item.value for item in t])
+        elif t is type(None):
+            return JSONSchema(type='null')
+        else:
+            return JSONSchema()
+    elif (origin is list or origin is List or origin is tuple or origin is Tuple or origin is set or origin is Set):
+        return JSONSchema(type='array', items=json_schema(args[0]) if args else JSONSchema())
+    elif origin is dict or origin is Dict:
+        return JSONSchema(
+            type='object',
+            additionalProperties=json_schema(args[1]) if len(args) > 1 else JSONSchema(),
+        )
+    elif origin is Union or origin is types.UnionType:
+        return JSONSchema(anyOf=[json_schema(arg) for arg in args])
+    elif origin is Optional:
+        return JSONSchema(anyOf=[json_schema(arg) for arg in args] + [JSONSchema(type='null')])
+    elif origin is typing.Literal:
+        return JSONSchema(enum=list(args))
+
+    return JSONSchema()  # Default case if we can't determine the type
+
+
+def cls_json_schema(cls: Type[Any]) -> JSONSchema:
+    properties: Dict[str, JSONSchema] = {}
+    required: List[str] = []
+
+    if is_dataclass(cls):
+        fields = cls.__dataclass_fields__  # type: ignore
+        for name, field in fields.items():
+            properties[name] = json_schema(field.type)  # type: ignore
+            if field.default == field.default_factory:
+                required.append(name)
+    elif isinstance(cls, type) and issubclass(cls, BaseModel):
+        schema = cls.model_json_schema()
+        schema = resolve_schema_references(schema)
+        for name, prop in schema.get('properties', {}).items():
+            properties[name] = JSONSchema(**prop)
+        required = schema.get('required', [])
+    elif is_typeddict(cls):
+        annotations = get_type_hints(cls)
+        for name, type_hint in annotations.items():
+            properties[name] = json_schema(type_hint)
+            if name in cls.__required_keys__:
+                required.append(name)
+
+    return JSONSchema(
+        type='object',
+        properties=properties,
+        required=required if required else None,
+        additionalProperties=False,
+    )
+
+
+def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type == 'str':
+        return 'string'
+    elif python_type == 'int':
+        return 'integer'
+    elif python_type == 'float':
+        return 'number'
+    elif python_type == 'bool':
+        return 'boolean'
+    elif python_type == 'list':
+        return 'array'
+    elif python_type == 'dict':
+        return 'object'
+    elif python_type == 'None':
+        return 'null'
+    elif python_type is None:
+        # treat 'unknown' as string as anything can be converted to string
+        return 'string'
+    else:
+        raise ValueError(f'Unsupported type: {python_type} for Python to JSON conversion.')
+
+
+def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
+    """Resolves all $ref references in a JSON schema by inlining the definitions."""
+    schema = deepcopy(schema)
+    definitions = schema.pop('$defs', {})
+
+    def _resolve_refs(obj: Any) -> Any:
+        if isinstance(obj, dict):
+            if '$ref' in obj and obj['$ref'].startswith('#/$defs/'):
+                ref_key = obj['$ref'].split('/')[-1]
+                if ref_key in definitions:
+                    # Replace with a deep copy of the definition
+                    resolved = deepcopy(definitions[ref_key])
+                    # Process any nested references in the definition
+                    resolved = _resolve_refs(resolved)
+
+                    # Merge in the current object fields, which should take priority
+                    # This means that if you have e.g.
+                    # {"$ref": "#/$defs/SubType", "description": "subtype of type SubType"},
+                    # and SubType resolves to
+                    # {"description": "The SubType Class", "parameters": {"param1": {"type": "string"}}},
+                    # the final result will be:
+                    # {"description": "subtype of type SubType", "parameters": {"param1": {"type": "string"}}}
+                    return resolved | {k: o for k, o in obj.items() if k != '$ref'}
+
+            # Process all entries in the dictionary
+            return {k: _resolve_refs(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [_resolve_refs(item) for item in obj]
+        else:
+            return obj
+
+    return cast(Dict[str, Any], _resolve_refs(schema))
evalscope/utils/logger.py
CHANGED
@@ -1,18 +1,27 @@
+import colorlog
 import importlib.util as iutil
 import logging
 import os
-from
+from logging import Logger
+from typing import List, Optional
 
 init_loggers = {}
+# Define log formats
+data_format = '%Y-%m-%d %H:%M:%S'
+# For console output
+color_detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(log_color)s%(levelname)s%(reset)s: %(message)s'  # noqa:E501
+color_simple_format = '%(asctime)s - %(name)s - %(log_color)s%(levelname)s%(reset)s: %(message)s'
+color_detailed_formatter = colorlog.ColoredFormatter(color_detailed_format, datefmt=data_format)
+color_simple_formatter = colorlog.ColoredFormatter(color_simple_format, datefmt=data_format)
+# For file output
+detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s: %(message)s'  # noqa:E501
+simple_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
+plain_detailed_formatter = logging.Formatter(detailed_format, datefmt=data_format)
+plain_simple_formatter = logging.Formatter(simple_format, datefmt=data_format)
 
-detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s - %(message)s'
-simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-
-detailed_formatter = logging.Formatter(detailed_format)
-simple_formatter = logging.Formatter(simple_format)
 DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
-logging.basicConfig(format=simple_format, level=
+logging.basicConfig(format=simple_format, level=logging.INFO, force=True)
 
 # set logging level
 logging.getLogger('datasets').setLevel(logging.WARNING)
@@ -20,7 +29,13 @@ logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
 
-def get_logger(
+def get_logger(
+    log_file: Optional[str] = None,
+    name: Optional[str] = None,
+    log_level: int = DEFAULT_LEVEL,
+    file_mode: str = 'w',
+    force=False
+):
     """Get logging logger
 
     Args:
@@ -31,7 +46,10 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
             specified (if filemode is unspecified, it defaults to 'w').
     """
 
-
+    if name:
+        logger_name = f"evalscope.{name.split('.')[-1]}"
+    else:
+        logger_name = 'evalscope'
     logger = logging.getLogger(logger_name)
     logger.propagate = False
 
@@ -40,7 +58,15 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
         logger.setLevel(log_level)
         for handler in logger.handlers:
             handler.setLevel(log_level)
-
+            # Use the appropriate formatter for each handler type
+            if isinstance(handler, logging.FileHandler):
+                handler.setFormatter(
+                    plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
+                )
+            else:
+                handler.setFormatter(
+                    color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
+                )
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
         return logger
 
@@ -66,7 +92,11 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
         handlers.append(file_handler)
 
     for handler in handlers:
-
+        # Use the appropriate formatter for each handler type
+        if isinstance(handler, logging.FileHandler):
+            handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+        else:
+            handler.setFormatter(color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter)
         handler.setLevel(log_level)
         logger.addHandler(handler)
 
@@ -102,6 +132,15 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
 
     if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
-        file_handler.setFormatter(
+        file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
         file_handler.setLevel(log_level)
         logger.addHandler(file_handler)
+
+
+def warn_once(logger: Logger, message: str) -> None:
+    if message not in _warned:
+        logger.warning(message)
+        _warned.append(message)
+
+
+_warned: List[str] = []
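Usage sketch (not part of the diff): the extended get_logger signature namespaces child loggers under 'evalscope.<name>' and applies colorised console formatters versus plain file formatters, while warn_once deduplicates repeated messages. The log file path below is a placeholder:

from evalscope.utils.logger import get_logger, warn_once

logger = get_logger(name=__name__, log_file='outputs/eval.log')  # placeholder path
logger.info('starting evaluation')

warn_once(logger, 'do_sample is ignored for this backend')  # logged
warn_once(logger, 'do_sample is ignored for this backend')  # suppressed on repeat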