evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/run.py
CHANGED
@@ -13,9 +13,6 @@ from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.model_utils import seed_everything
 
-if TYPE_CHECKING:
-    from evalscope.models import LocalModel
-
 logger = get_logger()
 
 
@@ -109,27 +106,42 @@ def get_backend_manager_class(eval_backend: EvalBackend):
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
 
 
-def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
+def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
-    from evalscope.
+    from evalscope.api.evaluator import Evaluator
+    from evalscope.api.model import get_model_with_task_config
+    from evalscope.api.registry import get_benchmark
+    from evalscope.evaluator import DefaultEvaluator
     from evalscope.report import gen_table
 
     # Initialize evaluator
     eval_results = {}
-
-
-    for
-
+    # Initialize model
+    model = get_model_with_task_config(task_config=task_config)
+    # Initialize evaluators for each dataset
+    evaluators: List[Evaluator] = []
+    for dataset_name in task_config.datasets:
+        # Create evaluator for each dataset
+        benchmark = get_benchmark(dataset_name, task_config)
+        evaluator = DefaultEvaluator(
+            task_config=task_config,
+            model=model,
+            benchmark=benchmark,
+            outputs=outputs,
+        )
         evaluators.append(evaluator)
 
+        # Update task_config.dataset_args with benchmark metadata
+        task_config.dataset_args[dataset_name] = benchmark.to_dict()
+
     # dump task_cfg to outputs.configs_dir after creating evaluators
-
-    logger.info(
+    task_config.dump_yaml(outputs.configs_dir)
+    logger.info(task_config)
 
     # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
-        eval_results[evaluator.
+        eval_results[evaluator.benchmark.name] = res_dict
 
     # Make overall report
     try:
@@ -139,11 +151,11 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
         logger.error('Failed to generate report table.')
 
     # Clean up
-    if
+    if model is not None:
         import gc
         import torch
 
-        del
+        del model
         del evaluators
         torch.cuda.empty_cache()
         gc.collect()
@@ -151,36 +163,6 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: 'LocalModel'):
-    """Create an evaluator object for the specified dataset."""
-    from evalscope.benchmarks import Benchmark, BenchmarkMeta
-    from evalscope.evaluator import Evaluator
-    from evalscope.models import initialize_model_adapter
-
-    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-
-    if dataset_name == DataCollection.NAME:
-        # EvaluatorCollection is a collection of evaluators
-        from evalscope.collections import EvaluatorCollection
-        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
-
-    # Initialize data adapter first to update config
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
-    # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
-
-    # update task_cfg.dataset_args
-    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
-
-    return Evaluator(
-        data_adapter=data_adapter,
-        model_adapter=model_adapter,
-        outputs=outputs,
-        task_cfg=task_cfg,
-    )
-
-
 def main():
     from evalscope.arguments import parse_args
     args = parse_args()
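Usage note (not part of the diff): the reworked evaluate_model is driven entirely by a TaskConfig rather than per-dataset evaluator factories. A minimal sketch, assuming the top-level TaskConfig/run_task entry points keep their current names; the model id and dataset names are placeholders:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k', 'arc'],           # each name is resolved via get_benchmark()
    limit=5,                             # evaluate only a few samples
)
run_task(task_cfg)  # builds one model plus one DefaultEvaluator per dataset, as in evaluate_model above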
evalscope/summarizer.py
CHANGED
@@ -80,7 +80,7 @@ class Summarizer:
 
                 summary_file_path = summary_files[0]
                 # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}  # noqa: E501
-                summary_res: List[dict] = csv_to_list(
+                summary_res: List[dict] = csv_to_list(summary_file_path)
                 final_res_list.extend(summary_res)
             elif eval_backend == EvalBackend.VLM_EVAL_KIT:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
evalscope/utils/__init__.py
CHANGED
@@ -7,9 +7,22 @@ from .import_utils import _LazyModule
 if TYPE_CHECKING:
     from .argument_utils import BaseArgument, get_supported_params, parse_int_or_float
     from .deprecation_utils import deprecated
+    from .function_utils import run_once, thread_safe
     from .import_utils import get_module_path, is_module_installed
-    from .io_utils import (
-
+    from .io_utils import (
+        OutputsStructure,
+        csv_to_jsonl,
+        csv_to_list,
+        dict_to_yaml,
+        gen_hash,
+        get_latest_folder_path,
+        get_valid_list,
+        json_to_dict,
+        jsonl_to_csv,
+        jsonl_to_list,
+        safe_filename,
+        yaml_to_dict,
+    )
     from .logger import configure_logging, get_logger
     from .model_utils import EvalBackend, dict_torch_dtype_to_str, fix_do_sample_warning, get_device, seed_everything
 
@@ -31,6 +44,10 @@ else:
             'is_module_installed',
             'get_module_path',
         ],
+        'function_utils': [
+            'thread_safe',
+            'run_once',
+        ],
         'io_utils': [
             'OutputsStructure',
             'csv_to_list',
@@ -44,6 +61,8 @@ else:
             'jsonl_to_list',
             'gen_hash',
             'get_valid_list',
+            'safe_filename',
+            'thread_safe',
         ],
         'deprecation_utils': [
             'deprecated',
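For illustration (not part of the diff): the lazy-import table above only changes what evalscope.utils re-exports, so the new helpers can be imported either from the package or from their defining modules. A quick sanity check, assuming the _LazyModule mechanism is otherwise unchanged:

from evalscope.utils import run_once, safe_filename, thread_safe
from evalscope.utils.function_utils import run_once as run_once_direct

assert run_once is run_once_direct  # the lazy table resolves to the same function object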
evalscope/utils/chat_service.py
CHANGED
@@ -204,7 +204,8 @@ class ChatService:
 
     def _prepare_chat_inputs(self, request: ChatCompletionRequest):
         formatted_prompt = self.tokenizer.apply_chat_template(
-            request.messages, tokenize=False, add_generation_prompt=True
+            request.messages, tokenize=False, add_generation_prompt=True
+        )
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=False).to(self.device)
         prompt_tokens = len(inputs['input_ids'][0])
         return formatted_prompt, inputs, prompt_tokens
evalscope/utils/deprecation_utils.py
CHANGED
@@ -1,5 +1,6 @@
 import functools
 import inspect
+import os
 from typing import Callable, Optional
 
 from .logger import get_logger
@@ -22,7 +23,7 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         @functools.wraps(func)
         def wrapper(*args, **kwargs):
             # Get the file name where the function is defined
-            file_name = inspect.getfile(func)
+            file_name = os.path.basename(inspect.getfile(func))
 
             # Construct the warning message
             warning_parts = [
@@ -40,3 +41,13 @@ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optiona
         return wrapper
 
     return decorator
+
+
+def deprecated_warning(logger, message: str):
+    """
+    Log a deprecation warning.
+
+    :param logger: Logger instance to log the warning
+    :param message: Warning message to log
+    """
+    logger.warning(f'Deprecated: {message}')
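Usage sketch (not part of the diff): the keyword names follow the signature visible in the hunk headers above; old_tokenize is a made-up function used only for illustration:

from evalscope.utils.deprecation_utils import deprecated, deprecated_warning
from evalscope.utils.logger import get_logger

logger = get_logger()


@deprecated(since='0.17.1', remove_in='1.0.0', alternative='new_tokenize')
def old_tokenize(text):  # hypothetical function, for illustration only
    return text.split()


old_tokenize('hello world')  # emits a deprecation warning that now reports only the base file name
deprecated_warning(logger, 'old_tokenize is deprecated, use new_tokenize')  # logs 'Deprecated: ...'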
evalscope/utils/function_utils.py
ADDED
@@ -0,0 +1,29 @@
+import threading
+from functools import wraps
+
+
+def run_once(func):
+    """Decorator to ensure a function is only run once."""
+    has_run = False
+    result = None
+
+    def wrapper(*args, **kwargs):
+        nonlocal has_run, result
+        if not has_run:
+            result = func(*args, **kwargs)
+            has_run = True
+        return result
+
+    return wrapper
+
+
+def thread_safe(func):
+    """Thread-safe decorator for functions that need to be executed in a thread-safe manner."""
+    lock = threading.RLock()
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        with lock:
+            return func(*args, **kwargs)
+
+    return wrapper
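Usage sketch (not part of the diff): both decorators are self-contained, so their behaviour follows directly from the code above:

from evalscope.utils.function_utils import run_once, thread_safe


@run_once
def load_config():
    print('loading config')  # printed only on the first call
    return {'seed': 42}


@thread_safe
def record(results, item):  # illustrative helper; the RLock serialises concurrent calls
    results.append(item)


cfg_a = load_config()  # prints 'loading config'
cfg_b = load_config()  # silent; the cached result is returned
assert cfg_a is cfg_b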
evalscope/utils/io_utils.py
CHANGED
@@ -5,6 +5,8 @@ import json
 import jsonlines as jsonl
 import os
 import re
+import string
+import unicodedata
 import yaml
 from io import BytesIO
 from PIL import Image
@@ -33,7 +35,7 @@ class OutputsStructure:
             'configs_dir': None
         }
 
-    def _get_dir(self, attr_name, dir_name):
+    def _get_dir(self, attr_name, dir_name) -> str:
         if self._dirs[attr_name] is None:
             dir_path = os.path.join(self.outputs_dir, dir_name)
             if self.is_make:
@@ -72,10 +74,20 @@ def jsonl_to_list(jsonl_file):
     Returns:
         list: list of lines. Each line is a dict.
     """
-
-
-
-
+    try:
+        res_list = []
+        with jsonl.open(jsonl_file, mode='r') as reader:
+            for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+                res_list.append(line)
+    except Exception:
+        # Fallback to reading line by line
+        res_list = []
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():  # Skip empty lines
+                    res_list.append(json.loads(line.strip()))
+    if not res_list:
+        logger.warning(f'No data found in {jsonl_file}.')
     return res_list
 
 
@@ -272,7 +284,90 @@ def get_valid_list(input_list, candidate_list):
 
 
 def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    """
+    Convert a PIL Image to a base64 encoded string.
+
+    Args:
+        image (Image.Image): The PIL Image to convert.
+        format (str): The format to save the image in. Default is 'JPEG'.
+    Returns:
+        str: Base64 encoded string of the image.
+    """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
     return img_str
+
+
+def safe_filename(s: str, max_length: int = 255) -> str:
+    """
+    Convert a string into a safe filename by removing or replacing unsafe characters.
+
+    Args:
+        s (str): The input string to convert
+        max_length (int): Maximum length of the resulting filename (default 255)
+
+    Returns:
+        str: A safe filename string
+
+    Examples:
+        >>> safe_filename("Hello/World?.txt")
+        'Hello_World.txt'
+    """
+    # normalize unicode characters
+    s = unicodedata.normalize('NFKD', s)
+    s = s.encode('ASCII', 'ignore').decode('ASCII')
+
+    # remove or replace unsafe characters
+    # Keep only alphanumeric characters, dots, dashes, and underscores
+    safe_chars = string.ascii_letters + string.digits + '.-_'
+    s = ''.join(c if c in safe_chars else '_' for c in s)
+
+    # remove consecutive underscores
+    s = re.sub(r'_+', '_', s)
+
+    # remove leading/trailing periods and underscores
+    s = s.strip('._')
+
+    # handle empty string case
+    if not s:
+        s = 'untitled'
+
+    # handle starting with a period (hidden files)
+    if s.startswith('.'):
+        s = '_' + s
+
+    # enforce length limit
+    if len(s) > max_length:
+        # If we need to truncate, preserve the file extension if present
+        name, ext = os.path.splitext(s)
+        ext_len = len(ext)
+        if ext_len > 0:
+            max_name_length = max_length - ext_len
+            s = name[:max_name_length] + ext
+        else:
+            s = s[:max_length]
+
+    return s
+
+
+def convert_numpy_types(obj):
+    """Recursively convert numpy types to native Python types for JSON serialization."""
+    import numpy as np
+
+    if isinstance(obj, np.bool_):
+        return bool(obj)
+    elif isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_numpy_types(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_numpy_types(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_numpy_types(item) for item in obj)
+    else:
+        return obj
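Behaviour sketch (not part of the diff): the two new helpers are pure functions, so the outputs below follow directly from the code above:

import json

import numpy as np

from evalscope.utils.io_utils import convert_numpy_types, safe_filename

print(safe_filename('gsm8k: run #1'))  # -> 'gsm8k_run_1' (unsafe characters collapsed to single underscores)

scores = {'acc': np.float64(0.85), 'n': np.int64(3), 'mask': np.array([True, False])}
print(json.dumps(convert_numpy_types(scores)))  # numpy scalars and arrays become JSON-serialisable Python types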
evalscope/utils/json_schema.py
ADDED
@@ -0,0 +1,208 @@
+import types
+import typing
+from copy import deepcopy
+from dataclasses import is_dataclass
+from datetime import date, datetime, time
+from enum import EnumMeta
+from pydantic import BaseModel, Field
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    get_args,
+    get_origin,
+    get_type_hints,
+    is_typeddict,
+)
+
+JSONType = Literal['string', 'integer', 'number', 'boolean', 'array', 'object', 'null']
+"""Valid types within JSON schema."""
+
+
+class JSONSchema(BaseModel):
+    """JSON Schema for type."""
+
+    type: Optional[JSONType] = Field(default=None)
+    """JSON type of tool parameter."""
+
+    format: Optional[str] = Field(default=None)
+    """Format of the parameter (e.g. date-time)."""
+
+    description: Optional[str] = Field(default=None)
+    """Parameter description."""
+
+    default: Any = Field(default=None)
+    """Default value for parameter."""
+
+    enum: Optional[List[Any]] = Field(default=None)
+    """Valid values for enum parameters."""
+
+    items: Optional['JSONSchema'] = Field(default=None)
+    """Valid type for array parameters."""
+
+    properties: Optional[Dict[str, 'JSONSchema']] = Field(default=None)
+    """Valid fields for object parametrs."""
+
+    additionalProperties: Optional[Union['JSONSchema', bool]] = Field(default=None)
+    """Are additional properties allowed?"""
+
+    anyOf: Optional[List['JSONSchema']] = Field(default=None)
+    """Valid types for union parameters."""
+
+    required: Optional[List[str]] = Field(default=None)
+    """Required fields for object parameters."""
+
+
+def json_schema(t: Type[Any]) -> JSONSchema:
+    """Provide a JSON Schema for the specified type.
+
+    Schemas can be automatically inferred for a wide variety of
+    Python class types including Pydantic BaseModel, dataclasses,
+    and typed dicts.
+
+    Args:
+        t: Python type
+
+    Returns:
+        JSON Schema for type.
+    """
+    origin = get_origin(t)
+    args = get_args(t)
+
+    if origin is None:
+        if t is int:
+            return JSONSchema(type='integer')
+        elif t is float:
+            return JSONSchema(type='number')
+        elif t is str:
+            return JSONSchema(type='string')
+        elif t is bool:
+            return JSONSchema(type='boolean')
+        elif t is datetime:
+            return JSONSchema(type='string', format='date-time')
+        elif t is date:
+            return JSONSchema(type='string', format='date')
+        elif t is time:
+            return JSONSchema(type='string', format='time')
+        elif t is list or t is set:
+            return JSONSchema(type='array', items=JSONSchema())
+        elif t is dict:
+            return JSONSchema(type='object', additionalProperties=JSONSchema())
+        elif (is_dataclass(t) or is_typeddict(t) or (isinstance(t, type) and issubclass(t, BaseModel))):
+            return cls_json_schema(t)
+        elif isinstance(t, EnumMeta):
+            return JSONSchema(enum=[item.value for item in t])
+        elif t is type(None):
+            return JSONSchema(type='null')
+        else:
+            return JSONSchema()
+    elif (origin is list or origin is List or origin is tuple or origin is Tuple or origin is set or origin is Set):
+        return JSONSchema(type='array', items=json_schema(args[0]) if args else JSONSchema())
+    elif origin is dict or origin is Dict:
+        return JSONSchema(
+            type='object',
+            additionalProperties=json_schema(args[1]) if len(args) > 1 else JSONSchema(),
+        )
+    elif origin is Union or origin is types.UnionType:
+        return JSONSchema(anyOf=[json_schema(arg) for arg in args])
+    elif origin is Optional:
+        return JSONSchema(anyOf=[json_schema(arg) for arg in args] + [JSONSchema(type='null')])
+    elif origin is typing.Literal:
+        return JSONSchema(enum=list(args))
+
+    return JSONSchema()  # Default case if we can't determine the type
+
+
+def cls_json_schema(cls: Type[Any]) -> JSONSchema:
+    properties: Dict[str, JSONSchema] = {}
+    required: List[str] = []
+
+    if is_dataclass(cls):
+        fields = cls.__dataclass_fields__  # type: ignore
+        for name, field in fields.items():
+            properties[name] = json_schema(field.type)  # type: ignore
+            if field.default == field.default_factory:
+                required.append(name)
+    elif isinstance(cls, type) and issubclass(cls, BaseModel):
+        schema = cls.model_json_schema()
+        schema = resolve_schema_references(schema)
+        for name, prop in schema.get('properties', {}).items():
+            properties[name] = JSONSchema(**prop)
+        required = schema.get('required', [])
+    elif is_typeddict(cls):
+        annotations = get_type_hints(cls)
+        for name, type_hint in annotations.items():
+            properties[name] = json_schema(type_hint)
+            if name in cls.__required_keys__:
+                required.append(name)
+
+    return JSONSchema(
+        type='object',
+        properties=properties,
+        required=required if required else None,
+        additionalProperties=False,
+    )
+
+
+def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type == 'str':
+        return 'string'
+    elif python_type == 'int':
+        return 'integer'
+    elif python_type == 'float':
+        return 'number'
+    elif python_type == 'bool':
+        return 'boolean'
+    elif python_type == 'list':
+        return 'array'
+    elif python_type == 'dict':
+        return 'object'
+    elif python_type == 'None':
+        return 'null'
+    elif python_type is None:
+        # treat 'unknown' as string as anything can be converted to string
+        return 'string'
+    else:
+        raise ValueError(f'Unsupported type: {python_type} for Python to JSON conversion.')
+
+
+def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
+    """Resolves all $ref references in a JSON schema by inlining the definitions."""
+    schema = deepcopy(schema)
+    definitions = schema.pop('$defs', {})
+
+    def _resolve_refs(obj: Any) -> Any:
+        if isinstance(obj, dict):
+            if '$ref' in obj and obj['$ref'].startswith('#/$defs/'):
+                ref_key = obj['$ref'].split('/')[-1]
+                if ref_key in definitions:
+                    # Replace with a deep copy of the definition
+                    resolved = deepcopy(definitions[ref_key])
+                    # Process any nested references in the definition
+                    resolved = _resolve_refs(resolved)
+
+                    # Merge in the current object fields, which should take priority
+                    # This means that if you have e.g.
+                    # {"$ref": "#/$defs/SubType", "description": "subtype of type SubType"},
+                    # and SubType resolves to
+                    # {"description": "The SubType Class", "parameters": {"param1": {"type": "string"}}},
+                    # the final result will be:
+                    # {"description": "subtype of type SubType", "parameters": {"param1": {"type": "string"}}}
+                    return resolved | {k: o for k, o in obj.items() if k != '$ref'}
+
+            # Process all entries in the dictionary
+            return {k: _resolve_refs(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [_resolve_refs(item) for item in obj]
+        else:
+            return obj
+
+    return cast(Dict[str, Any], _resolve_refs(schema))
evalscope/utils/logger.py
CHANGED
@@ -1,18 +1,27 @@
+import colorlog
 import importlib.util as iutil
 import logging
 import os
-from
+from logging import Logger
+from typing import List, Optional
 
 init_loggers = {}
+# Define log formats
+data_format = '%Y-%m-%d %H:%M:%S'
+# For console output
+color_detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(log_color)s%(levelname)s%(reset)s: %(message)s'  # noqa:E501
+color_simple_format = '%(asctime)s - %(name)s - %(log_color)s%(levelname)s%(reset)s: %(message)s'
+color_detailed_formatter = colorlog.ColoredFormatter(color_detailed_format, datefmt=data_format)
+color_simple_formatter = colorlog.ColoredFormatter(color_simple_format, datefmt=data_format)
+# For file output
+detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s: %(message)s'  # noqa:E501
+simple_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
+plain_detailed_formatter = logging.Formatter(detailed_format, datefmt=data_format)
+plain_simple_formatter = logging.Formatter(simple_format, datefmt=data_format)
 
-detailed_format = '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s - %(message)s'
-simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-
-detailed_formatter = logging.Formatter(detailed_format)
-simple_formatter = logging.Formatter(simple_format)
 DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
-logging.basicConfig(format=simple_format, level=
+logging.basicConfig(format=simple_format, level=logging.INFO, force=True)
 
 # set logging level
 logging.getLogger('datasets').setLevel(logging.WARNING)
@@ -20,7 +29,13 @@ logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
 
-def get_logger(
+def get_logger(
+    log_file: Optional[str] = None,
+    name: Optional[str] = None,
+    log_level: int = DEFAULT_LEVEL,
+    file_mode: str = 'w',
+    force=False
+):
     """Get logging logger
 
     Args:
@@ -31,7 +46,10 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
             specified (if filemode is unspecified, it defaults to 'w').
     """
 
-
+    if name:
+        logger_name = f"evalscope.{name.split('.')[-1]}"
+    else:
+        logger_name = 'evalscope'
     logger = logging.getLogger(logger_name)
     logger.propagate = False
 
@@ -40,7 +58,15 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
         logger.setLevel(log_level)
         for handler in logger.handlers:
             handler.setLevel(log_level)
-
+            # Use the appropriate formatter for each handler type
+            if isinstance(handler, logging.FileHandler):
+                handler.setFormatter(
+                    plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter
+                )
+            else:
+                handler.setFormatter(
+                    color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter
+                )
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
         return logger
 
@@ -66,7 +92,11 @@ def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, f
         handlers.append(file_handler)
 
     for handler in handlers:
-
+        # Use the appropriate formatter for each handler type
+        if isinstance(handler, logging.FileHandler):
+            handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
+        else:
+            handler.setFormatter(color_detailed_formatter if log_level == logging.DEBUG else color_simple_formatter)
         handler.setLevel(log_level)
         logger.addHandler(handler)
 
@@ -102,6 +132,15 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
 
     if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
-        file_handler.setFormatter(
+        file_handler.setFormatter(plain_detailed_formatter if log_level == logging.DEBUG else plain_simple_formatter)
         file_handler.setLevel(log_level)
         logger.addHandler(file_handler)
+
+
+def warn_once(logger: Logger, message: str) -> None:
+    if message not in _warned:
+        logger.warning(message)
+        _warned.append(message)
+
+
+_warned: List[str] = []
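Usage sketch (not part of the diff): the extended get_logger signature namespaces child loggers under 'evalscope.<name>' and applies colorised console formatters versus plain file formatters, while warn_once deduplicates repeated messages. The log file path below is a placeholder:

from evalscope.utils.logger import get_logger, warn_once

logger = get_logger(name=__name__, log_file='outputs/eval.log')  # placeholder path
logger.info('starting evaluation')

warn_once(logger, 'do_sample is ignored for this backend')  # logged
warn_once(logger, 'do_sample is ignored for this backend')  # suppressed on repeat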