evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/utils/function_utils.py
CHANGED

@@ -1,29 +1,266 @@
+import asyncio
 import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, wait
+from contextlib import contextmanager
 from functools import wraps
+from tqdm import tqdm
+from typing import Any, Awaitable, Callable, List, Optional, Sequence, TypeVar, Union
 
+from evalscope.utils.logger import get_logger
 
-def run_once(func):
-    """Decorator to ensure a function is only run once."""
-    has_run = False
-    result = None
+logger = get_logger()
 
+T = TypeVar('T')
+R = TypeVar('R')
+
+# Global lock to safely create per-instance locks in decorators
+_THREAD_SAFE_GLOBAL_LOCK = threading.RLock()
+
+
+def thread_safe(func: Callable[..., T]) -> Callable[..., T]:
+    """Thread-safe decorator.
+    - If decorating a bound method, uses a per-instance, per-method lock.
+    - If decorating a function, uses a function-scoped lock.
+    """
+    func_lock = threading.RLock()
+    lock_attr_name = f'__lock_{func.__name__}'
+
+    @wraps(func)
     def wrapper(*args, **kwargs):
-        nonlocal has_run, result
-        if not has_run:
-            result = func(*args, **kwargs)
-            has_run = True
-        return result
+        # Prefer per-instance lock if the first arg looks like 'self'
+        if args and hasattr(args[0], '__dict__'):
+            self_obj = args[0]
+            lock = getattr(self_obj, lock_attr_name, None)
+            if lock is None:
+                with _THREAD_SAFE_GLOBAL_LOCK:
+                    lock = getattr(self_obj, lock_attr_name, None)
+                    if lock is None:
+                        lock = threading.RLock()
+                        setattr(self_obj, lock_attr_name, lock)
+        else:
+            lock = func_lock
+
+        with lock:
+            return func(*args, **kwargs)
 
     return wrapper
 
 
-def thread_safe(func):
-    """Thread-safe decorator."""
+def run_once(func: Callable[..., T]) -> Callable[..., T]:
+    """Decorator to ensure a function is executed at most once across threads."""
     lock = threading.RLock()
+    has_run: bool = False
+    result: Optional[T] = None
 
     @wraps(func)
     def wrapper(*args, **kwargs):
+        nonlocal has_run, result
+        if has_run:
+            return result
+        # Double-checked locking to avoid redundant locking on hot path
         with lock:
-            return func(*args, **kwargs)
+            if not has_run:
+                result = func(*args, **kwargs)
+                has_run = True
+            return result
 
     return wrapper
+
+
+def retry_func(retries=3, sleep_interval=0):
+    """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+    def decorator(func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if sleep_interval > 0:
+                        time.sleep(sleep_interval)
+            raise last_exception
+
+        return wrapper
+
+    return decorator
+
+
+@contextmanager
+def retry_context(retries=3, sleep_interval=0):
+    """A context manager that retries the code block up to `retries` times if an exception occurs."""
+    last_exception = None
+    for attempt in range(retries):
+        try:
+            yield
+            return  # If no exception, exit successfully
+        except Exception as e:
+            last_exception = e
+            if sleep_interval > 0:
+                time.sleep(sleep_interval)
+            if attempt == retries - 1:  # Last attempt
+                break
+    raise last_exception
+
+
+class AsyncioLoopRunner:
+    """Singleton background asyncio loop runner for sync→async bridging."""
+    _instance: Optional['AsyncioLoopRunner'] = None
+    _inst_lock = threading.Lock()
+
+    def __init__(self) -> None:
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+        self._thread: Optional[threading.Thread] = None
+        self._start_loop()
+
+    def _start_loop(self) -> None:
+        loop = asyncio.new_event_loop()
+        self._loop = loop
+
+        def run_loop() -> None:
+            asyncio.set_event_loop(loop)
+            loop.run_forever()
+
+        self._thread = threading.Thread(target=run_loop, daemon=True, name='AsyncioLoopRunner')
+        self._thread.start()
+
+    @classmethod
+    def instance(cls) -> 'AsyncioLoopRunner':
+        if cls._instance is not None:
+            return cls._instance
+        with cls._inst_lock:
+            if cls._instance is None:
+                cls._instance = AsyncioLoopRunner()
+        return cls._instance
+
+    @classmethod
+    def run(cls, coro: Awaitable[T], timeout: Optional[float] = None) -> T:
+        """Submit a coroutine to the background loop and wait for result."""
+        inst = cls.instance()
+        fut = asyncio.run_coroutine_threadsafe(coro, inst._loop)
+        return fut.result(timeout=timeout)
+
+    @property
+    def loop(self) -> Optional[asyncio.AbstractEventLoop]:
+        """Access the underlying event loop (read-only use)."""
+        return self._loop
+
+    def stop(self, join_timeout: float = 5.0) -> None:
+        """Optional shutdown of the background loop (generally not needed)."""
+        if not self._loop:
+            return
+        self._loop.call_soon_threadsafe(self._loop.stop)
+        if self._thread:
+            self._thread.join(timeout=join_timeout)
+
+
+def run_in_threads_with_progress(
+    items: Sequence[T],
+    worker: Callable[[T], R],
+    *,
+    desc: str,
+    max_workers: int,
+    heartbeat_sec: int,
+    on_result: Optional[Callable[[T, R], None]] = None,
+    on_error: Optional[Callable[[T, Exception], None]] = None,
+    filter_none_results: bool = False,
+) -> List[R]:
+    """
+    Execute a collection of tasks concurrently with a ThreadPoolExecutor while
+    displaying a tqdm progress bar and emitting periodic heartbeat logs.
+
+    Key behaviors:
+    - Concurrency: Uses up to `min(len(items), max_workers)` threads.
+    - Progress: A tqdm bar advances when each task finishes (success or failure).
+    - Heartbeat: If no tasks finish within `heartbeat_sec`, a status line is logged.
+    - Ordering: Results are appended in completion order (not the original order).
+    - Error handling:
+      * If `on_error` is provided, it is called for each failed item; execution continues
+        unless `on_error` itself raises.
+      * If `on_error` is None, the first exception is raised immediately and stops processing.
+    - Callbacks:
+      * `on_result(item, result)` is called after a successful result is obtained.
+      * Both callbacks run in the main thread (not worker threads).
+
+    Args:
+        items: A sequence of items (inputs) to process. Converted to a list internally.
+        worker: A callable executed in threads to process a single item and return a result.
+        desc: A short text shown as the tqdm progress bar description.
+        max_workers: Upper bound on the number of concurrent threads.
+        heartbeat_sec: Interval (in seconds) to wait before emitting a heartbeat log if
+            no tasks complete in that window.
+        on_result: Optional callback invoked as on_result(item, result) after success.
+        on_error: Optional callback invoked as on_error(item, exception) on failure. If omitted,
+            the exception is propagated and the function terminates early.
+
+    Returns:
+        A list of results collected as tasks complete (completion order).
+        If some tasks fail and `on_error` is provided (and does not re-raise), those failures
+        are skipped and not included in the returned results.
+
+    Raises:
+        Exception: Propagates the first task exception if `on_error` is not provided, or if
+            `on_error` re-raises.
+
+    Notes:
+    - The function is blocking until all tasks complete or an exception is propagated.
+    - Use `on_error` to implement "best-effort" processing where failures are logged
+      and the rest continue.
+    """
+    # Defensive copy to avoid consuming a generator multiple times and to compute pool size.
+    pending_items: List[T] = list(items)
+    if not pending_items:
+        return []
+
+    # Include indices to ensure results are returned in input order
+    indexed_items = list(enumerate(items))
+    results: List[Optional[R]] = [None] * len(items)  # Preallocate results list
+
+    # Bound the pool by actual workload size for efficiency.
+    with ThreadPoolExecutor(max_workers=min(len(indexed_items), max_workers)) as executor:
+        # Submit all tasks up-front and map futures back to their originating item.
+        future_to_index = {executor.submit(worker, item): index for index, item in indexed_items}
+
+        # Progress bar reflects total number of submitted tasks; updated per finished future.
+        with tqdm(total=len(indexed_items), desc=desc, mininterval=1, dynamic_ncols=True) as pbar:
+            # Track unfinished futures and poll with a timeout to enable heartbeat logs.
+            pending = set(future_to_index.keys())
+            while pending:
+                # Wait with timeout to detect stalls and emit heartbeats proactively.
+                done, not_done = wait(pending, timeout=heartbeat_sec)
+                if not done:
+                    # Heartbeat when nothing has completed within the window.
+                    logger.info(f'{desc} still processing... pending={len(not_done)}')
+                    continue
+
+                # Consume completed futures.
+                for future in done:
+                    index = future_to_index[future]
+                    try:
+                        res = future.result()
+                        results[index] = res  # Store result at the correct index
+                        # Invoke success callback in caller thread (not in worker).
+                        if on_result is not None:
+                            on_result(items[index], res)
+                    except Exception as exc:
+                        # Delegate failure handling to on_error if provided; otherwise bubble up.
+                        if on_error is not None:
+                            on_error(items[index], exc)
+                        else:
+                            raise
+                    finally:
+                        # Always advance progress for completed futures (success or failure).
+                        pbar.update(1)
+
+                # Continue polling remaining futures.
+                pending = not_done
+
+    # Return results, which are now guaranteed to be in input order
+    if filter_none_results:
+        # Filter out None results if on_error was used and some tasks failed
+        results = [res for res in results if res is not None]
+    return results
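For orientation, a minimal usage sketch of the new helpers based only on the signatures shown above; the `double` coroutine, the worker, and all argument values are illustrative, not taken from the package:

from evalscope.utils.function_utils import AsyncioLoopRunner, retry_func, run_in_threads_with_progress


@retry_func(retries=3, sleep_interval=1)
def flaky_step() -> None:
    ...  # re-invoked up to 3 times, 1s apart; the last exception is re-raised


async def double(x: int) -> int:  # illustrative coroutine
    return x * 2


def worker(item: int) -> int:
    # Bridge a synchronous worker thread into the shared background event loop.
    return AsyncioLoopRunner.run(double(item), timeout=30)


results = run_in_threads_with_progress(
    list(range(8)),
    worker,
    desc='demo',
    max_workers=4,
    heartbeat_sec=60,
    on_error=lambda item, exc: print(f'item {item} failed: {exc}'),
    filter_none_results=True,  # drop slots left as None by failed items
)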
evalscope/utils/import_utils.py
CHANGED
@@ -5,13 +5,85 @@ import importlib
 import os
 from itertools import chain
 from types import ModuleType
-from typing import Any
+from typing import Any, Optional, Union
 
+from evalscope.constants import IS_BUILD_DOC
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
+def check_import(
+    module_name: Union[str, list[str]],
+    package: Optional[Union[str, list[str]]] = None,
+    raise_warning: bool = True,
+    raise_error: bool = False,
+    feature_name: Optional[str] = 'this feature',
+) -> bool:
+    """Check if a module or list of modules can be imported.
+
+    Args:
+        module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+        package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+            Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+        raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+        feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+            Defaults to 'this feature'.
+
+    Returns:
+        bool: True if all modules can be imported, False otherwise.
+    """
+    # Convert single strings to lists for uniform processing
+    if isinstance(module_name, str):
+        module_names = [module_name]
+    else:
+        module_names = module_name
+
+    if package is None:
+        packages = [None] * len(module_names)
+    elif isinstance(package, str):
+        packages = [package] * len(module_names)
+    else:
+        packages = package
+    # Ensure packages list has same length as module_names
+    if len(packages) < len(module_names):
+        packages.extend([None] * (len(module_names) - len(packages)))
+
+    missing_modules = []
+    missing_packages = []
+
+    for i, mod_name in enumerate(module_names):
+        try:
+            importlib.import_module(mod_name)
+        except ImportError:
+            missing_modules.append(mod_name)
+            if i < len(packages) and packages[i]:
+                missing_packages.append(packages[i])
+
+    if missing_modules:
+        if len(missing_modules) == 1:
+            error_msg = f'`{missing_modules[0]}` not found.'
+        else:
+            error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+        if missing_packages:
+            if len(missing_packages) == 1:
+                error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+            else:
+                unique_packages = list(dict.fromkeys(missing_packages))  # Remove duplicates while preserving order
+                error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+        if raise_warning:
+            logger.warning(error_msg)
+
+        if not IS_BUILD_DOC and raise_error:
+            raise ImportError(error_msg)
+        return False
+
+    return True
+
+
 class _LazyModule(ModuleType):
     """
     Module class that surfaces all objects but only performs associated imports when the objects are requested.
evalscope/utils/io_utils.py
CHANGED
@@ -1,6 +1,7 @@
 import base64
 import csv
 import hashlib
+import io
 import json
 import jsonlines as jsonl
 import os
@@ -8,8 +9,10 @@ import re
 import string
 import unicodedata
 import yaml
+from datetime import datetime
 from io import BytesIO
 from PIL import Image
+from typing import Tuple
 
 from evalscope.constants import DumpMode
 from evalscope.utils.logger import get_logger
@@ -122,6 +125,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
     if not isinstance(data_list, list):
         data_list = [data_list]
 
+    # Convert non-serializable types to serializable ones
+    data_list = convert_normal_types(data_list)
+
     if dump_mode == DumpMode.OVERWRITE:
         dump_mode = 'w'
     elif dump_mode == DumpMode.APPEND:
@@ -168,6 +174,24 @@ def csv_to_list(csv_file) -> list:
     return res_list
 
 
+def tsv_to_list(tsv_file) -> list:
+    """
+    Read tsv file to list.
+
+    Args:
+        tsv_file: tsv file path.
+
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with open(tsv_file, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f, delimiter='\t')
+        for row in reader:
+            res_list.append(row)
+    return res_list
+
+
 def csv_to_jsonl(csv_file, jsonl_file):
     """
     Convert csv file to jsonl file.
@@ -283,22 +307,64 @@ def get_valid_list(input_list, candidate_list):
         [i for i in input_list if i not in candidate_list]
 
 
-def PIL_to_base64(image: Image.Image, format: str = 'JPEG') -> str:
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
     """
     Convert a PIL Image to a base64 encoded string.
 
     Args:
         image (Image.Image): The PIL Image to convert.
         format (str): The format to save the image in. Default is 'JPEG'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+
     Returns:
         str: Base64 encoded string of the image.
     """
     buffered = BytesIO()
     image.save(buffered, format=format)
     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format.lower()};base64,{img_str}'
     return img_str
 
 
+def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+    """Convert bytes to a base64 encoded string.
+
+    Args:
+        bytes_data (bytes): The bytes to convert.
+        format (str): The format of the image. Default is 'png'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
+
+    Returns:
+        str: Base64 encoded string of the bytes.
+    """
+    base64_str = base64.b64encode(bytes_data).decode('utf-8')
+    if add_header:
+        base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+    return base64_str
+
+
+def base64_to_PIL(base64_str):
+    """Convert a base64 encoded string to a PIL Image.
+
+    Args:
+        base64_str (str): The base64 encoded string.
+
+    Returns:
+        Image.Image: The decoded PIL Image.
+    """
+    # remove header
+    if ',' in base64_str:
+        base64_str = base64_str.split(',', 1)[1]
+
+    # decode
+    img_data = base64.b64decode(base64_str)
+    img_file = io.BytesIO(img_data)
+    img = Image.open(img_file)
+    return img
+
+
 def safe_filename(s: str, max_length: int = 255) -> str:
     """
     Convert a string into a safe filename by removing or replacing unsafe characters.
@@ -351,11 +417,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
     return s
 
 
-def convert_numpy_types(obj):
-    """Recursively convert numpy types to native Python types for JSON serialization."""
+def convert_normal_types(obj):
+    """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
    import numpy as np
 
-    if isinstance(obj, np.bool_):
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    elif isinstance(obj, np.bool_):
         return bool(obj)
     elif isinstance(obj, np.integer):
         return int(obj)
@@ -364,10 +432,67 @@ def convert_numpy_types(obj):
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
     elif isinstance(obj, dict):
-        return {key: convert_numpy_types(value) for key, value in obj.items()}
+        return {key: convert_normal_types(value) for key, value in obj.items()}
     elif isinstance(obj, list):
-        return [convert_numpy_types(item) for item in obj]
+        return [convert_normal_types(item) for item in obj]
     elif isinstance(obj, tuple):
-        return tuple(convert_numpy_types(item) for item in obj)
+        return tuple(convert_normal_types(item) for item in obj)
+    elif isinstance(obj, os.PathLike):
+        return str(obj)
     else:
         return obj
+
+
+def compress_image_to_limit(image_bytes: bytes, max_bytes: int = 10_000_000) -> Tuple[bytes, str]:
+    """
+    Ensure image bytes are under max_bytes by re-encoding to JPEG with quality reduction
+    and optional downscaling. Returns (processed_bytes, format_str).
+    If the original bytes are already below the limit, returns them as PNG.
+    """
+    if len(image_bytes) <= max_bytes:
+        return image_bytes, 'png'
+
+    try:
+        img = Image.open(BytesIO(image_bytes))
+    except Exception as exc:
+        logger.warning(f'Failed to open image bytes with PIL, sending original image; may exceed API limit: {exc}')
+        return image_bytes, 'png'
+
+    # Convert to RGB for JPEG if needed
+    if img.mode not in ('RGB', 'L'):
+        img = img.convert('RGB')
+
+    def encode_jpeg(source: Image.Image, quality: int) -> bytes:
+        buf = BytesIO()
+        source.save(buf, format='JPEG', quality=quality, optimize=True, progressive=True)
+        return buf.getvalue()
+
+    # Start with moderate quality and reduce
+    quality: int = 85
+    out: bytes = encode_jpeg(img, quality)
+    quality_floor: int = 40
+
+    while len(out) > max_bytes and quality > quality_floor:
+        quality -= 10
+        out = encode_jpeg(img, quality)
+
+    # If still too large, progressively downscale
+    min_side_floor: int = 256
+    scale: float = 0.9
+    while len(out) > max_bytes and min(img.size) > min_side_floor:
+        new_w = max(min_side_floor, int(img.width * scale))
+        new_h = max(min_side_floor, int(img.height * scale))
+        if (new_w, new_h) == img.size:
+            break
+        img = img.resize((new_w, new_h), Image.LANCZOS)
+        out = encode_jpeg(img, quality)
+
+    if len(out) > max_bytes:
+        logger.warning(f'Image remains above limit after compression: size={len(out)} bytes (limit={max_bytes}).')
+    else:
+        logger.info(
+            f'Compressed image from {len(image_bytes)} to {len(out)} bytes; '
+            f'quality={quality}, size={img.width}x{img.height}.'
+        )
+
+    return out, 'jpeg'
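A round-trip sketch of the new image helpers, using only the signatures shown above; the generated image and the input file name are illustrative:

from PIL import Image

from evalscope.utils.io_utils import PIL_to_base64, base64_to_PIL, bytes_to_base64, compress_image_to_limit

img = Image.new('RGB', (64, 64), 'red')  # illustrative image

# With add_header=True the result is a data URI; base64_to_PIL strips the
# header again before decoding, so the pair round-trips.
uri = PIL_to_base64(img, format='PNG', add_header=True)  # 'data:image/png;base64,...'
restored = base64_to_PIL(uri)

# content_type switches the data-URI prefix, e.g. for audio payloads.
wav_uri = bytes_to_base64(b'\x00\x01', format='wav', add_header=True, content_type='audio')

# Keep request payloads under a 10 MB limit; oversized images are re-encoded
# as JPEG with decreasing quality, then downscaled if still too large.
with open('photo.png', 'rb') as f:  # illustrative input file
    data, fmt = compress_image_to_limit(f.read())  # fmt is 'png' or 'jpeg'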
evalscope/utils/json_schema.py
CHANGED
@@ -4,7 +4,7 @@ from copy import deepcopy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
 from enum import EnumMeta
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import (
     Any,
     Dict,
@@ -59,6 +59,28 @@ class JSONSchema(BaseModel):
     required: Optional[List[str]] = Field(default=None)
     """Required fields for object parameters."""
 
+    @model_validator(mode='before')
+    def convert_type_before_validation(cls, values):
+        values = deepcopy(values)
+
+        def recursive_convert_type(obj):
+            if isinstance(obj, dict):
+                # Convert 'type' field if it's a string
+                if 'type' in obj and isinstance(obj['type'], str):
+                    try:
+                        obj['type'] = python_type_to_json_type(obj['type'])
+                    except ValueError:
+                        # If conversion fails, leave it as is
+                        pass
+                # Recursively process nested structures
+                for k, v in obj.items():
+                    obj[k] = recursive_convert_type(v)
+            elif isinstance(obj, list):
+                return [recursive_convert_type(item) for item in obj]
+            return obj
+
+        return recursive_convert_type(values)
+
 
 def json_schema(t: Type[Any]) -> JSONSchema:
     """Provide a JSON Schema for the specified type.
@@ -152,6 +174,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:
 
 
 def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type is not None and python_type in get_args(JSONType):
+        return python_type
     if python_type == 'str':
         return 'string'
     elif python_type == 'int':
@@ -205,4 +229,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
         return obj
 
     return cast(Dict[str, Any], _resolve_refs(schema))
-    return cast(Dict[str, Any], _resolve_refs(schema))
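A sketch of what the new passthrough and 'before' validator accept; note that a `type` field on `JSONSchema` is assumed from context, since only `required` appears in the hunk above:

from evalscope.utils.json_schema import JSONSchema, python_type_to_json_type

# Values that are already JSON Schema types now pass through unchanged...
assert python_type_to_json_type('string') == 'string'
# ...while Python type names are still mapped as before.
assert python_type_to_json_type('str') == 'string'

# The model validator applies the same conversion recursively before pydantic
# validation, so schemas written with Python type names ('str', 'int', ...)
# validate cleanly. (Assumes JSONSchema declares a 'type' field.)
schema = JSONSchema.model_validate({'type': 'str', 'required': None})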
|