evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/mixin/sandbox_mixin.py
ADDED
@@ -0,0 +1,182 @@
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from ms_enclave.sandbox.manager import SandboxManager
+
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class SandboxMixin:
+    """Sandbox mixin for sandboxed code execution."""
+
+    def __init__(self, task_config: 'TaskConfig'):
+        self._task_config = task_config
+
+        self._manager: Optional['SandboxManager'] = None
+        """Sandbox manager instance."""
+
+        self._sandbox_id: Optional[str] = None
+        """Sandbox ID."""
+
+        # Lazy init state
+        self._initialized: bool = False
+
+        # NOTE: Initialization is deferred.
+        super().__init__()
+
+    async def _async_init(self):
+        """Async initialization helper."""
+        await self.init_sandbox_manager_async()
+        await self.init_sandbox_async()
+
+    @property
+    def use_sandbox(self) -> bool:
+        """
+        Return whether to use sandbox for the benchmark.
+        """
+        if not self._task_config:
+            return False
+        else:
+            return self._task_config.use_sandbox
+
+    @property
+    def sandbox_manager(self) -> Optional['SandboxManager']:
+        """Get the sandbox manager instance."""
+        return self._manager
+
+    @property
+    def sandbox_id(self) -> Optional[str]:
+        """Get the sandbox ID."""
+        return self._sandbox_id
+
+    @thread_safe
+    def ensure_sandbox_ready(self) -> bool:
+        """
+        Ensure the sandbox loop, manager, and sandbox instance are initialized.
+        This method is thread-safe and idempotent.
+        """
+        if not self.use_sandbox:
+            return False
+
+        if self._initialized and self._manager and self._sandbox_id:
+            return True
+
+        # Initialize manager and sandbox using the class-level runner
+        AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+        AsyncioLoopRunner.run(self.init_sandbox_async())
+
+        self._initialized = True
+        return True
+
+    async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager asynchronously."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+        manager_config = self._task_config.sandbox_manager_config or {}
+        if manager_config.get('base_url'):
+            # Remote manager
+            self._manager = HttpSandboxManager(**manager_config)
+        else:
+            # Local manager
+            self._manager = LocalSandboxManager(**manager_config)
+
+        await self._manager.start()
+        logger.info('Sandbox manager initialized.')
+        return self._manager
+
+    def init_sandbox_manager(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        return AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+
+    async def init_sandbox_async(self) -> Optional[str]:
+        """Initialize the sandbox instance asynchronously."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+        sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+            image='python:3.11-slim', tools_config={
+                'shell_executor': {},
+                'python_executor': {}
+            }
+        )
+        sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+        self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+        sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+        logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+        return self._sandbox_id
+
+    def init_sandbox(self) -> Optional[str]:
+        """Initialize the sandbox instance."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        return AsyncioLoopRunner.run(self.init_sandbox_async())
+
+    def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+        """Execute code in the sandbox."""
+        # Lazy, thread-safe initialization
+        if not self.ensure_sandbox_ready():
+            logger.warning('Sandbox is not initialized.')
+            return {'error': 'Sandbox is not initialized.'}
+
+        from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+        async def _execute_async():
+            if language.lower() == 'python':
+                tool_name = 'python_executor'
+                parameters = {'code': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            elif language.lower() == 'shell':
+                tool_name = 'shell_executor'
+                parameters = {'command': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            else:
+                logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+                result = ToolResult(
+                    status=ExecutionStatus.ERROR,
+                    tool_name='code_executor',
+                    output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+                )
+            return result
+
+        # Execute in background loop via class-level runner
+        result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
+        return result.model_dump(exclude_none=True)
+
+    def sandbox_finalize(self, *args, **kwargs):
+        """Finalize the sandbox manager."""
+        if self._manager:
+            try:
+                # Stop the manager but keep the shared loop alive
+                AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
+                logger.info('Sandbox manager finalized.')
+            except Exception as e:
+                logger.warning(f'Error finalizing sandbox manager: {e}')
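
The new `SandboxMixin` gives adapters lazy, thread-safe access to an `ms_enclave` sandbox. A minimal usage sketch follows; the adapter class and the commented-out wiring are illustrative, not part of the diff, and only `execute_code_in_sandbox` and `sandbox_finalize` come from the file above:

```python
from evalscope.api.mixin.sandbox_mixin import SandboxMixin


class MyCodeAdapter(SandboxMixin):
    """Hypothetical adapter that runs model-generated code in the sandbox."""

    def run_generated_code(self, code: str) -> dict:
        # Lazily starts the manager and creates the sandbox on first use,
        # then dispatches to the 'python_executor' tool.
        return self.execute_code_in_sandbox(code, timeout=30, language='python')


# Wiring sketch: `task_config` is an evalscope TaskConfig with use_sandbox=True
# plus the sandbox_* fields (see the new CLI flags in evalscope/arguments.py
# further down).
# adapter = MyCodeAdapter(task_config)
# print(adapter.run_generated_code('print(1 + 1)'))
# adapter.sandbox_finalize()
```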
evalscope/api/model/generate_config.py
CHANGED
@@ -25,9 +25,7 @@ class ResponseSchema(BaseModel):
 
 class GenerateConfig(BaseModel):
     """Model generation options."""
-
-    max_retries: Optional[int] = Field(default=None)
-    """Maximum number of times to retry request (defaults to unlimited)."""
+    model_config = {'extra': 'allow'}
 
     timeout: Optional[int] = Field(default=None)
     """Request timeout (in seconds)."""
@@ -38,9 +36,6 @@ class GenerateConfig(BaseModel):
     stream: Optional[bool] = Field(default=None)
     """Whether to stream the response (default is model specific)."""
 
-    system_message: Optional[str] = Field(default=None)
-    """Override the default system message."""
-
     max_tokens: Optional[int] = Field(default=None)
     """The maximum number of tokens that can be generated in the completion (default is model specific)."""
 
@@ -62,6 +57,9 @@ class GenerateConfig(BaseModel):
     presence_penalty: Optional[float] = Field(default=None)
     """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
 
+    repetition_penalty: Optional[float] = Field(default=None)
+    """Exponential penalty applied to existing tokens in the generated text. 1.0 means no penalty. OpenAI, HuggingFace, and vLLM only."""
+
     logit_bias: Optional[Dict[int, float]] = Field(default=None)
     """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, Grok, and vLLM only."""
 
@@ -113,6 +111,12 @@ class GenerateConfig(BaseModel):
     extra_body: Optional[Dict[str, Any]] = Field(default=None)
     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
 
+    extra_query: Optional[Dict[str, Any]] = Field(default=None)
+    """Extra query parameters to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+    extra_headers: Optional[Dict[str, str]] = Field(default=None)
+    """Extra headers to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
     height: Optional[int] = Field(default=None)
     """Image height for image generation model only"""
 
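
With `model_config = {'extra': 'allow'}`, `GenerateConfig` now tolerates provider-specific keys it does not declare, alongside the new `repetition_penalty`, `extra_query`, and `extra_headers` fields. A small sketch; the `enable_thinking` key and the header/query values are made-up illustrations:

```python
from evalscope.api.model.generate_config import GenerateConfig

config = GenerateConfig(
    max_tokens=512,
    repetition_penalty=1.05,                          # added in this diff
    extra_query={'api-version': '2024-06-01'},        # added in this diff
    extra_headers={'X-Request-Source': 'evalscope'},  # added in this diff
    enable_thinking=False,  # undeclared key, kept because extra='allow'
)
print(config.model_dump(exclude_none=True))
```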
evalscope/api/model/model.py
CHANGED
@@ -318,7 +318,7 @@ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
 
 @thread_safe
 def get_model(
-    model: str,
+    model: Union[str, Model, ModelAPI],
     eval_type: str,
     base_url: Optional[str] = None,
     api_key: Optional[str] = None,
@@ -346,6 +346,9 @@ def get_model(
     if isinstance(model, Model):
         return model
 
+    if isinstance(model, ModelAPI):
+        return Model(model, config, model_args)
+
     # see if we can return a memoized model instance
     # (exclude mockllm since custom_outputs is an infinite generator)
     model_cache_key: str = ''
@@ -362,7 +365,7 @@ def get_model(
 
     logger.info(
        f'Creating model {model} with eval_type={eval_type} '
-        f'base_url={base_url},
+        f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
    )
 
    # find a matching model type
evalscope/api/tool/tool_info.py
CHANGED
@@ -1,7 +1,7 @@
 import inspect
 from dataclasses import dataclass
 from docstring_parser import Docstring, parse
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
 
 from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
evalscope/app/app.py
CHANGED
@@ -6,6 +6,7 @@ import argparse
 from evalscope.utils.logger import configure_logging
 from .arguments import add_argument
 from .ui import create_app_ui
+from .utils.env_utils import setup_env
 
 
 def create_app(args: argparse.Namespace):
@@ -17,6 +18,8 @@ def create_app(args: argparse.Namespace):
     """
     configure_logging(debug=args.debug)
 
+    setup_env(args)
+
     demo = create_app_ui(args)
 
     demo.launch(
evalscope/app/ui/multi_model.py
CHANGED
@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)
 
         # Get subset choices - should be same for both models
-        subsets
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df_a.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
 
         return gr.update(choices=subsets, value=None), None
 
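
The subset selection above keeps only rows whose first category column (shown as `Cat.0` in the UI) is not `'-'`. A toy illustration of the same pandas pattern, with made-up column names and data:

```python
import pandas as pd

df = pd.DataFrame({
    'Cat.0': ['math', '-', 'code'],
    'Subset': ['algebra', 'misc', 'python'],
})
# Same .loc / .ne('-') selection as in the dashboard code above.
subsets = sorted(df.loc[df['Cat.0'].ne('-'), 'Subset'].dropna().unique().tolist())
print(subsets)  # ['algebra', 'python']
```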
evalscope/app/ui/single_model.py
CHANGED
@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
-        report_df = get_data_frame(report_list=report_list)
+        report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
         analysis = get_report_analysis(report_list, dataset_name)
         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
-        subsets
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
+
         logger.debug(f'subsets: {subsets}')
         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
@@ -198,9 +204,9 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
 
         # Process the data for display
         input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
-        generated_md =
-        gold_md =
-        pred_md =
+        generated_md = convert_markdown_image(row['Generated'])
+        gold_md = convert_markdown_image(row['Gold'])
+        pred_md = process_model_prediction(row['Pred'])
         score_md = process_json_content(row['Score'])
         nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
 
evalscope/app/utils/data_utils.py
CHANGED
@@ -2,7 +2,6 @@
 Data loading and processing utilities for the Evalscope dashboard.
 """
 import glob
-import numpy as np
 import os
 import pandas as pd
 from typing import Any, Dict, List, Union
@@ -160,17 +159,19 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
         if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
             continue
 
-        prediction = sample_score.score.prediction
-        target = review_result.target
-        extracted_prediction = sample_score.score.extracted_prediction
         score = sample_score.score
+        metadata = sample_score.sample_metadata
+        prediction = score.prediction
+        target = review_result.target
+        extracted_prediction = score.extracted_prediction
         raw_d = {
             'Index': str(review_result.index),
             'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
-            'Metadata':
-            'Generated': prediction
+            'Metadata': metadata,
+            'Generated': prediction or '',  # Ensure no None value
             'Gold': target,
-            'Pred': extracted_prediction
+            'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+            or '',  # Ensure no None value
             'Score': score.model_dump(exclude_none=True),
             'NScore': normalize_score(score.main_value)
         }
evalscope/app/utils/env_utils.py
ADDED
@@ -0,0 +1,12 @@
+# flake8: noqa
+import os
+
+
+def setup_env(args):
+    compat_dsw_gradio(args)
+
+
+def compat_dsw_gradio(args) -> None:
+    if ('JUPYTER_NAME' in os.environ) and ('dsw-'
+                                           in os.environ['JUPYTER_NAME']) and ('GRADIO_ROOT_PATH' not in os.environ):
+        os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"
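
`compat_dsw_gradio` appears to target PAI-DSW-style notebooks, where `JUPYTER_NAME` starts with `dsw-`, and only fires when `GRADIO_ROOT_PATH` is not already set. A worked illustration with made-up values:

```python
import os

os.environ['JUPYTER_NAME'] = 'dsw-123456'  # hypothetical DSW instance name
server_port = 7860                         # hypothetical Gradio port

# Same f-string that compat_dsw_gradio builds:
root_path = f"/{os.environ['JUPYTER_NAME']}/proxy/{server_port}"
print(root_path)  # /dsw-123456/proxy/7860
```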
evalscope/app/utils/text_utils.py
CHANGED
@@ -2,11 +2,9 @@
 Text processing utilities for the Evalscope dashboard.
 """
 import json
-import numpy as np
 import os
-import pandas as pd
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from evalscope.utils.logger import get_logger
 from ..constants import LATEX_DELIMITERS
@@ -14,15 +12,19 @@ from ..constants import LATEX_DELIMITERS
 logger = get_logger()
 
 
-def convert_markdown_image(text):
-    if
-
-
-
-        text = os.path.abspath(text)
-        image_tag = f''
-        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+def convert_markdown_image(text: str):
+    if text.startswith('data:image'):
+        # Convert base64 image data to a markdown image tag
+        image_tag = f''
+        logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {image_tag[:40]}...')
         return image_tag
+    elif os.path.isfile(text):
+        # Convert the image path to a markdown image tag
+        if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+            text = os.path.abspath(text)
+            image_tag = f''
+            logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+            return image_tag
     return text
 
 
@@ -85,7 +87,7 @@ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     return result
 
 
-def process_model_prediction(item: Any, max_length: int =
+def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
     if isinstance(item, (dict, list)):
         result = json.dumps(item, ensure_ascii=False, indent=2)
         result = f'```json\n{result}\n```'
evalscope/app/utils/visualization.py
CHANGED
@@ -18,7 +18,7 @@ logger = get_logger()
 def plot_single_report_scores(df: pd.DataFrame):
     if df is None:
         return None
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
 
     width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     df = get_data_frame(report_list=report_list, flatten_metrics=False)
     categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
 
     plot = px.sunburst(
evalscope/arguments.py
CHANGED
@@ -2,7 +2,7 @@
 import argparse
 import json
 
-from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
+from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
 
 
 class ParseStrArgsAction(argparse.Action):
@@ -60,8 +60,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
 
     # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
@@ -77,7 +76,6 @@ def add_argument(parser: argparse.ArgumentParser):
     # Debug and runtime mode arguments
     parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
-    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
@@ -89,6 +87,12 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
+
+    # Sandbox-related arguments
+    parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.')  # noqa: E501
+    parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.')  # noqa: E501
+    parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.')  # noqa: E501
+    parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.')  # noqa: E501
     # yapf: enable
 
 
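
The four new flags map onto the `TaskConfig` fields that `SandboxMixin` reads. A sketch of the equivalent Python wiring; the model, dataset, and image values are illustrative, and a real run still needs the usual model/API settings:

```python
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',              # illustrative
    datasets=['live_code_bench'],                  # illustrative
    use_sandbox=True,                              # --use-sandbox
    sandbox_type='docker',                         # --sandbox-type
    sandbox_config={'image': 'python:3.11-slim'},  # --sandbox-config
    sandbox_manager_config={},                     # --sandbox-manager-config
)
run_task(task)
```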
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
             datasets: list, the datasets.
             models: list, the models.
             work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-            dry_run (Optional): bool, the dry-run flag. Default to False.
             debug (Optional): bool, the debug flag. Default to False.
             reuse (Optional): str, reuse previous outputs & results. Default to None.
             generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
             cmd_str = f'python -m run_oc ' \
                       f'--models {" ".join(self.args.models)} ' \
                       f'--datasets {" ".join(self.args.datasets)} ' \
-                      f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                       f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
 
         elif cmd_mode == CmdMode.SCRIPT:
evalscope/backend/rag_eval/utils/embedding.py
CHANGED
@@ -164,6 +164,13 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
             automodel_args=self.model_kwargs,
         )
+        self.tokenizer = self.model.tokenizer
+        # set pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+            self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
         self.supported_encode_params = get_supported_params(self.model.predict)
 
     def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
@@ -189,6 +196,7 @@ class APIEmbeddingModel(BaseModel):
         self.openai_api_base = kwargs.get('api_base')
         self.openai_api_key = kwargs.get('api_key')
         self.dimensions = kwargs.get('dimensions')
+        self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
         self.framework = ['API']
 
         self.model = OpenAIEmbeddings(
@@ -196,7 +204,7 @@ class APIEmbeddingModel(BaseModel):
             openai_api_base=self.openai_api_base,
             openai_api_key=self.openai_api_key,
             dimensions=self.dimensions,
-            check_embedding_ctx_length=
+            check_embedding_ctx_length=self.check_embedding_ctx_length,
         )
 
         super().__init__(model_name_or_path=self.model_name, **kwargs)