evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/api/mixin/sandbox_mixin.py
ADDED
@@ -0,0 +1,182 @@
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from ms_enclave.sandbox.manager import SandboxManager
+
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class SandboxMixin:
+    """Sandbox mixin for sandboxed code execution."""
+
+    def __init__(self, task_config: 'TaskConfig'):
+        self._task_config = task_config
+
+        self._manager: Optional['SandboxManager'] = None
+        """Sandbox manager instance."""
+
+        self._sandbox_id: Optional[str] = None
+        """Sandbox ID."""
+
+        # Lazy init state
+        self._initialized: bool = False
+
+        # NOTE: Initialization is deferred.
+        super().__init__()
+
+    async def _async_init(self):
+        """Async initialization helper."""
+        await self.init_sandbox_manager_async()
+        await self.init_sandbox_async()
+
+    @property
+    def use_sandbox(self) -> bool:
+        """
+        Return whether to use sandbox for the benchmark.
+        """
+        if not self._task_config:
+            return False
+        else:
+            return self._task_config.use_sandbox
+
+    @property
+    def sandbox_manager(self) -> Optional['SandboxManager']:
+        """Get the sandbox manager instance."""
+        return self._manager
+
+    @property
+    def sandbox_id(self) -> Optional[str]:
+        """Get the sandbox ID."""
+        return self._sandbox_id
+
+    @thread_safe
+    def ensure_sandbox_ready(self) -> bool:
+        """
+        Ensure the sandbox loop, manager, and sandbox instance are initialized.
+        This method is thread-safe and idempotent.
+        """
+        if not self.use_sandbox:
+            return False
+
+        if self._initialized and self._manager and self._sandbox_id:
+            return True
+
+        # Initialize manager and sandbox using the class-level runner
+        AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+        AsyncioLoopRunner.run(self.init_sandbox_async())
+
+        self._initialized = True
+        return True
+
+    async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager asynchronously."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.manager import HttpSandboxManager, LocalSandboxManager
+
+        manager_config = self._task_config.sandbox_manager_config or {}
+        if manager_config.get('base_url'):
+            # Remote manager
+            self._manager = HttpSandboxManager(**manager_config)
+        else:
+            # Local manager
+            self._manager = LocalSandboxManager(**manager_config)
+
+        await self._manager.start()
+        logger.info('Sandbox manager initialized.')
+        return self._manager
+
+    def init_sandbox_manager(self) -> Optional['SandboxManager']:
+        """Initialize the sandbox manager."""
+        if self._manager is not None:
+            return self._manager
+
+        if not self.use_sandbox:
+            return None
+
+        return AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+
+    async def init_sandbox_async(self) -> Optional[str]:
+        """Initialize the sandbox instance asynchronously."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        from ms_enclave.sandbox.model import DockerSandboxConfig, SandboxType
+
+        sandbox_config = self._task_config.sandbox_config or DockerSandboxConfig(
+            image='python:3.11-slim', tools_config={
+                'shell_executor': {},
+                'python_executor': {}
+            }
+        )
+        sandbox_type = self._task_config.sandbox_type or SandboxType.DOCKER
+
+        self._sandbox_id = await self._manager.create_sandbox(sandbox_type=sandbox_type, config=sandbox_config)
+
+        sandbox_info = await self._manager.get_sandbox_info(self._sandbox_id)
+
+        logger.info(f'Sandbox of type {sandbox_type} initialized. Info: {sandbox_info.model_dump(exclude_none=True)}')
+        return self._sandbox_id
+
+    def init_sandbox(self) -> Optional[str]:
+        """Initialize the sandbox instance."""
+        if self._sandbox_id is not None:
+            return self._sandbox_id
+
+        if not self.use_sandbox:
+            return None
+
+        return AsyncioLoopRunner.run(self.init_sandbox_async())
+
+    def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
+        """Execute code in the sandbox."""
+        # Lazy, thread-safe initialization
+        if not self.ensure_sandbox_ready():
+            logger.warning('Sandbox is not initialized.')
+            return {'error': 'Sandbox is not initialized.'}
+
+        from ms_enclave.sandbox.model import ExecutionStatus, ToolResult
+
+        async def _execute_async():
+            if language.lower() == 'python':
+                tool_name = 'python_executor'
+                parameters = {'code': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            elif language.lower() == 'shell':
+                tool_name = 'shell_executor'
+                parameters = {'command': code, 'timeout': timeout}
+                result = await self._manager.execute_tool(self._sandbox_id, tool_name, parameters)
+            else:
+                logger.warning(f"Unsupported language: {language}. Supported languages are 'python' and 'shell'.")
+                result = ToolResult(
+                    status=ExecutionStatus.ERROR,
+                    tool_name='code_executor',
+                    output=f"Unsupported language: {language}. Supported languages are 'python' and 'shell'."
+                )
+            return result
+
+        # Execute in background loop via class-level runner
+        result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
+        return result.model_dump(exclude_none=True)
+
+    def sandbox_finalize(self, *args, **kwargs):
+        """Finalize the sandbox manager."""
+        if self._manager:
+            try:
+                # Stop the manager but keep the shared loop alive
+                AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
+                logger.info('Sandbox manager finalized.')
+            except Exception as e:
+                logger.warning(f'Error finalizing sandbox manager: {e}')
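
The new `SandboxMixin` gives adapters lazy, thread-safe access to an `ms_enclave` sandbox. A minimal usage sketch follows; the adapter class and the commented-out wiring are illustrative, not part of the diff, and only `execute_code_in_sandbox` and `sandbox_finalize` come from the file above:

```python
from evalscope.api.mixin.sandbox_mixin import SandboxMixin


class MyCodeAdapter(SandboxMixin):
    """Hypothetical adapter that runs model-generated code in the sandbox."""

    def run_generated_code(self, code: str) -> dict:
        # Lazily starts the manager and creates the sandbox on first use,
        # then dispatches to the 'python_executor' tool.
        return self.execute_code_in_sandbox(code, timeout=30, language='python')


# Wiring sketch: `task_config` is an evalscope TaskConfig with use_sandbox=True
# plus the sandbox_* fields (see the new CLI flags in evalscope/arguments.py
# further down).
# adapter = MyCodeAdapter(task_config)
# print(adapter.run_generated_code('print(1 + 1)'))
# adapter.sandbox_finalize()
```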
evalscope/api/model/generate_config.py
CHANGED
@@ -25,9 +25,7 @@ class ResponseSchema(BaseModel):
 
 class GenerateConfig(BaseModel):
     """Model generation options."""
-
-    max_retries: Optional[int] = Field(default=None)
-    """Maximum number of times to retry request (defaults to unlimited)."""
+    model_config = {'extra': 'allow'}
 
     timeout: Optional[int] = Field(default=None)
     """Request timeout (in seconds)."""
@@ -38,9 +36,6 @@ class GenerateConfig(BaseModel):
     stream: Optional[bool] = Field(default=None)
     """Whether to stream the response (default is model specific)."""
 
-    system_message: Optional[str] = Field(default=None)
-    """Override the default system message."""
-
     max_tokens: Optional[int] = Field(default=None)
     """The maximum number of tokens that can be generated in the completion (default is model specific)."""
 
@@ -62,6 +57,9 @@ class GenerateConfig(BaseModel):
     presence_penalty: Optional[float] = Field(default=None)
     """Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. OpenAI, Google, Grok, Groq, vLLM, and SGLang only."""
 
+    repetition_penalty: Optional[float] = Field(default=None)
+    """Exponential penalty applied to existing tokens in the generated text. 1.0 means no penalty. OpenAI, HuggingFace, and vLLM only."""
+
     logit_bias: Optional[Dict[int, float]] = Field(default=None)
     """Map token Ids to an associated bias value from -100 to 100 (e.g. "42=10,43=-10"). OpenAI, Grok, Grok, and vLLM only."""
 
@@ -113,6 +111,12 @@ class GenerateConfig(BaseModel):
     extra_body: Optional[Dict[str, Any]] = Field(default=None)
     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
 
+    extra_query: Optional[Dict[str, Any]] = Field(default=None)
+    """Extra query parameters to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+    extra_headers: Optional[Dict[str, str]] = Field(default=None)
+    """Extra headers to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
     height: Optional[int] = Field(default=None)
     """Image height for image generation model only"""
 
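
With `model_config = {'extra': 'allow'}`, `GenerateConfig` now tolerates provider-specific keys it does not declare, alongside the new `repetition_penalty`, `extra_query`, and `extra_headers` fields. A small sketch; the `enable_thinking` key and the header/query values are made-up illustrations:

```python
from evalscope.api.model.generate_config import GenerateConfig

config = GenerateConfig(
    max_tokens=512,
    repetition_penalty=1.05,                          # added in this diff
    extra_query={'api-version': '2024-06-01'},        # added in this diff
    extra_headers={'X-Request-Source': 'evalscope'},  # added in this diff
    enable_thinking=False,  # undeclared key, kept because extra='allow'
)
print(config.model_dump(exclude_none=True))
```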
evalscope/api/model/model.py
CHANGED
@@ -318,7 +318,7 @@ def get_model_with_task_config(task_config: 'TaskConfig') -> Model:
 
 @thread_safe
 def get_model(
-    model: str,
+    model: Union[str, Model, ModelAPI],
     eval_type: str,
     base_url: Optional[str] = None,
     api_key: Optional[str] = None,
@@ -346,6 +346,9 @@ def get_model(
     if isinstance(model, Model):
         return model
 
+    if isinstance(model, ModelAPI):
+        return Model(model, config, model_args)
+
     # see if we can return a memoized model instance
     # (exclude mockllm since custom_outputs is an infinite generator)
     model_cache_key: str = ''
@@ -362,7 +365,7 @@ def get_model(
 
     logger.info(
        f'Creating model {model} with eval_type={eval_type} '
-        f'base_url={base_url},
+        f'base_url={base_url}, config={config.model_dump(exclude_none=True)}, model_args={model_args}'
    )
 
    # find a matching model type
evalscope/api/tool/tool_info.py
CHANGED
@@ -1,7 +1,7 @@
 import inspect
 from dataclasses import dataclass
 from docstring_parser import Docstring, parse
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from typing import Any, Callable, Dict, List, Literal, Optional, TypeAlias, Union, get_args, get_type_hints
 
 from evalscope.utils.json_schema import JSONSchema, JSONType, json_schema, python_type_to_json_type
evalscope/app/app.py
CHANGED
@@ -6,6 +6,7 @@ import argparse
 from evalscope.utils.logger import configure_logging
 from .arguments import add_argument
 from .ui import create_app_ui
+from .utils.env_utils import setup_env
 
 
 def create_app(args: argparse.Namespace):
@@ -17,6 +18,8 @@ def create_app(args: argparse.Namespace):
     """
     configure_logging(debug=args.debug)
 
+    setup_env(args)
+
     demo = create_app_ui(args)
 
     demo.launch(
evalscope/app/ui/multi_model.py
CHANGED
@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)
 
         # Get subset choices - should be same for both models
-        subsets
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df_a.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
 
         return gr.update(choices=subsets, value=None), None
 
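
The subset selection above keeps only rows whose first category column (shown as `Cat.0` in the UI) is not `'-'`. A toy illustration of the same pandas pattern, with made-up column names and data:

```python
import pandas as pd

df = pd.DataFrame({
    'Cat.0': ['math', '-', 'code'],
    'Subset': ['algebra', 'misc', 'python'],
})
# Same .loc / .ne('-') selection as in the dashboard code above.
subsets = sorted(df.loc[df['Cat.0'].ne('-'), 'Subset'].dropna().unique().tolist())
print(subsets)  # ['algebra', 'python']
```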
evalscope/app/ui/single_model.py
CHANGED
@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
-        report_df = get_data_frame(report_list=report_list)
+        report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
         analysis = get_report_analysis(report_list, dataset_name)
         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
-        subsets
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
+
         logger.debug(f'subsets: {subsets}')
         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 
@@ -198,9 +204,9 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
 
         # Process the data for display
         input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
-        generated_md =
-        gold_md =
-        pred_md =
+        generated_md = convert_markdown_image(row['Generated'])
+        gold_md = convert_markdown_image(row['Gold'])
+        pred_md = process_model_prediction(row['Pred'])
         score_md = process_json_content(row['Score'])
         nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
 
evalscope/app/utils/data_utils.py
CHANGED
@@ -2,7 +2,6 @@
 Data loading and processing utilities for the Evalscope dashboard.
 """
 import glob
-import numpy as np
 import os
 import pandas as pd
 from typing import Any, Dict, List, Union
@@ -160,17 +159,19 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
         if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
             continue
 
-        prediction = sample_score.score.prediction
-        target = review_result.target
-        extracted_prediction = sample_score.score.extracted_prediction
         score = sample_score.score
+        metadata = sample_score.sample_metadata
+        prediction = score.prediction
+        target = review_result.target
+        extracted_prediction = score.extracted_prediction
         raw_d = {
             'Index': str(review_result.index),
             'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
-            'Metadata':
-            'Generated': prediction
+            'Metadata': metadata,
+            'Generated': prediction or '',  # Ensure no None value
             'Gold': target,
-            'Pred': extracted_prediction
+            'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+            or '',  # Ensure no None value
             'Score': score.model_dump(exclude_none=True),
             'NScore': normalize_score(score.main_value)
         }
evalscope/app/utils/env_utils.py
ADDED
@@ -0,0 +1,12 @@
+# flake8: noqa
+import os
+
+
+def setup_env(args):
+    compat_dsw_gradio(args)
+
+
+def compat_dsw_gradio(args) -> None:
+    if ('JUPYTER_NAME' in os.environ) and ('dsw-'
+                                           in os.environ['JUPYTER_NAME']) and ('GRADIO_ROOT_PATH' not in os.environ):
+        os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"
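
`compat_dsw_gradio` appears to target PAI-DSW-style notebooks, where `JUPYTER_NAME` starts with `dsw-`, and only fires when `GRADIO_ROOT_PATH` is not already set. A worked illustration with made-up values:

```python
import os

os.environ['JUPYTER_NAME'] = 'dsw-123456'  # hypothetical DSW instance name
server_port = 7860                         # hypothetical Gradio port

# Same f-string that compat_dsw_gradio builds:
root_path = f"/{os.environ['JUPYTER_NAME']}/proxy/{server_port}"
print(root_path)  # /dsw-123456/proxy/7860
```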
evalscope/app/utils/text_utils.py
CHANGED
@@ -2,11 +2,9 @@
 Text processing utilities for the Evalscope dashboard.
 """
 import json
-import numpy as np
 import os
-import pandas as pd
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from evalscope.utils.logger import get_logger
 from ..constants import LATEX_DELIMITERS
@@ -14,15 +12,19 @@ from ..constants import LATEX_DELIMITERS
 logger = get_logger()
 
 
-def convert_markdown_image(text):
-    if
-
-
-
-        text = os.path.abspath(text)
-        image_tag = f''
-        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+def convert_markdown_image(text: str):
+    if text.startswith('data:image'):
+        # Convert base64 image data to a markdown image tag
+        image_tag = f''
+        logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {image_tag[:40]}...')
         return image_tag
+    elif os.path.isfile(text):
+        # Convert the image path to a markdown image tag
+        if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+            text = os.path.abspath(text)
+            image_tag = f''
+            logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+            return image_tag
     return text
 
 
@@ -85,7 +87,7 @@ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     return result
 
 
-def process_model_prediction(item: Any, max_length: int =
+def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
     if isinstance(item, (dict, list)):
         result = json.dumps(item, ensure_ascii=False, indent=2)
         result = f'```json\n{result}\n```'
evalscope/app/utils/visualization.py
CHANGED
@@ -18,7 +18,7 @@ logger = get_logger()
 def plot_single_report_scores(df: pd.DataFrame):
     if df is None:
         return None
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
 
     width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     df = get_data_frame(report_list=report_list, flatten_metrics=False)
     categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
 
     plot = px.sunburst(
evalscope/arguments.py
CHANGED
@@ -2,7 +2,7 @@
 import argparse
 import json
 
-from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
+from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask
 
 
 class ParseStrArgsAction(argparse.Action):
@@ -60,8 +60,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501
 
     # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
@@ -77,7 +76,6 @@ def add_argument(parser: argparse.ArgumentParser):
     # Debug and runtime mode arguments
     parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
-    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
@@ -89,6 +87,12 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
     parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.')  # noqa: E501
+
+    # Sandbox-related arguments
+    parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.')  # noqa: E501
+    parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.')  # noqa: E501
+    parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.')  # noqa: E501
+    parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.')  # noqa: E501
     # yapf: enable
 
 
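
The four new flags map onto the `TaskConfig` fields that `SandboxMixin` reads. A sketch of the equivalent Python wiring; the model, dataset, and image values are illustrative, and a real run still needs the usual model/API settings:

```python
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',              # illustrative
    datasets=['live_code_bench'],                  # illustrative
    use_sandbox=True,                              # --use-sandbox
    sandbox_type='docker',                         # --sandbox-type
    sandbox_config={'image': 'python:3.11-slim'},  # --sandbox-config
    sandbox_manager_config={},                     # --sandbox-manager-config
)
run_task(task)
```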
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
             datasets: list, the datasets.
             models: list, the models.
             work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-            dry_run (Optional): bool, the dry-run flag. Default to False.
             debug (Optional): bool, the debug flag. Default to False.
             reuse (Optional): str, reuse previous outputs & results. Default to None.
             generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
             cmd_str = f'python -m run_oc ' \
                       f'--models {" ".join(self.args.models)} ' \
                       f'--datasets {" ".join(self.args.datasets)} ' \
-                      f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                       f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
 
         elif cmd_mode == CmdMode.SCRIPT:
evalscope/backend/rag_eval/utils/embedding.py
CHANGED
@@ -164,6 +164,13 @@ class CrossEncoderModel(BaseModel):
             max_length=self.max_seq_length,
             automodel_args=self.model_kwargs,
         )
+        self.tokenizer = self.model.tokenizer
+        # set pad token
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+        if ('pad_token_id' not in self.model.config) or (self.model.config.pad_token_id is None):
+            self.model.config.update({'pad_token_id': self.tokenizer.eos_token_id})
+
         self.supported_encode_params = get_supported_params(self.model.predict)
 
     def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
@@ -189,6 +196,7 @@ class APIEmbeddingModel(BaseModel):
         self.openai_api_base = kwargs.get('api_base')
         self.openai_api_key = kwargs.get('api_key')
         self.dimensions = kwargs.get('dimensions')
+        self.check_embedding_ctx_length = kwargs.get('check_embedding_ctx_length', False)
         self.framework = ['API']
 
         self.model = OpenAIEmbeddings(
@@ -196,7 +204,7 @@ class APIEmbeddingModel(BaseModel):
             openai_api_base=self.openai_api_base,
             openai_api_key=self.openai_api_key,
             dimensions=self.dimensions,
-            check_embedding_ctx_length=
+            check_embedding_ctx_length=self.check_embedding_ctx_length,
         )
 
         super().__init__(model_name_or_path=self.model_name, **kwargs)