evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/app/ui/multi_model.py
CHANGED
@@ -10,8 +10,14 @@ from typing import TYPE_CHECKING
 from evalscope.report import ReportKey, get_data_frame
 from evalscope.utils.logger import get_logger
 from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
-from ..utils.data_utils import (
-
+from ..utils.data_utils import (
+    get_acc_report_df,
+    get_compare_report_df,
+    get_model_prediction,
+    get_single_dataset_df,
+    load_multi_report,
+    load_single_report,
+)
 from ..utils.localization import get_multi_model_locale
 from ..utils.text_utils import convert_markdown_image, process_model_prediction
 from ..utils.visualization import plot_multi_report_radar
@@ -62,7 +68,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
                 label=locale_dict.get('answer_mode'),
                 choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'],
                 value='All',
-                interactive=True
+                interactive=True
+            )
             score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

         data_comparison_df = gr.State(None)
@@ -75,7 +82,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
                 comparison_counts = gr.Markdown('')
             with gr.Column():
                 page_number = gr.Number(
-                    value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+                    value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+                )

         # Input and Gold answer sections remain at the top
         with gr.Row(variant='panel'):
@@ -133,7 +141,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):

     @multi_report_name.change(
         inputs=[sidebar.root_path, multi_report_name],
-        outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]
+        outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]
+    )
     def update_multi_report_data(root_path, multi_report_names):
         if not multi_report_names:
             return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()
@@ -147,13 +156,14 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         model_choices = multi_report_names

         return report_list, report_radar_plot, styler, gr.update(
-            choices=model_choices, value=model_choices[0]
-
+            choices=model_choices, value=model_choices[0]
+        ), gr.update(choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)

     @gr.on(
         triggers=[model_a_select.change, model_b_select.change],
         inputs=[sidebar.root_path, model_a_select, model_b_select],
-        outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]
+        outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]
+    )
     def update_selected_models(root_path, model_a, model_b):
         if not model_a or not model_b:
             return gr.skip()
@@ -172,13 +182,16 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
         model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]

-        return (
-
+        return (
+            model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
+            gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None)
+        )

     @gr.on(
         triggers=[dataset_radio.change],
         inputs=[dataset_radio, model_a_report, model_b_report],
-        outputs=[subset_select, data_comparison_df]
+        outputs=[subset_select, data_comparison_df]
+    )
     def update_dataset_comparison(dataset_name, model_a_report, model_b_report):
         if not dataset_name or model_a_report is None or model_b_report is None:
             return gr.skip()
@@ -198,7 +211,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[subset_select.change],
         inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
-        outputs=[data_comparison_df, page_number]
+        outputs=[data_comparison_df, page_number]
+    )
     def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
         if not subset_name or not dataset_name:
             return gr.skip()
@@ -230,7 +244,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
         inputs=[data_comparison_df, answer_mode_radio, score_threshold],
-        outputs=[filtered_comparison_df, page_number, comparison_counts]
+        outputs=[filtered_comparison_df, page_number, comparison_counts]
+    )
     def filter_comparison_data(comparison_df, answer_mode, score_threshold):
         if comparison_df is None:
             return None, gr.update(value=1, maximum=1), ''
@@ -256,13 +271,19 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         # Count statistics
         pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
         pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
-        pass_both_count = len(
-
-
-
-
-
-
+        pass_both_count = len(
+            comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+                          & (comparison_df['B_NScore'] >= score_threshold)]
+        )
+        fail_both_count = len(
+            comparison_df[(comparison_df['A_NScore'] < score_threshold)
+                          & (comparison_df['B_NScore'] < score_threshold)]
+        )
+
+        counts_text = (
+            f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
+            f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}'
+        )

         max_page = max(1, len(filtered_df))

@@ -277,9 +298,11 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         outputs=[
             input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
             model_b_pred, model_b_score, model_b_nscore
-        ]
-
-
+        ]
+    )
+    def update_comparison_display(
+        filtered_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name_val, model_b_name_val
+    ):
         if filtered_df is None or len(filtered_df) == 0:
             return '', '', '', '', '', '', '', '', '', ''

@@ -317,7 +340,9 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         else:
             b_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{b_nscore_val}</div>"

-        return (
-
+        return (
+            input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
+            b_score_md, b_nscore_html
+        )

     return MultiModelComponents(multi_report_name=multi_report_name)
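A note on the reworked counting block in filter_comparison_data above: the pass/fail tallies are plain boolean masks over the two normalized-score columns. The standalone sketch below reproduces that logic on a toy DataFrame; the column names come from the diff, while the sample values and threshold are invented:

    import pandas as pd

    # Toy comparison table with normalized scores for models A and B (illustrative only).
    comparison_df = pd.DataFrame({
        'A_NScore': [1.0, 0.0, 1.0, 0.2],
        'B_NScore': [1.0, 0.0, 0.3, 0.9],
    })
    score_threshold = 0.99

    # Same mask logic as the diff: a sample "passes" a model when its NScore meets the threshold.
    pass_a = comparison_df['A_NScore'] >= score_threshold
    pass_b = comparison_df['B_NScore'] >= score_threshold

    pass_both_count = len(comparison_df[pass_a & pass_b])
    fail_both_count = len(comparison_df[~pass_a & ~pass_b])
    print(pass_both_count, fail_both_count)  # 1 2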
evalscope/app/ui/single_model.py
CHANGED
@@ -10,8 +10,13 @@ from typing import TYPE_CHECKING
 from evalscope.report import Report, ReportKey, get_data_frame
 from evalscope.utils.logger import get_logger
 from ..constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
-from ..utils.data_utils import (
-
+from ..utils.data_utils import (
+    get_acc_report_df,
+    get_model_prediction,
+    get_report_analysis,
+    get_single_dataset_df,
+    load_single_report,
+)
 from ..utils.localization import get_single_model_locale
 from ..utils.text_utils import convert_markdown_image, process_json_content, process_model_prediction
 from ..utils.visualization import plot_single_dataset_scores, plot_single_report_scores, plot_single_report_sunburst
@@ -63,7 +68,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):

         with gr.Row():
             answer_mode_radio = gr.Radio(
-                label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True
+                label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True
+            )
             score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

         data_review_df = gr.State(None)
@@ -76,7 +82,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
                 answer_mode_counts = gr.Markdown('')
             with gr.Column():
                 page_number = gr.Number(
-                    value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+                    value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+                )

         # show data review table
         with gr.Row(variant='panel'):
@@ -98,14 +105,15 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
         with gr.Row(variant='panel'):
             with gr.Column():
                 gr.Markdown('### *Input*')
-                input_text = gr.
+                input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS)
             with gr.Column():
                 gr.Markdown('### *Generated*')
                 generated_text = gr.Markdown('', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS)

     @report_name.change(
         inputs=[sidebar.root_path, report_name],
-        outputs=[report_list, task_config, dataset_radio, work_dir, model_name]
+        outputs=[report_list, task_config, dataset_radio, work_dir, model_name]
+    )
     def update_single_report_data(root_path, report_name):
         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
         work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])
@@ -122,7 +130,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[dataset_radio.change, report_list.change],
         inputs=[dataset_radio, report_list],
-        outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis]
+        outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis]
+    )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
         report_df = get_data_frame(report_list=report_list)
@@ -136,7 +145,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[subset_select.change],
         inputs=[work_dir, model_name, dataset_radio, subset_select],
-        outputs=[data_review_df, page_number]
+        outputs=[data_review_df, page_number]
+    )
     def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
         if not subset_name:
             return gr.skip()
@@ -146,7 +156,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
         inputs=[data_review_df, answer_mode_radio, score_threshold],
-        outputs=[filtered_review_df, page_number, answer_mode_counts]
+        outputs=[filtered_review_df, page_number, answer_mode_counts]
+    )
     def filter_data(data_review_df, answer_mode, score_threshold):
         if data_review_df is None:
             return None, gr.update(value=1, maximum=1), ''
@@ -172,7 +183,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[filtered_review_df.change, page_number.change],
         inputs=[filtered_review_df, page_number, score_threshold],
-        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore]
+        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore]
+    )
     def update_table_components(filtered_df, page_number, score_threshold):
         if filtered_df is None or len(filtered_df) == 0:
             return '', '', '', '', '', ''
@@ -185,10 +197,10 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
         row = filtered_df.iloc[start]

         # Process the data for display
-        input_md =
-        generated_md =
-        gold_md =
-        pred_md =
+        input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
+        generated_md = convert_markdown_image(row['Generated'])
+        gold_md = convert_markdown_image(row['Gold'])
+        pred_md = process_model_prediction(row['Pred'])
         score_md = process_json_content(row['Score'])
         nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0

evalscope/app/utils/data_utils.py
CHANGED
@@ -2,14 +2,14 @@
 Data loading and processing utilities for the Evalscope dashboard.
 """
 import glob
-import numpy as np
 import os
 import pandas as pd
 from typing import Any, Dict, List, Union

+from evalscope.api.evaluator import CacheManager, ReviewResult
 from evalscope.constants import DataCollection
 from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
-from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
 from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN

@@ -39,7 +39,8 @@ def scan_for_report_folders(root_path):
                 datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
             datasets = DATASET_TOKEN.join(datasets)
             reports.append(
-                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}'
+                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}'
+            )

     reports = sorted(reports, reverse=True)
     logger.debug(f'reports: {reports}')
@@ -61,7 +62,8 @@ def load_single_report(root_path: str, report_name: str):
     config_files = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))
     if not config_files:
         raise FileNotFoundError(
-            f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}'
+            f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}'
+        )
     task_cfg_path = config_files[0]
     task_cfg = yaml_to_dict(task_cfg_path)
     return report_list, datasets, task_cfg
@@ -134,31 +136,45 @@ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:


 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
-
-
-
-
-
+    # Load review cache
+    outputs = OutputsStructure(work_dir, is_make=False)
+    cache_manager = CacheManager(outputs, model_name, dataset_name)
+    if dataset_name == DataCollection.NAME:
+        review_cache_path = cache_manager.get_review_cache_path('default')
+    else:
+        review_cache_path = cache_manager.get_review_cache_path(subset_name)
+    logger.debug(f'review_path: {review_cache_path}')
+    review_caches = jsonl_to_list(review_cache_path)

     ds = []
-    for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    for cache in review_caches:
+        review_result = ReviewResult.model_validate(cache)
+        sample_score = review_result.sample_score
+
+        if dataset_name == DataCollection.NAME:
+            # Filter subset name
+            collection_info = sample_score.sample_metadata[DataCollection.INFO]
+            sample_dataset_name = collection_info.get('dataset_name', 'default')
+            sample_subset_name = collection_info.get('subset_name', 'default')
+            if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
+                continue
+
+        score = sample_score.score
+        metadata = sample_score.sample_metadata
+        prediction = score.prediction
+        target = review_result.target
+        extracted_prediction = score.extracted_prediction
+        raw_d = {
+            'Index': str(review_result.index),
+            'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
+            'Metadata': metadata,
+            'Generated': prediction,
+            'Gold': target,
+            'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
+            'Score': score.model_dump(exclude_none=True),
+            'NScore': normalize_score(score.main_value)
+        }
+        ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
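get_model_prediction above now resolves the per-subset review cache through CacheManager and validates each JSONL record as a ReviewResult before assembling the table. A hedged usage sketch, assuming an existing evalscope outputs directory; the work_dir, model, dataset and subset names below are placeholders:

    from evalscope.app.utils.data_utils import get_model_prediction

    # Placeholders: point work_dir at a real evalscope output folder that already
    # contains review caches for the given model/dataset/subset.
    df = get_model_prediction(
        work_dir='outputs/20250101_000000',
        model_name='my-model',
        dataset_name='gsm8k',
        subset_name='main',
    )
    print(df[['Index', 'Gold', 'Pred', 'NScore']].head())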
evalscope/app/utils/env_utils.py
ADDED
@@ -0,0 +1,12 @@
+# flake8: noqa
+import os
+
+
+def setup_env(args):
+    compat_dsw_gradio(args)
+
+
+def compat_dsw_gradio(args) -> None:
+    if ('JUPYTER_NAME' in os.environ) and ('dsw-'
+                                           in os.environ['JUPYTER_NAME']) and ('GRADIO_ROOT_PATH' not in os.environ):
+        os.environ['GRADIO_ROOT_PATH'] = f"/{os.environ['JUPYTER_NAME']}/proxy/{args.server_port}"
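The new env_utils helper only matters inside a DSW notebook: when JUPYTER_NAME looks like 'dsw-*' and GRADIO_ROOT_PATH is unset, it derives the Gradio root path from the notebook name and the app's server port. A minimal sketch of exercising it directly, with a stand-in args object (the real caller is the app entrypoint, which is not shown in this diff):

    import os
    from types import SimpleNamespace

    from evalscope.app.utils.env_utils import setup_env

    # Simulate a DSW notebook environment; outside DSW the call is a no-op.
    os.environ.setdefault('JUPYTER_NAME', 'dsw-12345')

    args = SimpleNamespace(server_port=7860)  # stand-in for the parsed CLI args
    setup_env(args)
    print(os.environ.get('GRADIO_ROOT_PATH'))  # e.g. /dsw-12345/proxy/7860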
evalscope/app/utils/text_utils.py
CHANGED
@@ -2,11 +2,9 @@
 Text processing utilities for the Evalscope dashboard.
 """
 import json
-import numpy as np
 import os
-import pandas as pd
 import re
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from evalscope.utils.logger import get_logger
 from ..constants import LATEX_DELIMITERS
@@ -14,15 +12,19 @@ from ..constants import LATEX_DELIMITERS
 logger = get_logger()


-def convert_markdown_image(text):
-    if
-
-
-
-        text = os.path.abspath(text)
-        image_tag = f''
-        logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+def convert_markdown_image(text: str):
+    if text.startswith('data:image'):
+        # Convert base64 image data to a markdown image tag
+        image_tag = f''
+        logger.debug(f'Converting base64 image data to markdown: {text[:30]}... -> {image_tag[:40]}...')
         return image_tag
+    elif os.path.isfile(text):
+        # Convert the image path to a markdown image tag
+        if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+            text = os.path.abspath(text)
+            image_tag = f''
+            logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+            return image_tag
     return text


@@ -85,7 +87,7 @@ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
     return result


-def process_model_prediction(item: Any, max_length: int =
+def process_model_prediction(item: Any, max_length: Optional[int] = None) -> str:
     if isinstance(item, (dict, list)):
         result = json.dumps(item, ensure_ascii=False, indent=2)
         result = f'```json\n{result}\n```'
@@ -109,8 +111,6 @@ def process_json_content(content: Any) -> str:
     Returns:
         str: The processed content formatted for markdown display.
     """
-    if isinstance(content, (np.bool_, np.int_, np.float_)):
-        content = str(content)

     if isinstance(content, str):
         content = {'content': content}
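convert_markdown_image above now handles three inputs: base64 data URIs, existing .png/.jpg/.jpeg files (turned into absolute paths), and anything else returned unchanged. The exact tag string it emits is not visible in this rendering, so the sketch below only mirrors the dispatch order and assumes a plain markdown image tag:

    import os


    def to_markdown_image(text: str) -> str:
        """Sketch of the same branching; the real helper may build a different tag."""
        if text.startswith('data:image'):
            # Base64 data URIs can be embedded directly in an image tag.
            return f'![image]({text})'
        if os.path.isfile(text) and text.endswith(('.png', '.jpg', '.jpeg')):
            # Local image files are referenced by absolute path.
            return f'![image]({os.path.abspath(text)})'
        return text  # non-image content passes through untouched


    print(to_markdown_image('plain text answer'))  # -> plain text answer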
evalscope/app/utils/visualization.py
CHANGED
@@ -47,7 +47,8 @@ def plot_single_report_sunburst(report_list: List[Report]):
         color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
         color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
         template=PLOTLY_THEME,
-        maxdepth=4
+        maxdepth=4
+    )
     plot.update_traces(insidetextorientation='radial')
     plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
     return plot
@@ -61,7 +62,8 @@ def plot_single_dataset_scores(df: pd.DataFrame):
         y=df[ReportKey.score],
         color=df[ReportKey.subset_name],
         text=df[ReportKey.score],
-        barmode='group'
+        barmode='group'
+    )

     width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
     plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')
@@ -82,10 +84,13 @@ def plot_multi_report_radar(df: pd.DataFrame):
                 r=common_group[ReportKey.score],
                 theta=common_group[ReportKey.dataset_name],
                 name=model_name,
-                fill='toself'
+                fill='toself'
+            )
+        )

     fig.update_layout(
         template=PLOTLY_THEME,
         polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
-        margin=dict(t=20, l=20, r=20, b=20)
+        margin=dict(t=20, l=20, r=20, b=20)
+    )
     return fig
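plot_multi_report_radar keeps its Scatterpolar setup (fill='toself', radial axis pinned to [0, 1]); only closing parentheses moved in this diff. For reference, a standalone plotly sketch of that layout with invented model and dataset names:

    import plotly.graph_objects as go

    fig = go.Figure()
    # One trace per model; scores are normalized to [0, 1] (dummy values).
    fig.add_trace(go.Scatterpolar(r=[0.8, 0.6, 0.9], theta=['gsm8k', 'mmlu', 'arc'], name='model-a', fill='toself'))
    fig.add_trace(go.Scatterpolar(r=[0.7, 0.7, 0.5], theta=['gsm8k', 'mmlu', 'arc'], name='model-b', fill='toself'))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        margin=dict(t=20, l=20, r=20, b=20),
    )
    # fig.show()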
evalscope/arguments.py
CHANGED
@@ -1,7 +1,8 @@
+# flake8: noqa: E501
 import argparse
 import json

-from evalscope.constants import EvalBackend,
+from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask


 class ParseStrArgsAction(argparse.Action):
@@ -47,7 +48,6 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.')  # noqa: E501

     # Template-related arguments
-    parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
     parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.')  # noqa: E501

     # Dataset-related arguments
@@ -60,30 +60,27 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.')  # noqa: E501

     # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
-    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
-                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.')  # noqa: E501

     # Cache and working directory arguments
-    parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
     parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+    parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')
     parser.add_argument('--work-dir', type=str, help='The root cache dir.')

     # Debug and runtime mode arguments
     parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
     parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.')  # noqa: E501
-    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
     parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
     parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
-    parser.add_argument('--stream', action='store_true', default=
+    parser.add_argument('--stream', action='store_true', default=None, help='Stream mode.')  # noqa: E501

     # LLMJudge arguments
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
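The argument set above drops the deprecated --template-type, --mem-cache, --dry-run and --stage flags and introduces --repeats and --rerun-review. A quick standalone check of the two new flags, copied verbatim from the diff into a throwaway parser:

    import argparse

    # Mirror of the two new flags added in this release (not the full evalscope parser).
    parser = argparse.ArgumentParser()
    parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.')
    parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')

    args = parser.parse_args(['--repeats', '3', '--rerun-review'])
    print(args.repeats, args.rerun_review)  # 3 True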
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -47,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
            datasets: list, the datasets.
            models: list, the models.
            work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-           dry_run (Optional): bool, the dry-run flag. Default to False.
            debug (Optional): bool, the debug flag. Default to False.
            reuse (Optional): str, reuse previous outputs & results. Default to None.
            generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -140,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
            cmd_str = f'python -m run_oc ' \
                      f'--models {" ".join(self.args.models)} ' \
                      f'--datasets {" ".join(self.args.datasets)} ' \
-                     f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                      f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'

        elif cmd_mode == CmdMode.SCRIPT:
@@ -182,8 +180,10 @@ class OpenCompassBackendManager(BackendManager):
        else:
            valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
            if len(invalid_dataset_names) > 0:
-               logger.error(
-
+               logger.error(
+                   f'Invalid datasets: {invalid_dataset_names}, '
+                   f'refer to the following list to get proper dataset name: {dataset_names_all}'
+               )
            assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                f'To get the valid datasets, please refer to {dataset_names_all}'

@@ -252,7 +252,8 @@ if __name__ == '__main__':
            'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
        }],
        'limit': 5
-   }
+   }
+   )
    all_datasets = OpenCompassBackendManager.list_datasets()
    print(f'all_datasets: {all_datasets}')
    oc_backend_manager.run()
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
CHANGED
@@ -100,16 +100,16 @@ class DatasetWrapper(TorchDataset):

 def get_dataset_default_task(dataset):
     if dataset in (
-
-
-
-
-
-
-
-
-
-
+        'custom',
+        'muge',
+        'flickr30k',
+        'flickr8k',
+        'mscoco_captions',
+        'mscoco_captions2017',
+        'multilingual_mscoco_captions',
+        'flickr30k-200',
+        'crossmodal3600',
+        'xtd200',
     ):
         return 'zeroshot_retrieval'
     else:
evalscope/backend/rag_eval/clip_benchmark/task_template.py
CHANGED
@@ -4,8 +4,11 @@ import torch
 from itertools import product

 from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
-from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
-
+from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+    build_dataset,
+    get_dataloader,
+    get_dataset_default_task,
+)
 from evalscope.backend.rag_eval.clip_benchmark.tasks import image_caption, zeroshot_classification, zeroshot_retrieval
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.utils.logger import get_logger
@@ -66,8 +69,9 @@ def evaluate(args: Arguments):
         if verbose:
             logger.info(f'Zero-shot templates: {zeroshot_templates}')
         classnames = dataset.classes if hasattr(dataset, 'classes') else None
-        assert (
-
+        assert (
+            zeroshot_templates is not None and classnames is not None
+        ), 'Dataset does not support classification'
         metrics = zeroshot_classification.evaluate(
             model,
             dataloader,