evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/app/ui/multi_model.py
CHANGED
@@ -10,8 +10,14 @@ from typing import TYPE_CHECKING
 from evalscope.report import ReportKey, get_data_frame
 from evalscope.utils.logger import get_logger
 from ..constants import LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
-from ..utils.data_utils import (
-…
+from ..utils.data_utils import (
+    get_acc_report_df,
+    get_compare_report_df,
+    get_model_prediction,
+    get_single_dataset_df,
+    load_multi_report,
+    load_single_report,
+)
 from ..utils.localization import get_multi_model_locale
 from ..utils.text_utils import convert_markdown_image, process_model_prediction
 from ..utils.visualization import plot_multi_report_radar

@@ -62,7 +68,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
             label=locale_dict.get('answer_mode'),
             choices=['All', 'Pass A & B', 'Fail A & B', 'Pass A, Fail B', 'Fail A, Pass B'],
             value='All',
-            interactive=True
+            interactive=True
+        )
         score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

     data_comparison_df = gr.State(None)

@@ -75,7 +82,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
            comparison_counts = gr.Markdown('')
        with gr.Column():
            page_number = gr.Number(
-                value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+                value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+            )

     # Input and Gold answer sections remain at the top
     with gr.Row(variant='panel'):

@@ -133,7 +141,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):

     @multi_report_name.change(
         inputs=[sidebar.root_path, multi_report_name],
-        outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]
+        outputs=[report_list, radar_plot, score_table, model_a_select, model_b_select]
+    )
     def update_multi_report_data(root_path, multi_report_names):
         if not multi_report_names:
             return gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip()

@@ -147,13 +156,14 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         model_choices = multi_report_names

         return report_list, report_radar_plot, styler, gr.update(
-            choices=model_choices, value=model_choices[0]
-…
+            choices=model_choices, value=model_choices[0]
+        ), gr.update(choices=model_choices, value=model_choices[1] if len(model_choices) > 1 else None)

     @gr.on(
         triggers=[model_a_select.change, model_b_select.change],
         inputs=[sidebar.root_path, model_a_select, model_b_select],
-        outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]
+        outputs=[model_a_report, model_b_report, model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio]
+    )
     def update_selected_models(root_path, model_a, model_b):
         if not model_a or not model_b:
             return gr.skip()

@@ -172,13 +182,16 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         model_a_name = model_a.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]
         model_b_name = model_b.split(REPORT_TOKEN)[1].split(MODEL_TOKEN)[0]

-        return (
-…
+        return (
+            model_a_reports, model_b_reports, model_a_dir, model_b_dir, model_a_name, model_b_name,
+            gr.update(choices=common_datasets, value=common_datasets[0] if common_datasets else None)
+        )

     @gr.on(
         triggers=[dataset_radio.change],
         inputs=[dataset_radio, model_a_report, model_b_report],
-        outputs=[subset_select, data_comparison_df]
+        outputs=[subset_select, data_comparison_df]
+    )
     def update_dataset_comparison(dataset_name, model_a_report, model_b_report):
         if not dataset_name or model_a_report is None or model_b_report is None:
             return gr.skip()

@@ -198,7 +211,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[subset_select.change],
         inputs=[model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_radio, subset_select],
-        outputs=[data_comparison_df, page_number]
+        outputs=[data_comparison_df, page_number]
+    )
     def update_comparison_data(model_a_dir, model_b_dir, model_a_name, model_b_name, dataset_name, subset_name):
         if not subset_name or not dataset_name:
             return gr.skip()

@@ -230,7 +244,8 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[data_comparison_df.change, answer_mode_radio.change, score_threshold.change],
         inputs=[data_comparison_df, answer_mode_radio, score_threshold],
-        outputs=[filtered_comparison_df, page_number, comparison_counts]
+        outputs=[filtered_comparison_df, page_number, comparison_counts]
+    )
     def filter_comparison_data(comparison_df, answer_mode, score_threshold):
         if comparison_df is None:
             return None, gr.update(value=1, maximum=1), ''

@@ -256,13 +271,19 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         # Count statistics
         pass_a_count = len(comparison_df[comparison_df['A_NScore'] >= score_threshold])
         pass_b_count = len(comparison_df[comparison_df['B_NScore'] >= score_threshold])
-        pass_both_count = len(
-… (6 lines truncated)
+        pass_both_count = len(
+            comparison_df[(comparison_df['A_NScore'] >= score_threshold)
+                          & (comparison_df['B_NScore'] >= score_threshold)]
+        )
+        fail_both_count = len(
+            comparison_df[(comparison_df['A_NScore'] < score_threshold)
+                          & (comparison_df['B_NScore'] < score_threshold)]
+        )
+
+        counts_text = (
+            f'### All: {all_count} | Pass A: {pass_a_count} | Pass B: {pass_b_count} | '
+            f'Pass Both: {pass_both_count} | Fail Both: {fail_both_count}'
+        )

         max_page = max(1, len(filtered_df))

@@ -277,9 +298,11 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         outputs=[
             input_text, gold_text, model_a_generated, model_a_pred, model_a_score, model_a_nscore, model_b_generated,
             model_b_pred, model_b_score, model_b_nscore
-        ]
-… (2 lines truncated)
+        ]
+    )
+    def update_comparison_display(
+        filtered_df, page_number, score_threshold, model_a_select, model_b_select, model_a_name_val, model_b_name_val
+    ):
         if filtered_df is None or len(filtered_df) == 0:
             return '', '', '', '', '', '', '', '', '', ''

@@ -317,7 +340,9 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         else:
             b_nscore_html = f"<div style='background-color:rgb(151, 31, 44); padding:10px;'>{b_nscore_val}</div>"

-        return (
-…
+        return (
+            input_md, gold_md, a_generated_md, a_pred_md, a_score_md, a_nscore_html, b_generated_md, b_pred_md,
+            b_score_md, b_nscore_html
+        )

     return MultiModelComponents(multi_report_name=multi_report_name)
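The answer-mode filter in the comparison tab above reduces to four boolean masks over the two models' normalized scores. A small, self-contained sketch of the same bucketing (the A_NScore/B_NScore column names and the 0.99 threshold come from the hunks above; the DataFrame itself is synthetic, for illustration only):

    import pandas as pd

    # Synthetic scores standing in for the per-sample comparison DataFrame.
    df = pd.DataFrame({'A_NScore': [1.0, 0.0, 1.0, 0.2], 'B_NScore': [1.0, 0.0, 0.3, 0.9]})
    threshold = 0.99

    pass_a = df['A_NScore'] >= threshold
    pass_b = df['B_NScore'] >= threshold
    buckets = {
        'Pass A & B': df[pass_a & pass_b],
        'Fail A & B': df[~pass_a & ~pass_b],
        'Pass A, Fail B': df[pass_a & ~pass_b],
        'Fail A, Pass B': df[~pass_a & pass_b],
    }
    print({name: len(rows) for name, rows in buckets.items()})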
evalscope/app/ui/single_model.py
CHANGED
@@ -10,8 +10,13 @@ from typing import TYPE_CHECKING
 from evalscope.report import Report, ReportKey, get_data_frame
 from evalscope.utils.logger import get_logger
 from ..constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, REPORT_TOKEN
-from ..utils.data_utils import (
-…
+from ..utils.data_utils import (
+    get_acc_report_df,
+    get_model_prediction,
+    get_report_analysis,
+    get_single_dataset_df,
+    load_single_report,
+)
 from ..utils.localization import get_single_model_locale
 from ..utils.text_utils import convert_markdown_image, process_json_content, process_model_prediction
 from ..utils.visualization import plot_single_dataset_scores, plot_single_report_scores, plot_single_report_sunburst

@@ -63,7 +68,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):

     with gr.Row():
         answer_mode_radio = gr.Radio(
-            label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True
+            label=locale_dict['answer_mode'], choices=['All', 'Pass', 'Fail'], value='All', interactive=True
+        )
         score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'], interactive=True)

     data_review_df = gr.State(None)

@@ -76,7 +82,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
            answer_mode_counts = gr.Markdown('')
        with gr.Column():
            page_number = gr.Number(
-                value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+                value=1, label=locale_dict['page'], minimum=1, maximum=1, step=1, interactive=True
+            )

     # show data review table
     with gr.Row(variant='panel'):

@@ -98,14 +105,15 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     with gr.Row(variant='panel'):
         with gr.Column():
             gr.Markdown('### *Input*')
-            input_text = gr.…
+            input_text = gr.Markdown('', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS)
         with gr.Column():
             gr.Markdown('### *Generated*')
             generated_text = gr.Markdown('', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS)

     @report_name.change(
         inputs=[sidebar.root_path, report_name],
-        outputs=[report_list, task_config, dataset_radio, work_dir, model_name]
+        outputs=[report_list, task_config, dataset_radio, work_dir, model_name]
+    )
     def update_single_report_data(root_path, report_name):
         report_list, datasets, task_cfg = load_single_report(root_path, report_name)
         work_dir = os.path.join(root_path, report_name.split(REPORT_TOKEN)[0])

@@ -122,7 +130,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[dataset_radio.change, report_list.change],
         inputs=[dataset_radio, report_list],
-        outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis]
+        outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis]
+    )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
         report_df = get_data_frame(report_list=report_list)

@@ -136,7 +145,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[subset_select.change],
         inputs=[work_dir, model_name, dataset_radio, subset_select],
-        outputs=[data_review_df, page_number]
+        outputs=[data_review_df, page_number]
+    )
     def update_single_report_subset(work_dir, model_name, dataset_name, subset_name):
         if not subset_name:
             return gr.skip()

@@ -146,7 +156,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
         inputs=[data_review_df, answer_mode_radio, score_threshold],
-        outputs=[filtered_review_df, page_number, answer_mode_counts]
+        outputs=[filtered_review_df, page_number, answer_mode_counts]
+    )
     def filter_data(data_review_df, answer_mode, score_threshold):
         if data_review_df is None:
             return None, gr.update(value=1, maximum=1), ''

@@ -172,7 +183,8 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     @gr.on(
         triggers=[filtered_review_df.change, page_number.change],
         inputs=[filtered_review_df, page_number, score_threshold],
-        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore]
+        outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore]
+    )
     def update_table_components(filtered_df, page_number, score_threshold):
         if filtered_df is None or len(filtered_df) == 0:
             return '', '', '', '', '', ''

@@ -185,7 +197,7 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
         row = filtered_df.iloc[start]

         # Process the data for display
-        input_md = …
+        input_md = row['Input'] + '\n\n' + process_model_prediction(row['Metadata'])
         generated_md = process_model_prediction(row['Generated'])
         gold_md = process_model_prediction(row['Gold'])
         pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
evalscope/app/utils/data_utils.py
CHANGED

@@ -7,9 +7,10 @@ import os
 import pandas as pd
 from typing import Any, Dict, List, Union

+from evalscope.api.evaluator import CacheManager, ReviewResult
 from evalscope.constants import DataCollection
 from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
-from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
+from evalscope.utils.io_utils import OutputsStructure, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
 from ..constants import DATASET_TOKEN, MODEL_TOKEN, REPORT_TOKEN

@@ -39,7 +40,8 @@ def scan_for_report_folders(root_path):
                 datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
             datasets = DATASET_TOKEN.join(datasets)
             reports.append(
-                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}'
+                f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}'
+            )

     reports = sorted(reports, reverse=True)
     logger.debug(f'reports: {reports}')

@@ -61,7 +63,8 @@ def load_single_report(root_path: str, report_name: str):
     config_files = glob.glob(os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR, '*.yaml'))
     if not config_files:
         raise FileNotFoundError(
-            f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}'
+            f'No configuration files found in {os.path.join(root_path, prefix, OutputsStructure.CONFIGS_DIR)}'
+        )
     task_cfg_path = config_files[0]
     task_cfg = yaml_to_dict(task_cfg_path)
     return report_list, datasets, task_cfg

@@ -134,31 +137,44 @@ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:


 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
-… (5 lines truncated)
+    # Load review cache
+    outputs = OutputsStructure(work_dir, is_make=False)
+    cache_manager = CacheManager(outputs, model_name, dataset_name)
+    if dataset_name == DataCollection.NAME:
+        review_cache_path = cache_manager.get_review_cache_path('default')
+    else:
+        review_cache_path = cache_manager.get_review_cache_path(subset_name)
+    logger.debug(f'review_path: {review_cache_path}')
+    review_caches = jsonl_to_list(review_cache_path)

     ds = []
-    for …
-… (17 lines truncated)
+    for cache in review_caches:
+        review_result = ReviewResult.model_validate(cache)
+        sample_score = review_result.sample_score
+
+        if dataset_name == DataCollection.NAME:
+            # Filter subset name
+            collection_info = sample_score.sample_metadata[DataCollection.INFO]
+            sample_dataset_name = collection_info.get('dataset_name', 'default')
+            sample_subset_name = collection_info.get('subset_name', 'default')
+            if f'{sample_dataset_name}/{sample_subset_name}' != subset_name:
+                continue
+
+        prediction = sample_score.score.prediction
+        target = review_result.target
+        extracted_prediction = sample_score.score.extracted_prediction
+        score = sample_score.score
+        raw_d = {
+            'Index': str(review_result.index),
+            'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
+            'Metadata': sample_score.sample_metadata,
+            'Generated': prediction if prediction != extracted_prediction else '*Same as Pred*',
+            'Gold': target,
+            'Pred': extracted_prediction,
+            'Score': score.model_dump(exclude_none=True),
+            'NScore': normalize_score(score.main_value)
+        }
+        ds.append(raw_d)

     df_subset = pd.DataFrame(ds)
     return df_subset
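The rewritten get_model_prediction above now reads per-sample ReviewResult records from the JSONL review cache and returns a DataFrame with Index/Input/Metadata/Generated/Gold/Pred/Score/NScore columns. A minimal usage sketch (the work_dir, model and dataset values are placeholders, not taken from this diff):

    from evalscope.app.utils.data_utils import get_model_prediction

    # Placeholder arguments; point these at a real evalscope output directory.
    df = get_model_prediction(
        work_dir='outputs/20250101_000000',
        model_name='my-model',
        dataset_name='gsm8k',
        subset_name='main',
    )

    # Same threshold rule the UI applies: NScore is the normalized main score.
    score_threshold = 0.99
    print('Pass:', int((df['NScore'] >= score_threshold).sum()),
          '| Fail:', int((df['NScore'] < score_threshold).sum()))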
evalscope/app/utils/text_utils.py
CHANGED

@@ -109,8 +109,6 @@ def process_json_content(content: Any) -> str:
     Returns:
         str: The processed content formatted for markdown display.
     """
-    if isinstance(content, (np.bool_, np.int_, np.float_)):
-        content = str(content)

     if isinstance(content, str):
         content = {'content': content}
evalscope/app/utils/visualization.py
CHANGED

@@ -47,7 +47,8 @@ def plot_single_report_sunburst(report_list: List[Report]):
         color_continuous_scale='RdYlGn',  # see https://plotly.com/python/builtin-colorscales/
         color_continuous_midpoint=np.average(df[ReportKey.score], weights=df[ReportKey.num]),
         template=PLOTLY_THEME,
-        maxdepth=4
+        maxdepth=4
+    )
     plot.update_traces(insidetextorientation='radial')
     plot.update_layout(margin=dict(t=10, l=10, r=10, b=10), coloraxis=dict(cmin=0, cmax=1), height=600)
     return plot

@@ -61,7 +62,8 @@ def plot_single_dataset_scores(df: pd.DataFrame):
         y=df[ReportKey.score],
         color=df[ReportKey.subset_name],
         text=df[ReportKey.score],
-        barmode='group'
+        barmode='group'
+    )

     width = 0.2 if len(df[ReportKey.subset_name]) <= 3 else None
     plot.update_traces(width=width, texttemplate='%{text:.2f}', textposition='outside')

@@ -82,10 +84,13 @@ def plot_multi_report_radar(df: pd.DataFrame):
                 r=common_group[ReportKey.score],
                 theta=common_group[ReportKey.dataset_name],
                 name=model_name,
-                fill='toself'
+                fill='toself'
+            )
+        )

     fig.update_layout(
         template=PLOTLY_THEME,
         polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
-        margin=dict(t=20, l=20, r=20, b=20)
+        margin=dict(t=20, l=20, r=20, b=20)
+    )
     return fig
evalscope/arguments.py
CHANGED
@@ -1,7 +1,8 @@
+# flake8: noqa: E501
 import argparse
 import json

-from evalscope.constants import EvalBackend, …
+from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask, OutputType


 class ParseStrArgsAction(argparse.Action):

@@ -47,7 +48,6 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.')  # noqa: E501

     # Template-related arguments
-    parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
     parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.')  # noqa: E501

     # Dataset-related arguments

@@ -65,14 +65,13 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])  # noqa: E501
     parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.')  # noqa: E501
-    parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
-                        choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.')  # noqa: E501

     # Cache and working directory arguments
-    parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
     parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+    parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')
     parser.add_argument('--work-dir', type=str, help='The root cache dir.')

     # Debug and runtime mode arguments

@@ -83,7 +82,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
     parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
-    parser.add_argument('--stream', action='store_true', default=…
+    parser.add_argument('--stream', action='store_true', default=None, help='Stream mode.')  # noqa: E501

     # LLMJudge arguments
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
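Net effect of the argument changes above: the deprecated --template-type and --mem-cache flags and the --stage option are removed, while --repeats and --rerun-review are new. A minimal sketch of parsing the new flags through the add_argument helper shown in these hunks (values are placeholders, and it assumes the flags not shown here remain optional):

    import argparse

    from evalscope.arguments import add_argument

    parser = argparse.ArgumentParser()
    add_argument(parser)

    # Only flags that appear in this diff are used; the values are illustrative.
    args = parser.parse_args([
        '--eval-batch-size', '8',
        '--limit', '100',
        '--repeats', '4',
        '--use-cache', 'outputs/previous_run',
        '--rerun-review',
    ])
    print(args.limit, args.repeats, args.rerun_review)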
evalscope/backend/opencompass/backend_manager.py
CHANGED

@@ -182,8 +182,10 @@ class OpenCompassBackendManager(BackendManager):
        else:
            valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
            if len(invalid_dataset_names) > 0:
-                logger.error(
-…
+                logger.error(
+                    f'Invalid datasets: {invalid_dataset_names}, '
+                    f'refer to the following list to get proper dataset name: {dataset_names_all}'
+                )
            assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                f'To get the valid datasets, please refer to {dataset_names_all}'

@@ -252,7 +254,8 @@ if __name__ == '__main__':
            'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
        }],
        'limit': 5
-    }
+    }
+    )
    all_datasets = OpenCompassBackendManager.list_datasets()
    print(f'all_datasets: {all_datasets}')
    oc_backend_manager.run()
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
CHANGED

@@ -100,16 +100,16 @@ class DatasetWrapper(TorchDataset):

 def get_dataset_default_task(dataset):
     if dataset in (
-… (10 lines truncated)
+        'custom',
+        'muge',
+        'flickr30k',
+        'flickr8k',
+        'mscoco_captions',
+        'mscoco_captions2017',
+        'multilingual_mscoco_captions',
+        'flickr30k-200',
+        'crossmodal3600',
+        'xtd200',
     ):
         return 'zeroshot_retrieval'
     else:
evalscope/backend/rag_eval/clip_benchmark/task_template.py
CHANGED

@@ -4,8 +4,11 @@ import torch
 from itertools import product

 from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
-from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
-…
+from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+    build_dataset,
+    get_dataloader,
+    get_dataset_default_task,
+)
 from evalscope.backend.rag_eval.clip_benchmark.tasks import image_caption, zeroshot_classification, zeroshot_retrieval
 from evalscope.backend.rag_eval.utils.clip import VisionModel
 from evalscope.utils.logger import get_logger

@@ -66,8 +69,9 @@ def evaluate(args: Arguments):
         if verbose:
             logger.info(f'Zero-shot templates: {zeroshot_templates}')
         classnames = dataset.classes if hasattr(dataset, 'classes') else None
-        assert (
-…
+        assert (
+            zeroshot_templates is not None and classnames is not None
+        ), 'Dataset does not support classification'
         metrics = zeroshot_classification.evaluate(
             model,
             dataloader,
evalscope/backend/rag_eval/ragas/tasks/build_transform.py
CHANGED

@@ -44,8 +44,9 @@ def default_transforms(
         return bins

     def filter_doc_with_num_tokens(node, min_num_tokens=500):
-        return (
-…
+        return (
+            node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > min_num_tokens
+        )

     def filter_docs(node):
         return node.type == NodeType.DOCUMENT

@@ -90,7 +91,8 @@ def default_transforms(
            target_lang=language,
            llm=llm,
            adapt_instruction=True,
-        )
+        )
+    )

     transforms = [
         headline_extractor,

@@ -121,7 +123,8 @@ def default_transforms(
            target_lang=language,
            llm=llm,
            adapt_instruction=True,
-        )
+        )
+    )

     transforms = [
         summary_extractor,
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
CHANGED

@@ -113,7 +113,8 @@ def generate_testset(args: TestsetGenerationArguments) -> None:

     # generate testset
     generator = TestsetGenerator(
-        llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
+        llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
+    )

     testset = generator.generate(
         testset_size=args.test_size,
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
CHANGED

@@ -34,7 +34,8 @@ async def translate_prompt(

     logger.info(f'Translating prompts to {target_lang}')
     adapted_prompts = await prompt_user.adapt_prompts(
-        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+    )
     prompt_user.set_prompts(**adapted_prompts)
     try:
         prompt_user.save_prompts(prompt_dir)
evalscope/backend/rag_eval/utils/embedding.py
CHANGED

@@ -196,7 +196,8 @@ class APIEmbeddingModel(BaseModel):
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
            dimensions=self.dimensions,
-            check_embedding_ctx_length=False
+            check_embedding_ctx_length=False
+        )

        super().__init__(model_name_or_path=self.model_name, **kwargs)