evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

The captured diffs below cover the files under evalscope/benchmarks/ifeval/.

evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -1,57 +1,83 @@
-from collections import defaultdict
 from typing import Any, Dict, List
 
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-from evalscope.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ifeval',
+        pretty_name='IFEval',
+        description=
+        'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.',  # noqa: E501
+        tags=[Tags.INSTRUCTION_FOLLOWING],
+        dataset_id='opencompass/ifeval',
+        subset_list=['default'],
+        metric_list=[
+            'prompt_level_strict',
+            'inst_level_strict',
+            'prompt_level_loose',
+            'inst_level_loose',
+        ],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        prompt_template='',
+    )
 )
-class IFEvalAdapter(
+class IFEvalAdapter(DefaultDataAdapter):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-    def
-
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a data record to a Sample object.
+
+        Args:
+            record (Dict[str, Any]): Input data record.
+
+        Returns:
+            Sample: Sample object with input, target, and metadata.
+        """
+        prompt = record.get('prompt', '')
+        message_list = [ChatMessageUser(content=prompt)]
+
+        return Sample(input=message_list, target='', metadata=record)
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+    ) -> Score:
+        """
+        Calculate evaluation scores by comparing prediction with reference.
+        """
+        from evalscope.benchmarks.ifeval.utils import process_results
 
-
-
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
 
-
-
+        doc = task_state.metadata
+        try:
+            # Process results using the existing ifeval utility
+            results = process_results(doc, [filtered_prediction])
+            score.value.update(results)
 
-
-
+            # Set main score name
+            score.main_score_name = 'prompt_level_strict'
 
-
-
-
-        for res in review_res_list:
-            for k, v in res.items():
-                res_dict[k].append(v)
+        except Exception as e:
+            logger.error(f'Error calculating ifeval metrics: {e}')
+            score.value = {}
 
-
-        for metric in self.metric_list:
-            metric_name = metric.name
-            pred_value = res_dict[metric_name]
-            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
-        return metrics
+        return score
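
The hunk above is representative of the 0.10 -> 1.x migration across benchmarks: the old per-metric review loop is replaced by a declarative BenchmarkMeta registration plus two hooks, record_to_sample and match_score. Below is a minimal sketch of a custom adapter in the new style; the my_qa name, dataset id, and 'accuracy' metric are illustrative placeholders, and only fields that appear in this diff are used (whether BenchmarkMeta requires others is not shown here).

    from typing import Any, Dict

    from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
    from evalscope.api.dataset import Sample
    from evalscope.api.evaluator import TaskState
    from evalscope.api.messages import ChatMessageUser
    from evalscope.api.metric import Score
    from evalscope.api.registry import register_benchmark


    @register_benchmark(
        BenchmarkMeta(
            name='my_qa',               # illustrative benchmark name
            dataset_id='my-org/my-qa',  # illustrative dataset id
            metric_list=['accuracy'],   # assumed metric name
            eval_split='test',
        )
    )
    class MyQAAdapter(DefaultDataAdapter):

        def record_to_sample(self, record: Dict[str, Any]) -> Sample:
            # Map one raw dataset record onto the framework's Sample type.
            return Sample(
                input=[ChatMessageUser(content=record['question'])],
                target=record['answer'],
                metadata=record,
            )

        def match_score(self, original_prediction: str, filtered_prediction: str, reference, task_state: TaskState) -> Score:
            score = Score(
                extracted_prediction=filtered_prediction,
                prediction=original_prediction,
            )
            # Exact match as a stand-in; real adapters can plug in any checker.
            score.value = {'accuracy': float(filtered_prediction.strip() == str(reference).strip())}
            score.main_score_name = 'accuracy'
            return score

Everything benchmark-specific now lives in the adapter module itself; the evaluator discovers it through the registry rather than through per-benchmark evaluator code.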
evalscope/benchmarks/ifeval/instructions.py

@@ -15,14 +15,13 @@
 
 import collections
 import json
-import langdetect
 import logging
 import random
 import re
 import string
 from typing import Dict, Optional, Sequence, Union
 
-from
+from . import instructions_util
 
 _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
 
@@ -141,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
         if self._language is None:
             self._language = random.choice(list(_LANGUAGES.keys()))
         # TODO(tianjianlu): opens the description generation to more choices.
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+        )
         return self._description_pattern.format(language=_LANGUAGES[self._language])
 
     def get_instruction_args(self):
@@ -163,7 +163,7 @@ class ResponseLanguageChecker(Instruction):
         True if the language of `value` follows instruction; otherwise False.
         """
         assert isinstance(value, str)
-
+        import langdetect
         try:
             return langdetect.detect(value) == self._language
         except langdetect.LangDetectException as e:
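
This hunk and several below share one pattern: the module-level import langdetect is removed (first hunk) and re-introduced inside check_following, deferring the dependency until a language check actually runs. A standalone sketch of the same deferred-import idea; the detect_language helper is illustrative, not part of the package:

    def detect_language(text: str) -> str:
        """Return an ISO 639-1 language code, or 'unknown' if detection fails."""
        # Deferred import: langdetect is loaded on first call, so importing the
        # enclosing module stays cheap and works even when the optional
        # dependency is not installed.
        import langdetect
        try:
            return langdetect.detect(text)
        except langdetect.LangDetectException:
            return 'unknown'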
evalscope/benchmarks/ifeval/instructions.py (continued)

@@ -198,8 +198,10 @@ class NumberOfSentences(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation
 
@@ -256,8 +258,10 @@ class PlaceholderChecker(Instruction):
         self._num_placeholders = num_placeholders
         if self._num_placeholders is None or self._num_placeholders < 0:
             self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'The response must contain at least {num_placeholders} placeholders '
+            + 'represented by square brackets, such as [address].'
+        )
         return self._description_pattern.format(num_placeholders=self._num_placeholders)
 
     def get_instruction_args(self):
@@ -299,9 +303,10 @@ class BulletListChecker(Instruction):
         self._num_bullets = num_bullets
         if self._num_bullets is None or self._num_bullets < 0:
             self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your answer must contain exactly {num_bullets} bullet points. '
+            + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+        )
         return self._description_pattern.format(num_bullets=self._num_bullets)
 
     def get_instruction_args(self):
@@ -380,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
         self._starter = starter.strip() if isinstance(starter, str) else starter
         if self._starter is None:
             self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+        )
         return self._description_pattern.format(starter=self._starter)
 
     def get_instruction_args(self):
@@ -424,8 +430,10 @@ class HighlightSectionChecker(Instruction):
         if self._num_highlights is None or self._num_highlights < 0:
             self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
 
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Highlight at least {num_highlights} sections in your answer with '
+            + 'markdown, i.e. *highlighted section*.'
+        )
 
         return self._description_pattern.format(num_highlights=self._num_highlights)
 
@@ -483,9 +491,11 @@ class SectionChecker(Instruction):
         if self._num_sections is None or self._num_sections < 0:
             self._num_sections = random.randint(1, _NUM_SECTIONS)
 
-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your response must have {num_sections} sections. Mark the beginning '
+            + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+            + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+        )
 
         return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)
 
@@ -535,8 +545,9 @@ class ParagraphChecker(Instruction):
         if self._num_paragraphs is None or self._num_paragraphs < 0:
             self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
 
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+        )
 
         return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
 
@@ -586,12 +597,14 @@ class PostscriptChecker(Instruction):
         A string representing the instruction description.
         """
         self._postscript_marker = (
-            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+        )
         if self._postscript_marker is None:
             self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
 
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+        )
 
         return self._description_pattern.format(postscript=self._postscript_marker)
 
@@ -645,8 +658,10 @@ class RephraseChecker(Instruction):
                              'in the form of *change me*.')
 
         self._reference_without_change = original_message
-        self._description = (
-
+        self._description = (
+            'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+            + 'such as *change me*.'
+        )
         return self._description
 
     def get_instruction_args(self):
@@ -758,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation
 
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+        )
 
         return self._description_pattern.format(
             keyword=self._keyword,
@@ -820,8 +838,10 @@ class NumberOfWords(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation
 
@@ -851,8 +871,10 @@ class JsonFormat(Instruction):
     """Check the Json format."""
 
     def build_description(self):
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Entire output should be wrapped in JSON format. You can use markdown'
+            ' ticks such as ```.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -865,8 +887,9 @@ class JsonFormat(Instruction):
 
     def check_following(self, value):
         value = (
-            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
-
+            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+            removesuffix('```').strip()
+        )
         try:
             json.loads(value)
         except ValueError:
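
The reworked check_following also strips a bare ``` fence and the closing fence before parsing, so fenced model output now validates. The same normalization as a standalone predicate (str.removeprefix/removesuffix need Python 3.9+; the function name is illustrative):

    import json

    def is_json_response(value: str) -> bool:
        # Strip optional markdown code fences such as ```json ... ``` before parsing.
        value = (
            value.strip().removeprefix('```json').removeprefix('```Json')
            .removeprefix('```JSON').removeprefix('```').removesuffix('```').strip()
        )
        try:
            json.loads(value)
        except ValueError:
            return False
        return True

    # is_json_response('```json\n{"a": 1}\n```')  -> True
    # is_json_response('not json')                -> False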
evalscope/benchmarks/ifeval/instructions.py (continued)

@@ -904,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
         self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
         self._first_word = self._first_word.lower()
 
-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. '
+            + 'Paragraphs and only paragraphs are separated with each other by two '
+            + "new lines as if it was '\\n\\n' in python. "
+            + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+        )
 
         return self._description_pattern.format(
             num_paragraphs=self._num_paragraphs,
@@ -1085,11 +1110,12 @@ class RephraseParagraph(Instruction):
         self._low = low
         self._high = high
 
-        self._description = (
-
-
-
-
+        self._description = (
+            'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+            + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+            + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+            + "to 'ran'."
+        )
 
         return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)
 
@@ -1124,8 +1150,10 @@ class TwoResponsesChecker(Instruction):
 
     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Give two different responses. Responses and only responses should'
+            ' be separated by 6 asterisk symbols: ******.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1172,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
             raise ValueError('prompt_to_repeat must be set.')
         else:
             self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'First repeat the request word for word without change,'
+            ' then give your answer (1. do not say any words or characters'
+            ' before repeating the request; 2. the request you need to repeat'
+            ' does not include this sentence)'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1206,8 +1236,10 @@ class EndChecker(Instruction):
         self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Finish your response with this exact phrase {ender}. '
+            'No other words should follow this phrase.'
+        )
         return self._description_pattern.format(ender=self._end_phrase)
 
     def get_instruction_args(self):
@@ -1229,8 +1261,10 @@ class TitleChecker(Instruction):
 
     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your answer must contain a title, wrapped in double angular brackets,'
+            ' such as <<poem of joy>>.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1284,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
         if let_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+            )
         else:
             self._comparison_relation = let_relation
 
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the letter {letter} should appear {let_relation}'
+            ' {let_frequency} times.'
+        )
 
         return self._description_pattern.format(
             letter=self._letter,
@@ -1339,7 +1377,7 @@ class CapitalLettersEnglishChecker(Instruction):
     def check_following(self, value):
         """Checks that the response is in English and in all capital letters."""
         assert isinstance(value, str)
-
+        import langdetect
         try:
             return value.isupper() and langdetect.detect(value) == 'en'
         except langdetect.LangDetectException as e:
@@ -1353,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):
 
     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your entire response should be in English, and in all lowercase'
+            ' letters. No capital letters are allowed.'
+        )
         return self._description_pattern
 
     def get_instruction_args(self):
@@ -1367,7 +1407,7 @@ class LowercaseLettersEnglishChecker(Instruction):
     def check_following(self, value):
         """Checks that the response is in English and in all lowercase letters."""
         assert isinstance(value, str)
-
+        import langdetect
         try:
             return value.islower() and langdetect.detect(value) == 'en'
         except langdetect.LangDetectException as e:
@@ -1423,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
         if capital_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
-
-
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+            )
+
+        self._description_pattern = (
+            'In your response, words with all capital letters should appear'
+            ' {relation} {frequency} times.'
+        )
 
         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
 
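
The hunks above are all the same reflow: multi-line string concatenations regrouped with explicit parentheses, instruction texts unchanged. For orientation, every checker follows the three-method protocol visible in the context lines: build_description, get_instruction_args, check_following. A hedged usage sketch, since constructor signatures and exact return values are not shown in this diff:

    # Assumed usage of one of the checkers above; constructor arguments and
    # precise return values are not visible in this diff.
    checker = TitleChecker()
    print(checker.build_description())
    # -> 'Your answer must contain a title, wrapped in double angular
    #     brackets, such as <<poem of joy>>.'
    checker.check_following('<<ode to diffs>>\nSome answer text.')  # expected: True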
evalscope/benchmarks/ifeval/instructions_util.py

@@ -14,7 +14,6 @@
 """Utility library of instructions."""
 
 import functools
-import immutabledict
 import nltk
 import os
 import random
@@ -1551,7 +1550,7 @@ WORD_LIST = [
 ]  # pylint: disable=line-too-long
 
 # ISO 639-1 codes to language names.
-LANGUAGE_CODES = immutabledict.immutabledict({
+LANGUAGE_CODES = {
     'en': 'English',
     'es': 'Spanish',
     'pt': 'Portuguese',
@@ -1582,7 +1581,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
     'pa': 'Punjabi',
     'ml': 'Malayalam',
     'fi': 'Finnish',
-})
+}
 
 _ALPHABETS = '([A-Za-z])'
 _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
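
Dropping immutabledict removes a third-party dependency at the cost of read-only enforcement on LANGUAGE_CODES. If immutability were still wanted, the standard library's types.MappingProxyType would be a dependency-free alternative; a sketch of that option, not what the package chose (it now uses a plain dict):

    import types

    LANGUAGE_CODES = types.MappingProxyType({
        'en': 'English',
        'es': 'Spanish',
        'pt': 'Portuguese',
    })

    # LANGUAGE_CODES['fr'] = 'French'  # would raise TypeError: read-only view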
evalscope/benchmarks/ifeval/utils.py

@@ -1,7 +1,7 @@
 import dataclasses
 from typing import Dict, Optional, Union
 
-from
+from . import instructions_registry
 
 
 @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
     out_loose = test_instruction_following_loose(inp, response)
 
     return {
-        '
-        '
-        '
-        '
+        'prompt_level_strict': float(out_strict.follow_all_instructions),
+        'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
+        'prompt_level_loose': float(out_loose.follow_all_instructions),
+        'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
     }
 
 
 def agg_inst_level_acc(items):
-
-    inst_level_acc = sum(flat_items) / len(flat_items)
+    inst_level_acc = sum(items) / len(items) if items else 0
     return inst_level_acc
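
The renamed keys line up one-to-one with the metric_list declared in the ifeval adapter above, and agg_inst_level_acc now averages the list it is given, guarding the empty case instead of referencing the old flat_items variable. A quick worked check of the aggregation:

    def agg_inst_level_acc(items):
        # Mean of per-instruction booleans; 0 for an empty list rather than
        # a ZeroDivisionError.
        return sum(items) / len(items) if items else 0

    assert agg_inst_level_acc([True, True, False]) == 2 / 3
    assert agg_inst_level_acc([]) == 0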