evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/arguments.py
CHANGED
@@ -1,7 +1,8 @@
+# flake8: noqa: E501
import argparse
import json

-from evalscope.constants import EvalBackend,
+from evalscope.constants import EvalBackend, EvalType, JudgeStrategy, ModelTask


class ParseStrArgsAction(argparse.Action):
@@ -9,6 +10,15 @@ class ParseStrArgsAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        assert isinstance(values, str), 'args should be a string.'

+        # try json load first
+        try:
+            arg_dict = json.loads(values)
+            setattr(namespace, self.dest, arg_dict)
+            return
+        except (json.JSONDecodeError, ValueError):
+            pass
+
+        # If JSON load fails, fall back to parsing as key=value pairs
        arg_dict = {}
        for arg in values.strip().split(','):
            key, value = map(str.strip, arg.split('=', 1))  # Use maxsplit=1 to handle multiple '='
@@ -35,9 +45,9 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
    parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
    parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+    parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.') # noqa: E501

    # Template-related arguments
-    parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
    parser.add_argument('--chat-template', type=str, required=False, help='The custom jinja template for chat generation.') # noqa: E501

    # Dataset-related arguments
@@ -50,26 +60,39 @@ def add_argument(parser: argparse.ArgumentParser):
    parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

    # Evaluation-related arguments
-    parser.add_argument('--eval-type', type=str, help='The type for evaluating.'
-                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+    parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
    parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
    parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
-    parser.add_argument('--
-
-    parser.add_argument('--
+    parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
+    parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
+    parser.add_argument('--repeats', type=int, default=1, help='Number of times to repeat the dataset items for k-metrics.') # noqa: E501

    # Cache and working directory arguments
-    parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.') # noqa: E501
    parser.add_argument('--use-cache', type=str, help='Path to reuse the cached results.')
+    parser.add_argument('--rerun-review', action='store_true', default=False, help='Rerun the review process when use_cache.')
    parser.add_argument('--work-dir', type=str, help='The root cache dir.')

    # Debug and runtime mode arguments
+    parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
    parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
-    parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
    parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
    parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+    parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+    parser.add_argument('--stream', action='store_true', default=None, help='Stream mode.') # noqa: E501
+
+    # LLMJudge arguments
+    parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
+    parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
+    parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+    parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
+
+    # Sandbox-related arguments
+    parser.add_argument('--use-sandbox', action='store_true', default=False, help='Whether to use sandbox for model evaluation.') # noqa: E501
+    parser.add_argument('--sandbox-type', type=str, default='docker', help='The sandbox type to use.') # noqa: E501
+    parser.add_argument('--sandbox-config', type=json.loads, default='{}', help='The sandbox config, should be a json string.') # noqa: E501
+    parser.add_argument('--sandbox-manager-config', type=json.loads, default='{}', help='The sandbox manager config, should be a json string.') # noqa: E501
    # yapf: enable
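The reworked ParseStrArgsAction above accepts either a JSON object or the older comma-separated key=value form for string-valued options such as --model-args and --generation-config. A minimal sketch of that parsing order (the real action may also normalize value types after the key=value split, which is omitted here):

    import argparse
    import json

    class ParseStrArgsAction(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            assert isinstance(values, str), 'args should be a string.'
            try:
                # JSON is tried first, so nested values survive intact
                setattr(namespace, self.dest, json.loads(values))
                return
            except (json.JSONDecodeError, ValueError):
                pass
            # Fallback: comma-separated key=value pairs (values kept as strings here)
            arg_dict = {}
            for arg in values.strip().split(','):
                key, value = map(str.strip, arg.split('=', 1))
                arg_dict[key] = value
            setattr(namespace, self.dest, arg_dict)

    parser = argparse.ArgumentParser()
    parser.add_argument('--model-args', type=str, action=ParseStrArgsAction)
    print(parser.parse_args(['--model-args', '{"revision": "master", "precision": "fp16"}']).model_args)
    print(parser.parse_args(['--model-args', 'revision=master,precision=fp16']).model_args)

Both invocations yield the same dictionary, so existing key=value command lines keep working.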
evalscope/backend/opencompass/backend_manager.py
CHANGED
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
import subprocess
import tempfile
from dataclasses import asdict
@@ -7,7 +8,8 @@ from typing import Optional, Union

from evalscope.backend.base import BackendManager
from evalscope.backend.opencompass.api_meta_template import get_template
-from evalscope.utils import get_module_path,
+from evalscope.utils.import_utils import get_module_path, is_module_installed
+from evalscope.utils.io_utils import get_valid_list
from evalscope.utils.logger import get_logger

logger = get_logger()
@@ -45,7 +47,6 @@ class OpenCompassBackendManager(BackendManager):
            datasets: list, the datasets.
            models: list, the models.
            work_dir (Optional): str, the working directory. Default to None, which means the current directory.
-            dry_run (Optional): bool, the dry-run flag. Default to False.
            debug (Optional): bool, the debug flag. Default to False.
            reuse (Optional): str, reuse previous outputs & results. Default to None.
            generation_kwargs (Optional): dict, the generation config. Default to {}.
@@ -138,7 +139,6 @@ class OpenCompassBackendManager(BackendManager):
            cmd_str = f'python -m run_oc ' \
                      f'--models {" ".join(self.args.models)} ' \
                      f'--datasets {" ".join(self.args.datasets)} ' \
-                      f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
                      f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'

        elif cmd_mode == CmdMode.SCRIPT:
@@ -180,8 +180,10 @@ class OpenCompassBackendManager(BackendManager):
        else:
            valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
            if len(invalid_dataset_names) > 0:
-                logger.error(
-
+                logger.error(
+                    f'Invalid datasets: {invalid_dataset_names}, '
+                    f'refer to the following list to get proper dataset name: {dataset_names_all}'
+                )
            assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
                                                 f'To get the valid datasets, please refer to {dataset_names_all}'

@@ -204,7 +206,7 @@ class OpenCompassBackendManager(BackendManager):
                model_d['meta_template'] = get_template(model_d['meta_template'])

            # set the 'abbr' as the 'path' if 'abbr' is not specified
-            model_d['abbr'] = model_d['path']
+            model_d['abbr'] = os.path.basename(model_d['path'])

            model_config = ApiModelConfig(**model_d)
            models.append(asdict(model_config))
@@ -250,7 +252,8 @@ if __name__ == '__main__':
            'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'
        }],
        'limit': 5
-    }
+    }
+    )
    all_datasets = OpenCompassBackendManager.list_datasets()
    print(f'all_datasets: {all_datasets}')
    oc_backend_manager.run()
evalscope/backend/rag_eval/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
+from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
from evalscope.backend.rag_eval.utils.clip import VisionModel
from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
evalscope/backend/rag_eval/backend_manager.py
CHANGED
@@ -2,12 +2,19 @@ import os
from typing import Optional, Union

from evalscope.backend.base import BackendManager
-from evalscope.utils import
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
from evalscope.utils.logger import get_logger

logger = get_logger()


+class Tools:
+    MTEB = 'mteb'
+    RAGAS = 'ragas'
+    CLIP_BENCHMARK = 'clip_benchmark'
+
+
class RAGEvalBackendManager(BackendManager):

    def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +54,19 @@ class RAGEvalBackendManager(BackendManager):
        from evalscope.backend.rag_eval.ragas.tasks import generate_testset

        if testset_args is not None:
-
+            if isinstance(testset_args, dict):
+                generate_testset(TestsetGenerationArguments(**testset_args))
+            elif isinstance(testset_args, TestsetGenerationArguments):
+                generate_testset(testset_args)
+            else:
+                raise ValueError('Please provide the testset generation arguments.')
        if eval_args is not None:
-
+            if isinstance(eval_args, dict):
+                rag_eval(EvaluationArguments(**eval_args))
+            elif isinstance(eval_args, EvaluationArguments):
+                rag_eval(eval_args)
+            else:
+                raise ValueError('Please provide the evaluation arguments.')

    @staticmethod
    def run_clip_benchmark(args):
@@ -59,17 +76,17 @@ class RAGEvalBackendManager(BackendManager):

    def run(self, *args, **kwargs):
        tool = self.config_d.pop('tool')
-        if tool.lower() ==
+        if tool.lower() == Tools.MTEB:
            self._check_env('mteb')
            model_args = self.config_d['model']
            eval_args = self.config_d['eval']
            self.run_mteb(model_args, eval_args)
-        elif tool.lower() ==
+        elif tool.lower() == Tools.RAGAS:
            self._check_env('ragas')
            testset_args = self.config_d.get('testset_generation', None)
            eval_args = self.config_d.get('eval', None)
            self.run_ragas(testset_args, eval_args)
-        elif tool.lower() ==
+        elif tool.lower() == Tools.CLIP_BENCHMARK:
            self._check_env('webdataset')
            self.run_clip_benchmark(self.config_d['eval'])
        else:
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py
CHANGED
@@ -1,4 +1,5 @@
import os
+import posixpath  # For URL path handling
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset
@@ -99,16 +100,16 @@ class DatasetWrapper(TorchDataset):

def get_dataset_default_task(dataset):
    if dataset in (
-
-
-
-
-
-
-
-
-
-
+        'custom',
+        'muge',
+        'flickr30k',
+        'flickr8k',
+        'mscoco_captions',
+        'mscoco_captions2017',
+        'multilingual_mscoco_captions',
+        'flickr30k-200',
+        'crossmodal3600',
+        'xtd200',
    ):
        return 'zeroshot_retrieval'
    else:
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca

    Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
    """
+    import requests
    import webdataset as wds

    def read_txt(fname):
-        if '://'
-
-
-
+        if fname.startswith(('http://', 'https://')):
+            try:
+                response = requests.get(fname)
+                response.raise_for_status()  # Ensure the HTTP request was successful
+                return response.text
+            except requests.exceptions.RequestException as e:
+                raise FileNotFoundError(f'Failed to read {fname}: {e}')
        else:
            with open(fname, 'r') as file:
-
-
+                return file.read()
+
+    def url_path_join(*parts):
+        """Join URL path parts with forward slashes regardless of platform"""
+        return posixpath.join(*parts)

    if not data_dir:
        data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'

    # Git LFS files have a different file path to access the raw data than other files
-
+    is_url = data_dir.startswith(('http://', 'https://'))
+    if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
        *split_url_head, _, url_path = data_dir.split('/', 7)
        url_head = '/'.join(split_url_head)
        metadata_dir = '/'.join([url_head, 'resolve', url_path])
        tardata_dir = '/'.join([url_head, 'resolve', url_path])
    else:
        metadata_dir = tardata_dir = data_dir
+
+    # Use appropriate path joining function based on whether we're dealing with a URL
+    path_join = url_path_join if is_url else os.path.join
+
    # Get number of shards
-    nshards_fname =
+    nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
    nshards = int(read_txt(nshards_fname))  # Do not catch FileNotFound, nshards.txt should be mandatory

    # Get dataset type (classification or retrieval)
-    type_fname =
+    type_fname = path_join(metadata_dir, 'dataset_type.txt')
    try:
        dataset_type = read_txt(type_fname).strip().lower()
    except FileNotFoundError:
        dataset_type = 'classification'

-    filepattern =
+    filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
    # Load webdataset (support WEBP, PNG, and JPG for now)
    if not cache_dir or not isinstance(cache_dir, str):
        cache_dir = None
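The posixpath import introduced above exists so that remote shard paths always keep forward slashes, even on Windows where os.path.join would splice in backslashes; local directories still go through os.path.join. A small illustration (the dataset name below is only an example):

    import os
    import posixpath

    base_url = 'https://modelscope.cn/datasets/clip-benchmark/wds_mscoco_captions/resolve/master'

    # URL segments must always be joined with '/', independent of the local platform.
    print(posixpath.join(base_url, 'test', 'nshards.txt'))
    # os.path.join is only appropriate for local directories; on Windows it would insert '\'.
    print(os.path.join('local_data', 'wds_mscoco_captions', 'test', 'nshards.txt'))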
evalscope/backend/rag_eval/clip_benchmark/task_template.py
CHANGED
@@ -4,8 +4,11 @@ import torch
from itertools import product

from evalscope.backend.rag_eval.clip_benchmark.arguments import Arguments
-from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
-
+from evalscope.backend.rag_eval.clip_benchmark.dataset_builder import (
+    build_dataset,
+    get_dataloader,
+    get_dataset_default_task,
+)
from evalscope.backend.rag_eval.clip_benchmark.tasks import image_caption, zeroshot_classification, zeroshot_retrieval
from evalscope.backend.rag_eval.utils.clip import VisionModel
from evalscope.utils.logger import get_logger
@@ -66,8 +69,9 @@ def evaluate(args: Arguments):
        if verbose:
            logger.info(f'Zero-shot templates: {zeroshot_templates}')
        classnames = dataset.classes if hasattr(dataset, 'classes') else None
-        assert (
-
+        assert (
+            zeroshot_templates is not None and classnames is not None
+        ), 'Dataset does not support classification'
        metrics = zeroshot_classification.evaluate(
            model,
            dataloader,
evalscope/backend/rag_eval/cmteb/arguments.py
CHANGED
@@ -11,7 +11,9 @@ class ModelArguments:
    pooling_mode: Optional[str] = None
    max_seq_length: int = 512  # max sequence length
    # prompt for llm based model
-    prompt: str =
+    prompt: Optional[str] = None
+    # prompts dictionary for different tasks, if prompt is not set
+    prompts: Optional[Dict[str, str]] = None
    # model kwargs
    model_kwargs: dict = field(default_factory=dict)
    # config kwargs
@@ -20,6 +22,12 @@ class ModelArguments:
    encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
    hub: str = 'modelscope'  # modelscope or huggingface

+    # for API embedding model
+    model_name: Optional[str] = None
+    api_base: Optional[str] = None
+    api_key: Optional[str] = None
+    dimensions: Optional[int] = None
+
    def to_dict(self) -> Dict[str, Any]:
        return {
            'model_name_or_path': self.model_name_or_path,
@@ -27,10 +35,15 @@ class ModelArguments:
            'pooling_mode': self.pooling_mode,
            'max_seq_length': self.max_seq_length,
            'prompt': self.prompt,
+            'prompts': self.prompts,
            'model_kwargs': self.model_kwargs,
            'config_kwargs': self.config_kwargs,
            'encode_kwargs': self.encode_kwargs,
            'hub': self.hub,
+            'model_name': self.model_name,
+            'api_base': self.api_base,
+            'api_key': self.api_key,
+            'dimensions': self.dimensions,
        }

evalscope/backend/rag_eval/cmteb/task_template.py
CHANGED
@@ -1,6 +1,6 @@
import mteb
import os
-from
+from tabulate import tabulate

from evalscope.backend.rag_eval import EmbeddingModel, cmteb
from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
    model_name = model.mteb_model_meta.model_name_as_path()
    revision = model.mteb_model_meta.revision

-
+    data = []
+    for model_res in results:
+        main_res = model_res.only_main_score()
+        for split, score in main_res.scores.items():
+            for sub_score in score:
+                data.append({
+                    'Model': model_name.replace('eval__', ''),
+                    'Revision': revision,
+                    'Task Type': main_res.task_type,
+                    'Task': main_res.task_name,
+                    'Split': split,
+                    'Subset': sub_score['hf_subset'],
+                    'Main Score': sub_score['main_score'],
+                })

    save_path = os.path.join(
        output_folder,
        model_name,
        revision,
    )
-    logger.info(f'Evaluation results:\n{
+    logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
    logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')

@@ -34,6 +47,7 @@ def one_stage_eval(
    tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
    evaluation = mteb.MTEB(tasks=tasks)

+    eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
    # run evaluation
    results = evaluation.run(model, **eval_args)

@@ -66,6 +80,7 @@ def two_stage_eval(
        overwrite_results=True,
        hub=eval_args['hub'],
        limits=eval_args['limits'],
+        encode_kwargs=model1_args.get('encode_kwargs', {}),
    )
    # stage 2: run cross encoder
    results = evaluation.run(
@@ -77,6 +92,7 @@ def two_stage_eval(
        overwrite_results=True,
        hub=eval_args['hub'],
        limits=eval_args['limits'],
+        encode_kwargs=model2_args.get('encode_kwargs', {}),
    )

    # save and log results
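show_results now flattens each MTEB result into row dictionaries and logs them through tabulate. A quick illustration of the resulting table layout, with hypothetical task names and made-up scores:

    from tabulate import tabulate

    # Made-up rows in the same shape as the 'data' list built above.
    data = [
        {'Model': 'bge-small-zh', 'Task': 'TNews', 'Split': 'validation', 'Main Score': 0.512},
        {'Model': 'bge-small-zh', 'Task': 'OCNLI', 'Split': 'validation', 'Main Score': 0.647},
    ]
    print(tabulate(data, headers='keys', tablefmt='grid'))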
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py
CHANGED
@@ -9,7 +9,6 @@ class CustomRetrieval(AbsTaskRetrieval):
    ignore_identical_ids: bool = True

    def __init__(self, dataset_path: Optional[str] = 'custom_eval/text/retrieval', **kwargs):
-        super().__init__(**kwargs)
        self.metadata = TaskMetadata(
            name='CustomRetrieval',
            description='CustomRetrieval Task',
@@ -34,6 +33,7 @@ class CustomRetrieval(AbsTaskRetrieval):
            bibtex_citation='',
            descriptive_stats={},
        )
+        super().__init__(**kwargs)

    def load_data(self, **kwargs):
        if self.data_loaded:
evalscope/backend/rag_eval/ragas/arguments.py
CHANGED
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
    """
    generator_llm: Dict = field(default_factory=dict)
    embeddings: Dict = field(default_factory=dict)
-    distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
    # For LLM based evaluation
    # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
    # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
evalscope/backend/rag_eval/ragas/tasks/build_transform.py
CHANGED
@@ -44,8 +44,9 @@ def default_transforms(
        return bins

    def filter_doc_with_num_tokens(node, min_num_tokens=500):
-        return (
-
+        return (
+            node.type == NodeType.DOCUMENT and num_tokens_from_string(node.properties['page_content']) > min_num_tokens
+        )

    def filter_docs(node):
        return node.type == NodeType.DOCUMENT
@@ -90,7 +91,8 @@ def default_transforms(
                target_lang=language,
                llm=llm,
                adapt_instruction=True,
-            )
+            )
+        )

        transforms = [
            headline_extractor,
@@ -121,7 +123,8 @@ def default_transforms(
                target_lang=language,
                llm=llm,
                adapt_instruction=True,
-            )
+            )
+        )

        transforms = [
            summary_extractor,
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py
CHANGED
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):


def load_data(file_path):
-
+    import nltk
+    from langchain_unstructured import UnstructuredLoader

-
+    if nltk.data.find('taggers/averaged_perceptron_tagger_eng') is False:
+        # need to download nltk data for the first time
+        nltk.download('averaged_perceptron_tagger_eng')
+
+    loader = UnstructuredLoader(file_path)
    data = loader.load()
    return data

@@ -108,7 +113,8 @@ def generate_testset(args: TestsetGenerationArguments) -> None:

    # generate testset
    generator = TestsetGenerator(
-        llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
+        llm=wrapped_llm, embedding_model=wrapped_embeddings, knowledge_graph=knowledge_graph, persona_list=persona_list
+    )

    testset = generator.generate(
        testset_size=args.test_size,
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py
CHANGED
@@ -2,7 +2,6 @@ import asyncio
import os
from ragas.llms import BaseRagasLLM
from ragas.prompt import PromptMixin, PydanticPrompt
-from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
from typing import List

from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
    llm: BaseRagasLLM,
    adapt_instruction: bool = False,
):
-    if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
-        logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
-        return
-
    if not issubclass(type(prompt_user), PromptMixin):
        logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
        return
@@ -39,7 +34,8 @@ async def translate_prompt(

    logger.info(f'Translating prompts to {target_lang}')
    adapted_prompts = await prompt_user.adapt_prompts(
-        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+        language=target_lang, llm=llm, adapt_instruction=adapt_instruction
+    )
    prompt_user.set_prompts(**adapted_prompts)
    try:
        prompt_user.save_prompts(prompt_dir)