evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +11 -0
- evalscope/api/benchmark/adapters/__init__.py +7 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +404 -0
- evalscope/api/benchmark/meta.py +124 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +370 -0
- evalscope/api/dataset/loader.py +266 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +382 -0
- evalscope/api/evaluator/evaluator.py +61 -0
- evalscope/api/evaluator/state.py +280 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +248 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +60 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/llm_judge_mixin.py +170 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +161 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/__init__.py +28 -0
- evalscope/app/app.py +38 -0
- evalscope/app/arguments.py +11 -0
- evalscope/app/constants.py +22 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +53 -0
- evalscope/app/ui/multi_model.py +353 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +220 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +195 -0
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +96 -0
- evalscope/arguments.py +32 -9
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +10 -7
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +23 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
- evalscope/backend/rag_eval/utils/embedding.py +125 -32
- evalscope/backend/rag_eval/utils/llm.py +16 -16
- evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
- evalscope/benchmarks/__init__.py +17 -5
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +55 -0
- evalscope/benchmarks/aime/aime25_adapter.py +181 -0
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arc/arc_adapter.py +34 -149
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
- evalscope/benchmarks/arena_hard/utils.py +186 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
- evalscope/benchmarks/bfcl/v3/generation.py +222 -0
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
- evalscope/benchmarks/docmath/utils.py +219 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +155 -0
- evalscope/benchmarks/drop/utils.py +156 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +175 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
- evalscope/benchmarks/general_arena/utils.py +223 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
- evalscope/benchmarks/gpqa/prompt.py +88 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +153 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
- evalscope/benchmarks/ifeval/instructions.py +112 -68
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +43 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/race/race_adapter.py +33 -120
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/prompt.py +88 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
- evalscope/benchmarks/super_gpqa/utils.py +86 -0
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +12 -2
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +10 -2
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +27 -3
- evalscope/collections/sampler.py +12 -11
- evalscope/collections/schema.py +13 -12
- evalscope/config.py +218 -147
- evalscope/constants.py +78 -82
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +334 -318
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +59 -3
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +211 -0
- evalscope/metrics/math_parser.py +545 -0
- evalscope/metrics/metric.py +611 -0
- evalscope/metrics/metrics.py +112 -23
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/metrics/t2v_metrics/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +23 -13
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +69 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +144 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +708 -0
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +103 -69
- evalscope/perf/benchmark.py +114 -163
- evalscope/perf/http_client.py +59 -89
- evalscope/perf/main.py +91 -18
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +27 -7
- evalscope/perf/plugin/api/custom_api.py +170 -57
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +214 -0
- evalscope/perf/plugin/api/openai_api.py +120 -41
- evalscope/perf/plugin/datasets/__init__.py +10 -6
- evalscope/perf/plugin/datasets/base.py +43 -1
- evalscope/perf/plugin/datasets/custom.py +22 -3
- evalscope/perf/plugin/datasets/flickr8k.py +5 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +7 -3
- evalscope/perf/plugin/datasets/longalpaca.py +7 -3
- evalscope/perf/plugin/datasets/openqa.py +13 -14
- evalscope/perf/plugin/datasets/random_dataset.py +67 -0
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +95 -55
- evalscope/perf/utils/db_util.py +115 -78
- evalscope/perf/utils/local_server.py +12 -47
- evalscope/perf/utils/log_utils.py +63 -0
- evalscope/perf/utils/rich_display.py +192 -0
- evalscope/report/__init__.py +46 -3
- evalscope/report/combinator.py +143 -32
- evalscope/report/generator.py +74 -34
- evalscope/report/report.py +238 -0
- evalscope/run.py +71 -46
- evalscope/summarizer.py +5 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +441 -0
- evalscope/third_party/thinkbench/infer.py +130 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +48 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +82 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/chat_service.py +8 -6
- evalscope/utils/deprecation_utils.py +53 -0
- evalscope/utils/function_utils.py +266 -0
- evalscope/utils/import_utils.py +154 -0
- evalscope/utils/io_utils.py +336 -8
- evalscope/utils/json_schema.py +231 -0
- evalscope/utils/logger.py +121 -31
- evalscope/utils/model_utils.py +57 -1
- evalscope/utils/multi_choices.py +303 -0
- evalscope/utils/ner.py +377 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- evalscope-1.2.0.dist-info/METADATA +553 -0
- evalscope-1.2.0.dist-info/RECORD +628 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -76
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -291
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/collections/evaluator.py +0 -198
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/named_metrics.py +0 -17
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- evalscope/models/base_adapter.py +0 -52
- evalscope/models/chat_adapter.py +0 -138
- evalscope/models/choice_adapter.py +0 -211
- evalscope/models/custom/__init__.py +0 -3
- evalscope/models/custom/custom_model.py +0 -53
- evalscope/models/custom/dummy_model.py +0 -63
- evalscope/models/custom_adapter.py +0 -67
- evalscope/models/local_model.py +0 -74
- evalscope/models/model.py +0 -229
- evalscope/models/server_adapter.py +0 -111
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/report/app.py +0 -506
- evalscope/report/utils.py +0 -133
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- evalscope/utils/utils.py +0 -301
- evalscope-0.10.0.dist-info/METADATA +0 -565
- evalscope-0.10.0.dist-info/RECORD +0 -286
- tests/__init__.py +0 -1
- tests/cli/__init__.py +0 -1
- tests/cli/test_collection.py +0 -57
- tests/cli/test_run.py +0 -165
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -101
- tests/rag/test_clip_benchmark.py +0 -85
- tests/rag/test_mteb.py +0 -138
- tests/rag/test_ragas.py +0 -120
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -145
- tests/swift/test_run_swift_vlm_eval.py +0 -127
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
- tests/test_run_all.py +0 -12
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -60
- {tests/rag → evalscope/api}/__init__.py +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import threading
|
|
3
|
+
import time
|
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, wait
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from functools import wraps
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
from typing import Any, Awaitable, Callable, List, Optional, Sequence, TypeVar, Union
|
|
9
|
+
|
|
10
|
+
from evalscope.utils.logger import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger()
|
|
13
|
+
|
|
14
|
+
T = TypeVar('T')
|
|
15
|
+
R = TypeVar('R')
|
|
16
|
+
|
|
17
|
+
# Global lock to safely create per-instance locks in decorators
|
|
18
|
+
_THREAD_SAFE_GLOBAL_LOCK = threading.RLock()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def thread_safe(func: Callable[..., T]) -> Callable[..., T]:
    """Serialize calls to ``func`` behind a re-entrant lock.

    The lock is chosen on every call:
    - If the first positional argument has a ``__dict__`` it is treated as ``self`` and a lock
      stored on that object (one attribute per decorated function name) is used.
    - Otherwise a single lock shared by every call of this decorated function is used.
    """
    shared_lock = threading.RLock()
    attr = f'__lock_{func.__name__}'

    def _pick_lock(call_args):
        # No positional args, or a first arg without __dict__: use the function-scoped lock.
        if not call_args or not hasattr(call_args[0], '__dict__'):
            return shared_lock

        owner = call_args[0]
        found = getattr(owner, attr, None)
        if found is not None:
            return found

        # Create the per-object lock lazily; re-check under the module-wide lock so that
        # two threads racing here end up sharing the same lock object.
        with _THREAD_SAFE_GLOBAL_LOCK:
            found = getattr(owner, attr, None)
            if found is None:
                found = threading.RLock()
                setattr(owner, attr, found)
        return found

    @wraps(func)
    def wrapper(*args, **kwargs):
        with _pick_lock(args):
            return func(*args, **kwargs)

    return wrapper
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def run_once(func: Callable[..., T]) -> Callable[..., T]:
    """Wrap ``func`` so that it completes at most once, even when called from several threads.

    The value returned by the first completed call is stored and handed back to every later
    caller; arguments of later calls are ignored. If the call raises, nothing is recorded, so
    the next call runs ``func`` again.
    """
    guard = threading.RLock()
    state = {'done': False, 'value': None}

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Fast path: once a value is stored, skip the lock entirely.
        if not state['done']:
            with guard:
                # Re-check under the lock: another thread may have finished in the meantime.
                if not state['done']:
                    state['value'] = func(*args, **kwargs)
                    state['done'] = True
        return state['value']

    return wrapper
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def retry_func(retries=3, sleep_interval=0):
    """Build a decorator that retries a function call when it raises ``Exception``.

    Args:
        retries: Maximum number of attempts. Values below 1 are treated as 1, so the
            wrapped function is always called at least once.
        sleep_interval: Seconds to sleep between two attempts. No sleep happens after the
            final failed attempt, because nothing is retried after it.

    Returns:
        A decorator. The decorated function returns the first successful result, or
        re-raises the exception of the last attempt when every attempt fails.
    """
    # With zero attempts there would be neither a result nor an exception to raise.
    attempts = max(1, retries)

    def decorator(func):

        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:
                    if attempt == attempts:
                        # Out of attempts: surface the last failure immediately.
                        raise
                    if sleep_interval > 0:
                        time.sleep(sleep_interval)

        return wrapper

    return decorator
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@contextmanager
def retry_context(retries=3, sleep_interval=0):
    """Run the managed block once and let any exception from it propagate unchanged.

    A generator-based context manager can hand control to the ``with`` body exactly once,
    so the body cannot be re-executed from here. Yielding a second time after a failure
    makes ``contextlib`` raise ``RuntimeError("generator didn't stop after throw()")``,
    which hides the real error. This implementation therefore yields once and lets the
    caller see the original exception. Use ``retry_func`` on a function when real retry
    behaviour is needed.

    Args:
        retries: Accepted for backward compatibility; the block runs once regardless.
        sleep_interval: Accepted for backward compatibility; no sleeping is performed
            because nothing is retried.
    """
    yield
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class AsyncioLoopRunner:
    """Singleton background asyncio loop runner for sync→async bridging.

    A daemon thread runs a private event loop forever. Synchronous callers hand coroutines
    to it through :meth:`run` and block until the result is available.
    """
    # Process-wide runner, created lazily by ``instance()`` and cleared again by ``stop()``.
    _instance: Optional['AsyncioLoopRunner'] = None
    # Guards creation and removal of ``_instance``.
    _inst_lock = threading.Lock()

    def __init__(self) -> None:
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._thread: Optional[threading.Thread] = None
        self._start_loop()

    def _start_loop(self) -> None:
        """Create a fresh event loop and run it forever on a daemon thread."""
        loop = asyncio.new_event_loop()
        self._loop = loop

        def run_loop() -> None:
            asyncio.set_event_loop(loop)
            loop.run_forever()

        self._thread = threading.Thread(target=run_loop, daemon=True, name='AsyncioLoopRunner')
        self._thread.start()

    @classmethod
    def instance(cls) -> 'AsyncioLoopRunner':
        """Return the shared runner, creating it on first use."""
        if cls._instance is not None:
            return cls._instance
        with cls._inst_lock:
            # Re-check under the lock so that concurrent first callers create only one runner.
            if cls._instance is None:
                cls._instance = AsyncioLoopRunner()
            return cls._instance

    @classmethod
    def run(cls, coro: Awaitable[T], timeout: Optional[float] = None) -> T:
        """Submit a coroutine to the background loop and wait for result.

        Must be called from a thread other than the runner's own loop thread; blocking
        on the result from inside that thread would prevent the loop from making progress.

        Args:
            coro: The coroutine to execute on the background loop.
            timeout: Maximum number of seconds to wait; ``None`` waits indefinitely.

        Returns:
            Whatever the coroutine returns.

        Raises:
            Any exception raised by the coroutine, or the timeout error raised by
            ``Future.result`` when ``timeout`` elapses first.
        """
        inst = cls.instance()
        fut = asyncio.run_coroutine_threadsafe(coro, inst._loop)
        return fut.result(timeout=timeout)

    @property
    def loop(self) -> Optional[asyncio.AbstractEventLoop]:
        """Access the underlying event loop (read-only use)."""
        return self._loop

    def stop(self, join_timeout: float = 5.0) -> None:
        """Optional shutdown of the background loop (generally not needed).

        The runner is also removed from the singleton slot, so the next ``instance()`` or
        ``run()`` call starts a fresh loop instead of scheduling work onto a stopped one.
        Calling ``stop`` again after the loop has been closed is a no-op.

        Args:
            join_timeout: Seconds to wait for the loop thread to exit.
        """
        loop = self._loop
        if not loop or loop.is_closed():
            return

        # Deregister first so no new caller picks up a runner that is shutting down.
        runner_cls = type(self)
        with runner_cls._inst_lock:
            if runner_cls._instance is self:
                runner_cls._instance = None

        loop.call_soon_threadsafe(loop.stop)
        thread = self._thread
        if thread:
            thread.join(timeout=join_timeout)
            # Only a loop that is no longer running may be closed.
            if not thread.is_alive():
                loop.close()
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def run_in_threads_with_progress(
    items: Sequence[T],
    worker: Callable[[T], R],
    *,
    desc: str,
    max_workers: int,
    heartbeat_sec: int,
    on_result: Optional[Callable[[T, R], None]] = None,
    on_error: Optional[Callable[[T, Exception], None]] = None,
    filter_none_results: bool = False,
) -> List[R]:
    """
    Execute a collection of tasks concurrently with a ThreadPoolExecutor while
    displaying a tqdm progress bar and emitting periodic heartbeat logs.

    Key behaviors:
    - Concurrency: Uses up to `min(len(items), max_workers)` threads.
    - Progress: A tqdm bar advances when each task finishes (success or failure).
    - Heartbeat: If no tasks finish within `heartbeat_sec`, a status line is logged.
    - Ordering: Results are returned in input order, regardless of completion order.
    - Error handling:
        * If `on_error` is provided, it is called for each failed item; execution continues
          unless `on_error` itself raises.
        * If `on_error` is None, the first exception observed is re-raised. Tasks that have
          not started yet are cancelled; tasks already running are allowed to finish before
          the exception leaves this function.
    - Callbacks:
        * `on_result(item, result)` is called after a successful result is obtained. An
          exception raised by `on_result` is handled like a task failure (passed to
          `on_error`, or re-raised when `on_error` is None).
        * Both callbacks run in the calling thread (not worker threads).

    Args:
        items: The inputs to process. Copied into a list once, so one-shot iterables work.
        worker: A callable executed in threads to process a single item and return a result.
        desc: A short text shown as the tqdm progress bar description.
        max_workers: Upper bound on the number of concurrent threads.
        heartbeat_sec: Interval (in seconds) to wait before emitting a heartbeat log if
            no tasks complete in that window.
        on_result: Optional callback invoked as on_result(item, result) after success.
        on_error: Optional callback invoked as on_error(item, exception) on failure. If omitted,
            the exception is propagated and the function terminates early.
        filter_none_results: If True, every `None` entry is dropped from the returned list.
            This removes the placeholders of failed items, and also any result that is
            legitimately `None`.

    Returns:
        A list with one slot per input item, in input order. The slot of an item whose task
        failed (and was handled by `on_error`) holds `None`, unless `filter_none_results`
        is True, in which case all `None` entries are removed.

    Raises:
        Exception: Propagates the first task exception if `on_error` is not provided, or if
            `on_error` re-raises.

    Notes:
        - The function is blocking until all tasks complete or an exception is propagated.
        - Use `on_error` to implement "best-effort" processing where failures are logged
          and the rest continue.
    """
    # Materialize once: a one-shot iterable must not be consumed twice, and the list gives
    # a stable length and index lookup for everything below.
    work_items: List[T] = list(items)
    if not work_items:
        return []

    total = len(work_items)
    # One slot per input so results can be stored by index and come back in input order.
    results: List[Optional[R]] = [None] * total

    # Bound the pool by actual workload size for efficiency.
    with ThreadPoolExecutor(max_workers=min(total, max_workers)) as executor:
        # Submit all tasks up-front and map futures back to the index of their item.
        future_to_index = {executor.submit(worker, item): index for index, item in enumerate(work_items)}

        # Progress bar reflects total number of submitted tasks; updated per finished future.
        with tqdm(total=total, desc=desc, mininterval=1, dynamic_ncols=True) as pbar:
            pending = set(future_to_index)
            try:
                while pending:
                    # Wait with timeout to detect stalls and emit heartbeats proactively.
                    done, not_done = wait(pending, timeout=heartbeat_sec)
                    if not done:
                        # Heartbeat when nothing has completed within the window.
                        logger.info(f'{desc} still processing... pending={len(not_done)}')
                        continue

                    for future in done:
                        index = future_to_index[future]
                        try:
                            res = future.result()
                            results[index] = res
                            # Invoke success callback in caller thread (not in worker).
                            if on_result is not None:
                                on_result(work_items[index], res)
                        except Exception as exc:
                            # Delegate failure handling to on_error if provided; otherwise bubble up.
                            if on_error is not None:
                                on_error(work_items[index], exc)
                            else:
                                raise
                        finally:
                            # Always advance progress for completed futures (success or failure).
                            pbar.update(1)

                    # Continue polling remaining futures.
                    pending = not_done
            except BaseException:
                # An error is leaving this function: drop queued work so the executor shutdown
                # only waits for tasks that are already running. Cancelling a finished future
                # is a no-op.
                for queued in pending:
                    queued.cancel()
                raise

    if filter_none_results:
        results = [res for res in results if res is not None]
    return results
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
# Copyright 2023-present the HuggingFace Inc. team.
|
|
3
|
+
|
|
4
|
+
import importlib
|
|
5
|
+
import os
|
|
6
|
+
from itertools import chain
|
|
7
|
+
from types import ModuleType
|
|
8
|
+
from typing import Any, Optional, Union
|
|
9
|
+
|
|
10
|
+
from evalscope.constants import IS_BUILD_DOC
|
|
11
|
+
from .logger import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger() # pylint: disable=invalid-name
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def check_import(
    module_name: Union[str, list[str]],
    package: Optional[Union[str, list[str]]] = None,
    raise_warning: bool = True,
    raise_error: bool = False,
    feature_name: Optional[str] = 'this feature',
) -> bool:
    """Check if a module or list of modules can be imported.

    Each module is actually imported via ``importlib.import_module``; an ``ImportError``
    marks it as missing.

    Args:
        module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
        package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
            A single string applies to every module; a sequence is matched to the modules by
            position and is never modified. Defaults to None.
        raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
        raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
            The error is suppressed while ``IS_BUILD_DOC`` is set.
        feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
            Defaults to 'this feature'.

    Returns:
        bool: True if all modules can be imported, False otherwise.

    Raises:
        ImportError: If a module is missing, ``raise_error`` is True and ``IS_BUILD_DOC`` is falsy.
    """
    # Convert single strings to lists for uniform processing
    module_names = [module_name] if isinstance(module_name, str) else list(module_name)

    if package is None:
        packages = [None] * len(module_names)
    elif isinstance(package, str):
        packages = [package] * len(module_names)
    else:
        # Work on a copy: padding must not leak into the caller's list, and tuples are accepted.
        packages = list(package)
        packages.extend([None] * (len(module_names) - len(packages)))

    missing_modules = []
    missing_packages = []

    for mod_name, pkg in zip(module_names, packages):
        try:
            importlib.import_module(mod_name)
        except ImportError:
            missing_modules.append(mod_name)
            if pkg:
                missing_packages.append(pkg)

    if not missing_modules:
        return True

    if len(missing_modules) == 1:
        error_msg = f'`{missing_modules[0]}` not found.'
    else:
        listed = ', '.join(f'`{mod}`' for mod in missing_modules)
        error_msg = f'The following modules are not found: {listed}.'

    if missing_packages:
        # Remove duplicates while preserving order
        install_args = ' '.join(dict.fromkeys(missing_packages))
        error_msg += f' Please run `pip install {install_args}` to use {feature_name}.'

    if raise_warning:
        logger.warning(error_msg)

    if not IS_BUILD_DOC and raise_error:
        raise ImportError(error_msg)
    return False
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class _LazyModule(ModuleType):
|
|
88
|
+
"""
|
|
89
|
+
Module class that surfaces all objects but only performs associated imports when the objects are requested.
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
# Very heavily inspired by optuna.integration._IntegrationModule
|
|
93
|
+
# https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
|
|
94
|
+
def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
|
|
95
|
+
super().__init__(name)
|
|
96
|
+
self._modules = set(import_structure.keys())
|
|
97
|
+
self._class_to_module = {}
|
|
98
|
+
for key, values in import_structure.items():
|
|
99
|
+
for value in values:
|
|
100
|
+
self._class_to_module[value] = key
|
|
101
|
+
# Needed for autocompletion in an IDE
|
|
102
|
+
self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
|
|
103
|
+
self.__file__ = module_file
|
|
104
|
+
self.__spec__ = module_spec
|
|
105
|
+
self.__path__ = [os.path.dirname(module_file)]
|
|
106
|
+
self._objects = {} if extra_objects is None else extra_objects
|
|
107
|
+
self._name = name
|
|
108
|
+
self._import_structure = import_structure
|
|
109
|
+
|
|
110
|
+
# Needed for autocompletion in an IDE
|
|
111
|
+
def __dir__(self):
|
|
112
|
+
result = super().__dir__()
|
|
113
|
+
# The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
|
|
114
|
+
# they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
|
|
115
|
+
for attr in self.__all__:
|
|
116
|
+
if attr not in result:
|
|
117
|
+
result.append(attr)
|
|
118
|
+
return result
|
|
119
|
+
|
|
120
|
+
def __getattr__(self, name: str) -> Any:
|
|
121
|
+
if name in self._objects:
|
|
122
|
+
return self._objects[name]
|
|
123
|
+
if name in self._modules:
|
|
124
|
+
value = self._get_module(name)
|
|
125
|
+
elif name in self._class_to_module.keys():
|
|
126
|
+
module = self._get_module(self._class_to_module[name])
|
|
127
|
+
value = getattr(module, name)
|
|
128
|
+
else:
|
|
129
|
+
raise AttributeError(f'module {self.__name__} has no attribute {name}')
|
|
130
|
+
|
|
131
|
+
setattr(self, name, value)
|
|
132
|
+
return value
|
|
133
|
+
|
|
134
|
+
def _get_module(self, module_name: str):
|
|
135
|
+
return importlib.import_module('.' + module_name, self.__name__)
|
|
136
|
+
|
|
137
|
+
def __reduce__(self):
|
|
138
|
+
return self.__class__, (self._name, self.__file__, self._import_structure)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def is_module_installed(module_name):
    """Report whether ``module_name`` can be imported.

    The module is actually imported as part of the check. An ``ImportError`` yields False;
    any other exception raised during the import propagates.
    """
    try:
        importlib.import_module(module_name)
    except ImportError:
        return False
    return True
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def get_module_path(module_name):
    """Return the absolute path of the file that ``module_name`` would be loaded from.

    Args:
        module_name: Importable module name to look up.

    Returns:
        The absolute form of the module spec's ``origin``.

    Raises:
        ValueError: If no spec is found for the module, or the spec has no origin.
    """
    # ``import importlib`` alone does not guarantee that the ``util`` submodule is loaded,
    # so bring ``find_spec`` into scope explicitly.
    from importlib.util import find_spec

    spec = find_spec(module_name)
    if spec and spec.origin:
        return os.path.abspath(spec.origin)
    raise ValueError(f'Cannot find module: {module_name}')
|