PyPI - evalscope - Versions diffs - 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

evalscope 0.10.0py3-none-any.whl → 1.2.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (606) hide show

evalscope/__init__.py +4 -1
evalscope/api/benchmark/__init__.py +11 -0
evalscope/api/benchmark/adapters/__init__.py +7 -0
evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
evalscope/api/benchmark/adapters/default_data_adapter.py +754 -0
evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
evalscope/api/benchmark/adapters/multi_choice_adapter.py +86 -0
evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
evalscope/api/benchmark/adapters/text2image_adapter.py +157 -0
evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
evalscope/api/benchmark/benchmark.py +404 -0
evalscope/api/benchmark/meta.py +124 -0
evalscope/api/dataset/__init__.py +2 -0
evalscope/api/dataset/dataset.py +370 -0
evalscope/api/dataset/loader.py +266 -0
evalscope/api/dataset/utils.py +143 -0
evalscope/api/evaluator/__init__.py +3 -0
evalscope/api/evaluator/cache.py +382 -0
evalscope/api/evaluator/evaluator.py +61 -0
evalscope/api/evaluator/state.py +280 -0
evalscope/api/filter/__init__.py +1 -0
evalscope/api/filter/filter.py +72 -0
evalscope/api/messages/__init__.py +12 -0
evalscope/api/messages/chat_message.py +248 -0
evalscope/api/messages/content.py +102 -0
evalscope/api/messages/utils.py +35 -0
evalscope/api/metric/__init__.py +2 -0
evalscope/api/metric/metric.py +60 -0
evalscope/api/metric/scorer.py +113 -0
evalscope/api/mixin/__init__.py +2 -0
evalscope/api/mixin/llm_judge_mixin.py +170 -0
evalscope/api/mixin/sandbox_mixin.py +182 -0
evalscope/api/model/__init__.py +12 -0
evalscope/api/model/generate_config.py +161 -0
evalscope/api/model/model.py +386 -0
evalscope/api/model/model_output.py +285 -0
evalscope/api/registry.py +182 -0
evalscope/api/tool/__init__.py +3 -0
evalscope/api/tool/tool_call.py +101 -0
evalscope/api/tool/tool_info.py +173 -0
evalscope/api/tool/utils.py +64 -0
evalscope/app/__init__.py +28 -0
evalscope/app/app.py +38 -0
evalscope/app/arguments.py +11 -0
evalscope/app/constants.py +22 -0
evalscope/app/ui/__init__.py +20 -0
evalscope/app/ui/app_ui.py +53 -0
evalscope/app/ui/multi_model.py +353 -0
evalscope/app/ui/sidebar.py +42 -0
evalscope/app/ui/single_model.py +220 -0
evalscope/app/ui/visualization.py +36 -0
evalscope/app/utils/data_utils.py +195 -0
evalscope/app/utils/env_utils.py +12 -0
evalscope/app/utils/localization.py +221 -0
evalscope/app/utils/text_utils.py +119 -0
evalscope/app/utils/visualization.py +96 -0
evalscope/arguments.py +32 -9
evalscope/backend/opencompass/api_meta_template.py +2 -1
evalscope/backend/opencompass/backend_manager.py +10 -7
evalscope/backend/rag_eval/__init__.py +1 -1
evalscope/backend/rag_eval/backend_manager.py +23 -6
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +33 -21
evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
evalscope/backend/rag_eval/cmteb/arguments.py +14 -1
evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
evalscope/backend/rag_eval/ragas/arguments.py +0 -1
evalscope/backend/rag_eval/ragas/task_template.py +2 -1
evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +9 -3
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -6
evalscope/backend/rag_eval/utils/embedding.py +125 -32
evalscope/backend/rag_eval/utils/llm.py +16 -16
evalscope/backend/vlm_eval_kit/backend_manager.py +8 -3
evalscope/benchmarks/__init__.py +17 -5
evalscope/benchmarks/aa_lcr/__init__.py +0 -0
evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
evalscope/benchmarks/ai2d/__init__.py +0 -0
evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
evalscope/benchmarks/aime/__init__.py +0 -0
evalscope/benchmarks/aime/aime24_adapter.py +55 -0
evalscope/benchmarks/aime/aime25_adapter.py +181 -0
evalscope/benchmarks/aime/grader.py +307 -0
evalscope/{metrics/math_accuracy.py → benchmarks/aime/math_normalize.py} +61 -72
evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
evalscope/benchmarks/amc/__init__.py +0 -0
evalscope/benchmarks/amc/amc_adapter.py +51 -0
evalscope/benchmarks/arc/arc_adapter.py +34 -149
evalscope/benchmarks/arena_hard/__init__.py +0 -0
evalscope/benchmarks/arena_hard/arena_hard_adapter.py +149 -0
evalscope/benchmarks/arena_hard/utils.py +186 -0
evalscope/benchmarks/bbh/bbh_adapter.py +117 -157
evalscope/benchmarks/bfcl/__init__.py +0 -0
evalscope/benchmarks/bfcl/v3/__init__.py +0 -0
evalscope/benchmarks/bfcl/v3/bfcl_v3_adapter.py +370 -0
evalscope/benchmarks/bfcl/v3/generation.py +222 -0
evalscope/benchmarks/bfcl/v3/utils.py +23 -0
evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
evalscope/benchmarks/bfcl/v4/utils.py +410 -0
evalscope/benchmarks/biomix_qa/__init__.py +0 -0
evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
evalscope/benchmarks/blink/__init__.py +0 -0
evalscope/benchmarks/blink/blink_adapter.py +61 -0
evalscope/benchmarks/ceval/ceval_adapter.py +93 -174
evalscope/benchmarks/chartqa/__init__.py +0 -0
evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
evalscope/benchmarks/chartqa/utils.py +38 -0
evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +170 -0
evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -140
evalscope/benchmarks/coin_flip/__init__.py +0 -0
evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
evalscope/benchmarks/competition_math/competition_math_adapter.py +64 -112
evalscope/benchmarks/data_collection/__init__.py +0 -0
evalscope/benchmarks/data_collection/data_collection_adapter.py +215 -0
evalscope/benchmarks/docmath/__init__.py +0 -0
evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
evalscope/benchmarks/docmath/utils.py +219 -0
evalscope/benchmarks/docvqa/__init__.py +0 -0
evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
evalscope/benchmarks/drivelology/__init__.py +0 -0
evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
evalscope/benchmarks/drop/__init__.py +0 -0
evalscope/benchmarks/drop/drop_adapter.py +155 -0
evalscope/benchmarks/drop/utils.py +156 -0
evalscope/benchmarks/frames/__init__.py +0 -0
evalscope/benchmarks/frames/frames_adapter.py +175 -0
evalscope/benchmarks/frames/utils.py +37 -0
evalscope/benchmarks/general_arena/__init__.py +0 -0
evalscope/benchmarks/general_arena/general_arena_adapter.py +454 -0
evalscope/benchmarks/general_arena/utils.py +223 -0
evalscope/benchmarks/general_mcq/__init__.py +0 -0
evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py +75 -107
evalscope/benchmarks/gpqa/__init__.py +0 -0
evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
evalscope/benchmarks/gpqa/prompt.py +88 -0
evalscope/benchmarks/gsm8k/gsm8k_adapter.py +77 -144
evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
evalscope/benchmarks/halu_eval/__init__.py +0 -0
evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
evalscope/benchmarks/healthbench/__init__.py +0 -0
evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
evalscope/benchmarks/healthbench/utils.py +102 -0
evalscope/benchmarks/hellaswag/hellaswag_adapter.py +36 -134
evalscope/benchmarks/hle/__init__.py +0 -0
evalscope/benchmarks/hle/hle_adapter.py +153 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py +80 -88
evalscope/benchmarks/humaneval/utils.py +235 -0
evalscope/benchmarks/ifeval/ifeval_adapter.py +71 -45
evalscope/benchmarks/ifeval/instructions.py +112 -68
evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
evalscope/benchmarks/ifeval/instructions_util.py +2 -3
evalscope/benchmarks/ifeval/utils.py +6 -7
evalscope/benchmarks/image_edit/__init__.py +0 -0
evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
evalscope/benchmarks/infovqa/__init__.py +0 -0
evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -58
evalscope/benchmarks/live_code_bench/__init__.py +0 -0
evalscope/benchmarks/live_code_bench/evaluate_utils.py +195 -0
evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +150 -0
evalscope/benchmarks/live_code_bench/load_utils.py +63 -0
evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
evalscope/benchmarks/live_code_bench/prompts.py +207 -0
evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
evalscope/benchmarks/live_code_bench/testing_util.py +544 -0
evalscope/benchmarks/logi_qa/__int__.py +0 -0
evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
evalscope/benchmarks/maritime_bench/__init__.py +0 -0
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
evalscope/benchmarks/math_500/__init__.py +0 -0
evalscope/benchmarks/math_500/math_500_adapter.py +55 -0
evalscope/benchmarks/math_qa/__init__.py +0 -0
evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
evalscope/benchmarks/math_verse/__init__.py +0 -0
evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
evalscope/benchmarks/math_vision/__init__.py +0 -0
evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
evalscope/benchmarks/math_vista/__init__.py +0 -0
evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
evalscope/benchmarks/med_mcqa/__init__.py +0 -0
evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
evalscope/benchmarks/minerva_math/__init__.py +0 -0
evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
evalscope/benchmarks/mm_bench/__init__.py +0 -0
evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
evalscope/benchmarks/mm_star/__init__.py +0 -0
evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -210
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +87 -103
evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
evalscope/benchmarks/mmmu/__init__.py +0 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
evalscope/benchmarks/multi_if/__init__.py +0 -0
evalscope/benchmarks/multi_if/ifeval.py +3354 -0
evalscope/benchmarks/multi_if/metrics.py +120 -0
evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
evalscope/benchmarks/music_trivia/__init__.py +0 -0
evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
evalscope/benchmarks/musr/__init__.py +0 -0
evalscope/benchmarks/musr/musr_adapter.py +43 -0
evalscope/benchmarks/needle_haystack/__init__.py +0 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +389 -0
evalscope/benchmarks/needle_haystack/utils.py +79 -0
evalscope/benchmarks/ner/__init__.py +0 -0
evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
evalscope/benchmarks/ner/copious_adapter.py +85 -0
evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
evalscope/benchmarks/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
evalscope/benchmarks/olympiad_bench/utils.py +565 -0
evalscope/benchmarks/omni_bench/__init__.py +0 -0
evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
evalscope/benchmarks/piqa/__init__.py +0 -0
evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
evalscope/benchmarks/poly_math/__init__.py +0 -0
evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
evalscope/benchmarks/pope/__init__.py +0 -0
evalscope/benchmarks/pope/pope_adapter.py +112 -0
evalscope/benchmarks/process_bench/__init__.py +0 -0
evalscope/benchmarks/process_bench/process_bench_adapter.py +171 -0
evalscope/benchmarks/pumed_qa/__init__.py +0 -0
evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
evalscope/benchmarks/qasc/__init__.py +0 -0
evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
evalscope/benchmarks/race/race_adapter.py +33 -120
evalscope/benchmarks/real_world_qa/__init__.py +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
evalscope/benchmarks/sciq/__init__.py +0 -0
evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
evalscope/benchmarks/simple_qa/__init__.py +0 -0
evalscope/benchmarks/simple_qa/simple_qa_adapter.py +169 -0
evalscope/benchmarks/simple_vqa/__init__.py +0 -0
evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
evalscope/benchmarks/siqa/__init__.py +0 -0
evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
evalscope/benchmarks/super_gpqa/__init__.py +0 -0
evalscope/benchmarks/super_gpqa/prompt.py +88 -0
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
evalscope/benchmarks/super_gpqa/utils.py +86 -0
evalscope/benchmarks/tau_bench/__init__.py +0 -0
evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
evalscope/benchmarks/tau_bench/tau_bench/generation.py +147 -0
evalscope/benchmarks/tau_bench/tau_bench/tau_bench_adapter.py +168 -0
evalscope/benchmarks/text2image/__init__.py +0 -0
evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
evalscope/benchmarks/tool_bench/__init__.py +0 -0
evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
evalscope/benchmarks/tool_bench/utils.py +203 -0
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -118
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -270
evalscope/benchmarks/visu_logic/__init__.py +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
evalscope/benchmarks/winogrande/__init__.py +0 -0
evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
evalscope/benchmarks/wmt/__init__.py +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
evalscope/benchmarks/zerobench/__init__.py +0 -0
evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
evalscope/cli/cli.py +2 -0
evalscope/cli/start_app.py +12 -2
evalscope/cli/start_eval.py +4 -3
evalscope/cli/start_perf.py +10 -2
evalscope/cli/start_server.py +6 -3
evalscope/collections/__init__.py +27 -3
evalscope/collections/sampler.py +12 -11
evalscope/collections/schema.py +13 -12
evalscope/config.py +218 -147
evalscope/constants.py +78 -82
evalscope/evaluator/__init__.py +1 -1
evalscope/evaluator/evaluator.py +334 -318
evalscope/filters/__init__.py +2 -0
evalscope/filters/extraction.py +126 -0
evalscope/filters/selection.py +57 -0
evalscope/metrics/__init__.py +59 -3
evalscope/metrics/bert_score/__init__.py +0 -0
evalscope/metrics/bert_score/scorer.py +338 -0
evalscope/metrics/bert_score/utils.py +697 -0
evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
evalscope/metrics/llm_judge.py +211 -0
evalscope/metrics/math_parser.py +545 -0
evalscope/metrics/metric.py +611 -0
evalscope/metrics/metrics.py +112 -23
evalscope/metrics/rouge_metric.py +11 -13
evalscope/metrics/t2v_metrics/__init__.py +0 -0
evalscope/metrics/t2v_metrics/clipscore.py +14 -0
evalscope/metrics/t2v_metrics/constants.py +12 -0
evalscope/metrics/t2v_metrics/itmscore.py +14 -0
evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +134 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +282 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +115 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +87 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +86 -0
evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +85 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +99 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +176 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +82 -0
evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +74 -0
evalscope/metrics/t2v_metrics/models/model.py +45 -0
evalscope/metrics/t2v_metrics/models/utils.py +25 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +306 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +84 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +223 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +153 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +24 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +190 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +100 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +313 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +192 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +320 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +212 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1111 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +457 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +370 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +765 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +274 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +896 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1876 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +83 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +58 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +187 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +179 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +115 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +348 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +870 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +273 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +514 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1291 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +476 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +35 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +393 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +129 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +18 -0
evalscope/metrics/t2v_metrics/score.py +78 -0
evalscope/metrics/t2v_metrics/vqascore.py +14 -0
evalscope/models/__init__.py +23 -13
evalscope/models/image_edit_model.py +125 -0
evalscope/models/mockllm.py +65 -0
evalscope/models/model_apis.py +69 -0
evalscope/models/modelscope.py +455 -0
evalscope/models/openai_compatible.py +144 -0
evalscope/models/text2image_model.py +124 -0
evalscope/models/utils/openai.py +708 -0
evalscope/perf/__init__.py +0 -1
evalscope/perf/arguments.py +103 -69
evalscope/perf/benchmark.py +114 -163
evalscope/perf/http_client.py +59 -89
evalscope/perf/main.py +91 -18
evalscope/perf/plugin/__init__.py +3 -2
evalscope/perf/plugin/api/__init__.py +4 -3
evalscope/perf/plugin/api/base.py +27 -7
evalscope/perf/plugin/api/custom_api.py +170 -57
evalscope/perf/plugin/api/dashscope_api.py +4 -10
evalscope/perf/plugin/api/default_api.py +214 -0
evalscope/perf/plugin/api/openai_api.py +120 -41
evalscope/perf/plugin/datasets/__init__.py +10 -6
evalscope/perf/plugin/datasets/base.py +43 -1
evalscope/perf/plugin/datasets/custom.py +22 -3
evalscope/perf/plugin/datasets/flickr8k.py +5 -27
evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
evalscope/perf/plugin/datasets/line_by_line.py +7 -3
evalscope/perf/plugin/datasets/longalpaca.py +7 -3
evalscope/perf/plugin/datasets/openqa.py +13 -14
evalscope/perf/plugin/datasets/random_dataset.py +67 -0
evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
evalscope/perf/plugin/registry.py +36 -16
evalscope/perf/utils/analysis_result.py +24 -23
evalscope/perf/utils/benchmark_util.py +95 -55
evalscope/perf/utils/db_util.py +115 -78
evalscope/perf/utils/local_server.py +12 -47
evalscope/perf/utils/log_utils.py +63 -0
evalscope/perf/utils/rich_display.py +192 -0
evalscope/report/__init__.py +46 -3
evalscope/report/combinator.py +143 -32
evalscope/report/generator.py +74 -34
evalscope/report/report.py +238 -0
evalscope/run.py +71 -46
evalscope/summarizer.py +5 -5
evalscope/third_party/longbench_write/infer.py +1 -1
evalscope/third_party/thinkbench/__init__.py +3 -0
evalscope/third_party/thinkbench/eval.py +441 -0
evalscope/third_party/thinkbench/infer.py +130 -0
evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
evalscope/third_party/thinkbench/tools/__init__.py +0 -0
evalscope/third_party/thinkbench/tools/llm.py +48 -0
evalscope/third_party/thinkbench/tools/utils.py +13 -0
evalscope/third_party/toolbench_static/llm/swift_infer.py +46 -20
evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
evalscope/utils/__init__.py +82 -2
evalscope/utils/argument_utils.py +64 -0
evalscope/utils/chat_service.py +8 -6
evalscope/utils/deprecation_utils.py +53 -0
evalscope/utils/function_utils.py +266 -0
evalscope/utils/import_utils.py +154 -0
evalscope/utils/io_utils.py +336 -8
evalscope/utils/json_schema.py +231 -0
evalscope/utils/logger.py +121 -31
evalscope/utils/model_utils.py +57 -1
evalscope/utils/multi_choices.py +303 -0
evalscope/utils/ner.py +377 -0
evalscope/utils/url_utils.py +65 -0
evalscope/version.py +2 -2
evalscope-1.2.0.dist-info/METADATA +553 -0
evalscope-1.2.0.dist-info/RECORD +628 -0
{evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
{evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
evalscope/benchmarks/arc/ai2_arc.py +0 -151
evalscope/benchmarks/benchmark.py +0 -76
evalscope/benchmarks/ceval/ceval_exam.py +0 -146
evalscope/benchmarks/ceval/samples.jsonl +0 -1
evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
evalscope/benchmarks/competition_math/competition_math.py +0 -79
evalscope/benchmarks/data_adapter.py +0 -291
evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
evalscope/benchmarks/humaneval/humaneval.py +0 -79
evalscope/benchmarks/mmlu/mmlu.py +0 -160
evalscope/benchmarks/mmlu/samples.jsonl +0 -5
evalscope/benchmarks/race/race.py +0 -104
evalscope/benchmarks/race/samples.jsonl +0 -5
evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
evalscope/collections/evaluator.py +0 -198
evalscope/evaluator/rating_eval.py +0 -157
evalscope/evaluator/reviewer/__init__.py +0 -1
evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
evalscope/metrics/code_metric.py +0 -98
evalscope/metrics/named_metrics.py +0 -17
evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
evalscope/models/base_adapter.py +0 -52
evalscope/models/chat_adapter.py +0 -138
evalscope/models/choice_adapter.py +0 -211
evalscope/models/custom/__init__.py +0 -3
evalscope/models/custom/custom_model.py +0 -53
evalscope/models/custom/dummy_model.py +0 -63
evalscope/models/custom_adapter.py +0 -67
evalscope/models/local_model.py +0 -74
evalscope/models/model.py +0 -229
evalscope/models/server_adapter.py +0 -111
evalscope/registry/__init__.py +0 -1
evalscope/registry/config/cfg_arena.yaml +0 -77
evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
evalscope/registry/config/cfg_single.yaml +0 -78
evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
evalscope/registry/data/qa_browser/battle.jsonl +0 -634
evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
evalscope/registry/data/question.jsonl +0 -80
evalscope/registry/tasks/arc.yaml +0 -28
evalscope/registry/tasks/bbh.yaml +0 -26
evalscope/registry/tasks/bbh_mini.yaml +0 -26
evalscope/registry/tasks/ceval.yaml +0 -27
evalscope/registry/tasks/ceval_mini.yaml +0 -26
evalscope/registry/tasks/cmmlu.yaml +0 -27
evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
evalscope/registry/tasks/general_qa.yaml +0 -27
evalscope/registry/tasks/gsm8k.yaml +0 -29
evalscope/registry/tasks/mmlu.yaml +0 -29
evalscope/registry/tasks/mmlu_mini.yaml +0 -27
evalscope/report/app.py +0 -506
evalscope/report/utils.py +0 -133
evalscope/run_arena.py +0 -202
evalscope/utils/arena_utils.py +0 -217
evalscope/utils/completion_parsers.py +0 -82
evalscope/utils/utils.py +0 -301
evalscope-0.10.0.dist-info/METADATA +0 -565
evalscope-0.10.0.dist-info/RECORD +0 -286
tests/__init__.py +0 -1
tests/cli/__init__.py +0 -1
tests/cli/test_collection.py +0 -57
tests/cli/test_run.py +0 -165
tests/perf/__init__.py +0 -1
tests/perf/test_perf.py +0 -101
tests/rag/test_clip_benchmark.py +0 -85
tests/rag/test_mteb.py +0 -138
tests/rag/test_ragas.py +0 -120
tests/swift/__init__.py +0 -1
tests/swift/test_run_swift_eval.py +0 -145
tests/swift/test_run_swift_vlm_eval.py +0 -127
tests/swift/test_run_swift_vlm_jugde_eval.py +0 -156
tests/test_run_all.py +0 -12
tests/vlm/__init__.py +0 -1
tests/vlm/test_vlmeval.py +0 -60
{tests/rag → evalscope/api}/__init__.py +0 -0
{evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
{evalscope-0.10.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/utils/io_utils.py CHANGED Viewed

@@ -1,7 +1,18 @@
+import base64
+import csv
+import hashlib
+import io
 import json
 import jsonlines as jsonl
 import os
+import re
+import string
+import unicodedata
 import yaml
+from datetime import datetime
+from io import BytesIO
+from PIL import Image
+from typing import Tuple
 from evalscope.constants import DumpMode
 from evalscope.utils.logger import get_logger
@@ -27,7 +38,7 @@ class OutputsStructure:
             'configs_dir': None
         }
-    def _get_dir(self, attr_name, dir_name):
+    def _get_dir(self, attr_name, dir_name) -> str:
         if self._dirs[attr_name] is None:
             dir_path = os.path.join(self.outputs_dir, dir_name)
             if self.is_make:
@@ -66,10 +77,20 @@ def jsonl_to_list(jsonl_file):
     Returns:
         list: list of lines. Each line is a dict.
     """
-    res_list = []
-    with jsonl.open(jsonl_file, mode='r') as reader:
-        for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
-            res_list.append(line)
+    try:
+        res_list = []
+        with jsonl.open(jsonl_file, mode='r') as reader:
+            for line in reader.iter(type=dict, allow_none=True, skip_invalid=False):
+                res_list.append(line)
+    except Exception:
+        # Fallback to reading line by line
+        res_list = []
+        with open(jsonl_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                if line.strip():  # Skip empty lines
+                    res_list.append(json.loads(line.strip()))
+    if not res_list:
+        logger.warning(f'No data found in {jsonl_file}.')
     return res_list
@@ -104,6 +125,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
     if not isinstance(data_list, list):
         data_list = [data_list]
+    # Convert non-serializable types to serializable ones
+    data_list = convert_normal_types(data_list)
     if dump_mode == DumpMode.OVERWRITE:
         dump_mode = 'w'
     elif dump_mode == DumpMode.APPEND:
@@ -112,8 +136,76 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
         writer.write_all(data_list)
-def jsonl_to_csv():
-    pass
+def jsonl_to_csv(jsonl_file, csv_file):
+    """
+    Convert jsonl file to csv file.
+    Args:
+        jsonl_file: jsonl file path.
+        csv_file: csv file path.
+    """
+    data = jsonl_to_list(jsonl_file)
+    if not data:
+        logger.warning(f'No data found in {jsonl_file}.')
+        return
+    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(data[0].keys())  # Write header
+        for item in data:
+            writer.writerow(item.values())
+def csv_to_list(csv_file) -> list:
+    """
+    Read csv file to list.
+    Args:
+        csv_file: csv file path.
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with open(csv_file, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            res_list.append(row)
+    return res_list
+def tsv_to_list(tsv_file) -> list:
+    """
+    Read tsv file to list.
+    Args:
+        tsv_file: tsv file path.
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with open(tsv_file, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f, delimiter='\t')
+        for row in reader:
+            res_list.append(row)
+    return res_list
+def csv_to_jsonl(csv_file, jsonl_file):
+    """
+    Convert csv file to jsonl file.
+    Args:
+        csv_file: csv file path.
+        jsonl_file: jsonl file path.
+    """
+    data = csv_to_list(csv_file)
+    if not data:
+        logger.warning(f'No data found in {csv_file}.')
+        return
+    dump_jsonl_data(data, jsonl_file, dump_mode=DumpMode.OVERWRITE)
 def yaml_to_dict(yaml_file) -> dict:
@@ -135,7 +227,7 @@ def dict_to_yaml(d: dict, yaml_file: str):
     Dump dict to yaml file.
     """
     with open(yaml_file, 'w') as f:
-        yaml.dump(d, f, default_flow_style=False)
+        yaml.dump(d, f, default_flow_style=False, allow_unicode=True)
 def json_to_dict(json_file) -> dict:
@@ -168,3 +260,239 @@ def dict_to_json(d: dict, json_file: str):
     """
     with open(json_file, 'w') as f:
         json.dump(d, f, indent=4, ensure_ascii=False)
+def get_latest_folder_path(work_dir):
+    from datetime import datetime
+    # Get all subdirectories in the work_dir
+    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
+    # Get the timestamp（YYYYMMDD_HHMMSS）
+    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
+    # Filter out the folders
+    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
+    if not timestamped_folders:
+        print(f'>> No timestamped folders found in {work_dir}!')
+        return None
+    # timestamp parser
+    def parse_timestamp(folder_name):
+        return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
+    # Find the latest folder
+    latest_folder = max(timestamped_folders, key=parse_timestamp)
+    return os.path.join(work_dir, latest_folder)
+def gen_hash(name: str, bits: int = 32):
+    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
+def get_valid_list(input_list, candidate_list):
+    """
+    Get the valid and invalid list from input_list based on candidate_list.
+    Args:
+        input_list: The input list.
+        candidate_list: The candidate list.
+    Returns:
+        valid_list: The valid list.
+        invalid_list: The invalid list.
+    """
+    return [i for i in input_list if i in candidate_list], \
+           [i for i in input_list if i not in candidate_list]
+def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
+    """
+    Convert a PIL Image to a base64 encoded string.
+    Args:
+        image (Image.Image): The PIL Image to convert.
+        format (str): The format to save the image in. Default is 'JPEG'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+    Returns:
+        str: Base64 encoded string of the image.
+    """
+    buffered = BytesIO()
+    image.save(buffered, format=format)
+    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    if add_header:
+        img_str = f'data:image/{format.lower()};base64,{img_str}'
+    return img_str
+def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+    """Convert bytes to a base64 encoded string.
+    Args:
+        bytes_data (bytes): The bytes to convert.
+        format (str): The format of the image. Default is 'png'.
+        add_header (bool): Whether to add the base64 header. Default is False.
+        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
+    Returns:
+        str: Base64 encoded string of the bytes.
+    """
+    base64_str = base64.b64encode(bytes_data).decode('utf-8')
+    if add_header:
+        base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+    return base64_str
+def base64_to_PIL(base64_str):
+    """Convert a base64 encoded string to a PIL Image.
+    Args:
+        base64_str (str): The base64 encoded string.
+    Returns:
+        Image.Image: The decoded PIL Image.
+    """
+    # remove header
+    if ',' in base64_str:
+        base64_str = base64_str.split(',', 1)[1]
+    # decode
+    img_data = base64.b64decode(base64_str)
+    img_file = io.BytesIO(img_data)
+    img = Image.open(img_file)
+    return img
+def safe_filename(s: str, max_length: int = 255) -> str:
+    """
+    Convert a string into a safe filename by removing or replacing unsafe characters.
+    Args:
+        s (str): The input string to convert
+        max_length (int): Maximum length of the resulting filename (default 255)
+    Returns:
+        str: A safe filename string
+    Examples:
+        >>> safe_filename("Hello/World?.txt")
+        'Hello_World.txt'
+    """
+    # normalize unicode characters
+    s = unicodedata.normalize('NFKD', s)
+    s = s.encode('ASCII', 'ignore').decode('ASCII')
+    # remove or replace unsafe characters
+    # Keep only alphanumeric characters, dots, dashes, and underscores
+    safe_chars = string.ascii_letters + string.digits + '.-_'
+    s = ''.join(c if c in safe_chars else '_' for c in s)
+    # remove consecutive underscores
+    s = re.sub(r'_+', '_', s)
+    # remove leading/trailing periods and underscores
+    s = s.strip('._')
+    # handle empty string case
+    if not s:
+        s = 'untitled'
+    # handle starting with a period (hidden files)
+    if s.startswith('.'):
+        s = '_' + s
+    # enforce length limit
+    if len(s) > max_length:
+        # If we need to truncate, preserve the file extension if present
+        name, ext = os.path.splitext(s)
+        ext_len = len(ext)
+        if ext_len > 0:
+            max_name_length = max_length - ext_len
+            s = name[:max_name_length] + ext
+        else:
+            s = s[:max_length]
+    return s
+def convert_normal_types(obj):
+    """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
+    import numpy as np
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    elif isinstance(obj, np.bool_):
+        return bool(obj)
+    elif isinstance(obj, np.integer):
+        return int(obj)
+    elif isinstance(obj, np.floating):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_normal_types(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_normal_types(item) for item in obj]
+    elif isinstance(obj, tuple):
+        return tuple(convert_normal_types(item) for item in obj)
+    elif isinstance(obj, os.PathLike):
+        return str(obj)
+    else:
+        return obj
+def compress_image_to_limit(image_bytes: bytes, max_bytes: int = 10_000_000) -> Tuple[bytes, str]:
+    """
+    Ensure image bytes are under max_bytes by re-encoding to JPEG with quality reduction
+    and optional downscaling. Returns (processed_bytes, format_str).
+    If the original bytes are already below the limit, returns them as PNG.
+    """
+    if len(image_bytes) <= max_bytes:
+        return image_bytes, 'png'
+    try:
+        img = Image.open(BytesIO(image_bytes))
+    except Exception as exc:
+        logger.warning(f'Failed to open image bytes with PIL, sending original image; may exceed API limit: {exc}')
+        return image_bytes, 'png'
+    # Convert to RGB for JPEG if needed
+    if img.mode not in ('RGB', 'L'):
+        img = img.convert('RGB')
+    def encode_jpeg(source: Image.Image, quality: int) -> bytes:
+        buf = BytesIO()
+        source.save(buf, format='JPEG', quality=quality, optimize=True, progressive=True)
+        return buf.getvalue()
+    # Start with moderate quality and reduce
+    quality: int = 85
+    out: bytes = encode_jpeg(img, quality)
+    quality_floor: int = 40
+    while len(out) > max_bytes and quality > quality_floor:
+        quality -= 10
+        out = encode_jpeg(img, quality)
+    # If still too large, progressively downscale
+    min_side_floor: int = 256
+    scale: float = 0.9
+    while len(out) > max_bytes and min(img.size) > min_side_floor:
+        new_w = max(min_side_floor, int(img.width * scale))
+        new_h = max(min_side_floor, int(img.height * scale))
+        if (new_w, new_h) == img.size:
+            break
+        img = img.resize((new_w, new_h), Image.LANCZOS)
+        out = encode_jpeg(img, quality)
+    if len(out) > max_bytes:
+        logger.warning(f'Image remains above limit after compression: size={len(out)} bytes (limit={max_bytes}).')
+    else:
+        logger.info(
+            f'Compressed image from {len(image_bytes)} to {len(out)} bytes; '
+            f'quality={quality}, size={img.width}x{img.height}.'
+        )
+    return out, 'jpeg'

evalscope/utils/json_schema.py ADDED Viewed

@@ -0,0 +1,231 @@
+import types
+import typing
+from copy import deepcopy
+from dataclasses import is_dataclass
+from datetime import date, datetime, time
+from enum import EnumMeta
+from pydantic import BaseModel, Field, field_validator, model_validator
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    get_args,
+    get_origin,
+    get_type_hints,
+    is_typeddict,
+)
+JSONType = Literal['string', 'integer', 'number', 'boolean', 'array', 'object', 'null']
+"""Valid types within JSON schema."""
+class JSONSchema(BaseModel):
+    """JSON Schema for type."""
+    type: Optional[JSONType] = Field(default=None)
+    """JSON type of tool parameter."""
+    format: Optional[str] = Field(default=None)
+    """Format of the parameter (e.g. date-time)."""
+    description: Optional[str] = Field(default=None)
+    """Parameter description."""
+    default: Any = Field(default=None)
+    """Default value for parameter."""
+    enum: Optional[List[Any]] = Field(default=None)
+    """Valid values for enum parameters."""
+    items: Optional['JSONSchema'] = Field(default=None)
+    """Valid type for array parameters."""
+    properties: Optional[Dict[str, 'JSONSchema']] = Field(default=None)
+    """Valid fields for object parametrs."""
+    additionalProperties: Optional[Union['JSONSchema', bool]] = Field(default=None)
+    """Are additional properties allowed?"""
+    anyOf: Optional[List['JSONSchema']] = Field(default=None)
+    """Valid types for union parameters."""
+    required: Optional[List[str]] = Field(default=None)
+    """Required fields for object parameters."""
+    @model_validator(mode='before')
+    def convert_type_before_validation(cls, values):
+        values = deepcopy(values)
+        def recursive_convert_type(obj):
+            if isinstance(obj, dict):
+                # Convert 'type' field if it's a string
+                if 'type' in obj and isinstance(obj['type'], str):
+                    try:
+                        obj['type'] = python_type_to_json_type(obj['type'])
+                    except ValueError:
+                        # If conversion fails, leave it as is
+                        pass
+                # Recursively process nested structures
+                for k, v in obj.items():
+                    obj[k] = recursive_convert_type(v)
+            elif isinstance(obj, list):
+                return [recursive_convert_type(item) for item in obj]
+            return obj
+        return recursive_convert_type(values)
+def json_schema(t: Type[Any]) -> JSONSchema:
+    """Provide a JSON Schema for the specified type.
+    Schemas can be automatically inferred for a wide variety of
+    Python class types including Pydantic BaseModel, dataclasses,
+    and typed dicts.
+    Args:
+        t: Python type
+    Returns:
+        JSON Schema for type.
+    """
+    origin = get_origin(t)
+    args = get_args(t)
+    if origin is None:
+        if t is int:
+            return JSONSchema(type='integer')
+        elif t is float:
+            return JSONSchema(type='number')
+        elif t is str:
+            return JSONSchema(type='string')
+        elif t is bool:
+            return JSONSchema(type='boolean')
+        elif t is datetime:
+            return JSONSchema(type='string', format='date-time')
+        elif t is date:
+            return JSONSchema(type='string', format='date')
+        elif t is time:
+            return JSONSchema(type='string', format='time')
+        elif t is list or t is set:
+            return JSONSchema(type='array', items=JSONSchema())
+        elif t is dict:
+            return JSONSchema(type='object', additionalProperties=JSONSchema())
+        elif (is_dataclass(t) or is_typeddict(t) or (isinstance(t, type) and issubclass(t, BaseModel))):
+            return cls_json_schema(t)
+        elif isinstance(t, EnumMeta):
+            return JSONSchema(enum=[item.value for item in t])
+        elif t is type(None):
+            return JSONSchema(type='null')
+        else:
+            return JSONSchema()
+    elif (origin is list or origin is List or origin is tuple or origin is Tuple or origin is set or origin is Set):
+        return JSONSchema(type='array', items=json_schema(args[0]) if args else JSONSchema())
+    elif origin is dict or origin is Dict:
+        return JSONSchema(
+            type='object',
+            additionalProperties=json_schema(args[1]) if len(args) > 1 else JSONSchema(),
+        )
+    elif origin is Union or origin is types.UnionType:
+        return JSONSchema(anyOf=[json_schema(arg) for arg in args])
+    elif origin is Optional:
+        return JSONSchema(anyOf=[json_schema(arg) for arg in args] + [JSONSchema(type='null')])
+    elif origin is typing.Literal:
+        return JSONSchema(enum=list(args))
+    return JSONSchema()  # Default case if we can't determine the type
+def cls_json_schema(cls: Type[Any]) -> JSONSchema:
+    properties: Dict[str, JSONSchema] = {}
+    required: List[str] = []
+    if is_dataclass(cls):
+        fields = cls.__dataclass_fields__  # type: ignore
+        for name, field in fields.items():
+            properties[name] = json_schema(field.type)  # type: ignore
+            if field.default == field.default_factory:
+                required.append(name)
+    elif isinstance(cls, type) and issubclass(cls, BaseModel):
+        schema = cls.model_json_schema()
+        schema = resolve_schema_references(schema)
+        for name, prop in schema.get('properties', {}).items():
+            properties[name] = JSONSchema(**prop)
+        required = schema.get('required', [])
+    elif is_typeddict(cls):
+        annotations = get_type_hints(cls)
+        for name, type_hint in annotations.items():
+            properties[name] = json_schema(type_hint)
+            if name in cls.__required_keys__:
+                required.append(name)
+    return JSONSchema(
+        type='object',
+        properties=properties,
+        required=required if required else None,
+        additionalProperties=False,
+    )
+def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type is not None and python_type in get_args(JSONType):
+        return python_type
+    if python_type == 'str':
+        return 'string'
+    elif python_type == 'int':
+        return 'integer'
+    elif python_type == 'float':
+        return 'number'
+    elif python_type == 'bool':
+        return 'boolean'
+    elif python_type == 'list':
+        return 'array'
+    elif python_type == 'dict':
+        return 'object'
+    elif python_type == 'None':
+        return 'null'
+    elif python_type is None:
+        # treat 'unknown' as string as anything can be converted to string
+        return 'string'
+    else:
+        raise ValueError(f'Unsupported type: {python_type} for Python to JSON conversion.')
+def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
+    """Resolves all $ref references in a JSON schema by inlining the definitions."""
+    schema = deepcopy(schema)
+    definitions = schema.pop('$defs', {})
+    def _resolve_refs(obj: Any) -> Any:
+        if isinstance(obj, dict):
+            if '$ref' in obj and obj['$ref'].startswith('#/$defs/'):
+                ref_key = obj['$ref'].split('/')[-1]
+                if ref_key in definitions:
+                    # Replace with a deep copy of the definition
+                    resolved = deepcopy(definitions[ref_key])
+                    # Process any nested references in the definition
+                    resolved = _resolve_refs(resolved)
+                    # Merge in the current object fields, which should take priority
+                    # This means that if you have e.g.
+                    # {"$ref": "#/$defs/SubType", "description": "subtype of type SubType"},
+                    # and SubType resolves to
+                    # {"description": "The SubType Class", "parameters": {"param1": {"type": "string"}}},
+                    # the final result will be:
+                    # {"description": "subtype of type SubType", "parameters": {"param1": {"type": "string"}}}
+                    return resolved | {k: o for k, o in obj.items() if k != '$ref'}
+            # Process all entries in the dictionary
+            return {k: _resolve_refs(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [_resolve_refs(item) for item in obj]
+        else:
+            return obj
+    return cast(Dict[str, Any], _resolve_refs(schema))

evalscope 0.10.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

evalscope 0.10.0py3-none-any.whl → 1.2.0py3-none-any.whl