evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/image_edit/gedit/utils.py (new file)
@@ -0,0 +1,372 @@
+import ast
+import json
+import os
+import random
+import regex as re
+from typing import Union
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def fix_json(input_str):
+    # Add double quotes around keys using regex
+    fixed_str = re.sub(r'(\w+):', r'"\1":', input_str)
+
+    # Add double quotes around string values if necessary and wrap int/float values in []
+    def format_value(match):
+        key, value, comma = match.groups()
+        value = value.strip()
+        # Check if value is an integer or float
+        if re.match(r'^-?\d+(\.\d+)?$', value):
+            value = f'[{value}]'
+        # Check if value is a boolean or null
+        elif re.match(r'^(true|false|null)$', value, re.IGNORECASE):
+            pass # leave as is
+        else:
+            # Add quotes around string values
+            value = f'"{value}"'
+        return f'{key}: {value}{comma}'
+
+    fixed_str = re.sub(r'(".*?"):(.*?)(,|})', format_value, fixed_str)
+
+    return fixed_str
+
+
+def read_file_to_string(file_path):
+    """
+    Reads the contents of a text file and returns it as a string.
+
+    :param file_path: The path to the text file.
+    :return: A string containing the contents of the file.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as file:
+            return file.read()
+    except FileNotFoundError:
+        logger.info(f'The file {file_path} was not found.')
+        return None
+    except Exception as e:
+        logger.info(f'An error occurred: {e}')
+        return None
+
+
+def read_files_to_string(file_paths):
+    """
+    Reads the contents of multiple text files and returns them as a single string,
+    with each file's contents separated by a newline.
+
+    :param file_paths: A list of paths to text files.
+    :return: A string containing the concatenated contents of the files.
+    """
+    all_contents = [] # List to hold the contents of each file
+
+    for file_path in file_paths:
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                all_contents.append(file.read())
+        except FileNotFoundError:
+            logger.info(f'The file {file_path} was not found.')
+        except Exception as e:
+            logger.info(f'An error occurred while reading {file_path}: {e}')
+
+    # Join all the contents with a newline character
+    return '\n'.join(all_contents)
+
+
+def get_file_path(filename: Union[str, os.PathLike], search_from: Union[str, os.PathLike] = '.'):
+    """
+    Search for a file across a directory and return its absolute path.
+
+    Args:
+        filename (Union[str, os.PathLike]): The name of the file to search for.
+        search_from (Union[str, os.PathLike], optional): The directory from which to start the search. Defaults to ".".
+
+    Returns:
+        str: Absolute path to the found file.
+
+    Raises:
+        FileNotFoundError: If the file is not found.
+    """
+    for root, dirs, files in os.walk(search_from):
+        for name in files:
+            if name == filename:
+                return os.path.abspath(os.path.join(root, name))
+    raise FileNotFoundError(filename, 'not found.')
+
+
+# +=========================================================================================
+def verify(s, target_sequence):
+    # Count the occurrences of the target sequence
+    count = s.count(target_sequence)
+
+    # Check if the target sequence appears exactly twice
+    return count == 2
+
+
+def is_int_between_0_and_10(s):
+    try:
+        num = int(s)
+        return 0 <= num <= 10
+    except ValueError:
+        return False
+
+
+def is_str_a_list_of_ints_0_to_10(s):
+    try:
+        # Attempt to parse the string as a Python literal (list, dict, etc.)
+        parsed = ast.literal_eval(s)
+
+        # Check if the parsed object is a list
+        if not isinstance(parsed, list):
+            return False
+
+        # Check if all elements are integers and between 0 to 10
+        return all(isinstance(item, int) and 0 <= item <= 10 for item in parsed)
+
+    except (ValueError, SyntaxError):
+        # If parsing fails or any other error occurs
+        return False
+
+
+def is_str_valid_score_format_brackets(s):
+    try:
+        # Removing brackets and splitting the string by commas
+        content = s.strip('[]').split(',')
+
+        length = len(content)
+
+        # Parsing each element and checking the format and range
+        scores = {}
+        for item in content:
+            key, value = item.split(':')
+            key = key.strip()
+            value = int(value.strip())
+
+            # Check if the key starts with 'score' and the value is in the correct range
+            if not key.startswith('score') or not 0 <= value <= 10:
+                return False
+
+            scores[key] = value
+
+        fetch_words = [f'score{i+1}' for i in range(length)]
+        # Check if at least 'score1' and 'score2' are present
+        return all(key in scores for key in fetch_words)
+
+    except (ValueError, SyntaxError):
+        # If any parsing error occurs
+        return False
+
+
+# +=========================================================================================
+def mllm_output_to_dict(input_string, give_up_parsing=False):
+    """
+    Args:
+        input_string (str): actually the output of the mllm model to be parsed
+        output_file_name (str): The name of the output file.
+    """
+    # Catch for gpt4v rate_limit_exceeded error
+    if input_string == 'rate_limit_exceeded':
+        return 'rate_limit_exceeded'
+
+    # Define the delimiters
+    delimiter = '||V^=^V||'
+
+    if input_string.count(delimiter) == 2:
+        if not verify(input_string, delimiter):
+            logger.info('The required delimiters were not found correctly in the string.')
+            return False
+        # Extract the content between the delimiters
+        start_index = input_string.find(delimiter) + len(delimiter)
+        end_index = input_string.rfind(delimiter)
+    else:
+        # find the json mannually
+        # some mllm tends not to output the delimiters, but it does output the json contents
+        # so we will find the json content mannually
+        start_index = input_string.find('{')
+        end_index = input_string.rfind('}') + 1
+        if start_index == -1 or end_index == 0:
+            # json not found
+            # some mllm tends to output only a list of scores like [6, 0],
+            # this time we will just get the scores and ignore the reasoning (other part of the json)
+            start_index = input_string.find('[')
+            end_index = input_string.rfind(']') + 1
+            if give_up_parsing: # if we want to give up parsing
+                guessed_value = random.randint(0, 10)
+                logger.info(f'Failed to find the json content in the string. Guess a value : {guessed_value}.')
+                json_content = {'score': [guessed_value], 'reasoning': f'guess_if_cannot_parse | {input_string}'}
+                json_str = json.dumps(json_content)
+                input_string = json_str
+                start_index = 0
+                end_index = len(json_str)
+            elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
+                scores = json.loads(input_string[start_index:end_index])
+                if not isinstance(scores, list):
+                    scores = [scores]
+                json_content = {'score': scores, 'reasoning': 'System: output is simply a list of scores'}
+                json_str = json.dumps(json_content)
+                input_string = json_str
+                start_index = 0
+                end_index = len(json_str)
+            elif is_int_between_0_and_10(input_string): # if output is simply a number
+                scores = [int(input_string)]
+                json_content = {'score': scores, 'reasoning': 'System: output is simply a number'}
+                json_str = json.dumps(json_content)
+                input_string = json_str
+                start_index = 0
+                end_index = len(json_str)
+            else:
+                logger.info('Failed to find the json content in the string.')
+                return False
+
+    # Check if we found two delimiters
+    if start_index != -1 and end_index != -1 and start_index != end_index:
+        # Extract the JSON string
+        json_str = input_string[start_index:end_index].strip()
+        json_str = json_str.replace('\n', '')
+        # Parse the JSON string into a dictionary
+        try:
+            new_data = json.loads(json_str)
+            if not isinstance(new_data['score'], list):
+                new_data['score'] = [new_data['score']]
+        except Exception:
+            logger.info('Now fixing: ', json_str)
+            try:
+                new_data = json.loads(fix_json(json_str))
+                return new_data
+            except Exception:
+                logger.info('Error: Cannot fix', json_str)
+                return False
+        return new_data
+    else:
+        logger.info('The required delimiters were not found correctly in the string.')
+        return False
+
+
+def write_entry_to_json_file(input_string, uid, prompt_input, vision_input, output_file_name, give_up_parsing=False):
+    """
+    Args:
+        input_string (str): actually the output of the mllm model to be parsed
+        uid (str): The unique identifier for the each item in the test data
+        prompt_input (str): The prompt input for the entry. text prompt.
+        vision_input (str): The vision input for the entry. image links.
+        output_file_name (str): The name of the output file.
+    """
+    # Catch for gpt4v rate_limit_exceeded error
+    if input_string == 'rate_limit_exceeded':
+        return 'rate_limit_exceeded'
+
+    # Define the delimiters
+    delimiter = '||V^=^V||'
+
+    if input_string.count(delimiter) == 2:
+        if not verify(input_string, delimiter):
+            logger.info('The required delimiters were not found correctly in the string.')
+            return False
+        # Extract the content between the delimiters
+        start_index = input_string.find(delimiter) + len(delimiter)
+        end_index = input_string.rfind(delimiter)
+    else:
+        # find the json mannually
+        # some mllm tends not to output the delimiters, but it does output the json contents
+        # so we will find the json content mannually
+        start_index = input_string.find('{')
+        end_index = input_string.rfind('}') + 1
+        if start_index == -1 or end_index == 0:
+            # json not found
+            # some mllm tends to output only a list of scores like [6, 0],
+            # this time we will just get the scores and ignore the reasoning (other part of the json)
+            start_index = input_string.find('[')
+            end_index = input_string.rfind(']') + 1
+            if give_up_parsing: # if we want to give up parsing
+                guessed_value = random.randint(0, 10)
+                logger.info(f'Failed to find the json content in the string. Guess a value : {guessed_value}.')
+                json_content = {'score': [guessed_value], 'reasoning': f'guess_if_cannot_parse | {input_string}'}
+                json_str = json.dumps(json_content)
+                input_string = json_str
+                start_index = 0
+                end_index = len(json_str)
+            elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
+                scores = json.loads(input_string[start_index:end_index])
+                json_content = {'score': scores, 'reasoning': None}
+                json_str = json.dumps(json_content)
+                input_string = json_str
+                start_index = 0
+                end_index = len(json_str)
+            elif is_int_between_0_and_10(input_string): # if output is simply a number
+                scores = [int(input_string)]
+                json_content = {'score': scores, 'reasoning': None}
+                json_str = json.dumps(json_content)
+                input_string = json_str
+                start_index = 0
+                end_index = len(json_str)
+            else:
+                logger.info('Failed to find the json content in the string.')
+                return False
+
+    # Check if we found two delimiters
+    if start_index != -1 and end_index != -1 and start_index != end_index:
+        # Extract the JSON string
+        json_str = input_string[start_index:end_index].strip()
+        json_str = json_str.replace('\n', '')
+        try:
+            # Parse the JSON string into a dictionary
+            new_data = json.loads(json_str)
+
+            # Ensure the directory exists
+            os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
+
+            # Initialize or load existing data
+            if os.path.exists(output_file_name):
+                with open(output_file_name, 'r') as json_file:
+                    data = json.load(json_file)
+            else:
+                data = {}
+
+            # If the additional key is already in the data, add or update notes
+            if uid in data:
+                data[uid].update(new_data) # Update with new data
+                if prompt_input: # If there are new notes, update or add them
+                    data[uid]['prompt_input'] = prompt_input
+                if vision_input: # If there are new notes, update or add them
+                    data[uid]['vision_input'] = vision_input
+            else:
+                # If it's a new key, add the entry to the dictionary
+                data[uid] = new_data
+                if prompt_input:
+                    data[uid]['prompt_input'] = prompt_input
+                if vision_input:
+                    data[uid]['vision_input'] = vision_input
+
+            # Write the updated data to the file
+            with open(output_file_name, 'w') as json_file:
+                json.dump(data, json_file, indent=4)
+
+            logger.info(f'Data was successfully updated in {output_file_name}')
+            return True
+        except json.JSONDecodeError as e:
+            logger.info(f'An error occurred while parsing the JSON content: {e}')
+            return False
+    else:
+        logger.info('The required delimiters were not found correctly in the string.')
+        return False
+
+
+def check_key_in_json(file_path, key):
+    try:
+        with open(file_path, 'r') as json_file:
+            data = json.load(json_file)
+
+        # Check if the key exists at the top level of the JSON structure
+        if key in data:
+            return True
+        else:
+            return False
+    except FileNotFoundError:
+        logger.info(f'The file {file_path} was not found.')
+    except json.JSONDecodeError as e:
+        logger.info(f'Error reading {file_path}: {e}')
+    except Exception as e:
+        logger.info(f'An error occurred with {file_path}: {e}')
+    return False
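For orientation, here is a minimal usage sketch of the helpers introduced in the hunk above. It is not part of the released diff: the sample judge replies are invented, and the import path is inferred from the file listing (evalscope/benchmarks/image_edit/gedit/utils.py). mllm_output_to_dict extracts a score/reasoning dict from an MLLM judge reply, whether or not the reply is wrapped in the '||V^=^V||' delimiters, and fix_json repairs replies whose keys or string values are missing quotes.

    # Usage sketch only; sample strings are hypothetical, import path inferred from the listing above.
    from evalscope.benchmarks.image_edit.gedit.utils import fix_json, mllm_output_to_dict

    # A reply wrapped in the expected '||V^=^V||' delimiters parses directly.
    wrapped = '||V^=^V|| {"score": [7, 5], "reasoning": "edit follows the instruction"} ||V^=^V||'
    print(mllm_output_to_dict(wrapped))  # {'score': [7, 5], 'reasoning': 'edit follows the instruction'}

    # Bare JSON without delimiters is located by brace search instead.
    bare = 'My rating: {"score": [6], "reasoning": "minor artifacts"}'
    print(mllm_output_to_dict(bare))  # {'score': [6], 'reasoning': 'minor artifacts'}

    # A reply with unquoted keys/values can be repaired before retrying json.loads.
    print(fix_json('{score: 7, reasoning: acceptable}'))  # {"score": [7], "reasoning": "acceptable"}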