evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
@@ -0,0 +1,372 @@
1
+ import ast
2
+ import json
3
+ import os
4
+ import random
5
+ import regex as re
6
+ from typing import Union
7
+
8
+ from evalscope.utils.logger import get_logger
9
+
10
+ logger = get_logger()
11
+
12
+
13
def fix_json(input_str):
    """Best-effort repair of a JSON-like string emitted by an LLM judge.

    Quotes bare keys, quotes bare string values, and wraps numeric values
    in a single-element list so that ``json.loads`` can parse the result.

    :param input_str: Possibly malformed JSON-like text.
    :return: The repaired JSON string.
    """
    # Quote every bare key (a word immediately followed by a colon).
    quoted_keys = re.sub(r'(\w+):', r'"\1":', input_str)

    def _normalize_value(match):
        key, raw_value, terminator = match.groups()
        raw_value = raw_value.strip()
        if re.match(r'^-?\d+(\.\d+)?$', raw_value):
            # Numeric scores are wrapped in a list for downstream consumers.
            raw_value = f'[{raw_value}]'
        elif not re.match(r'^(true|false|null)$', raw_value, re.IGNORECASE):
            # Anything that is not a number/boolean/null is treated as a string.
            raw_value = f'"{raw_value}"'
        return f'{key}: {raw_value}{terminator}'

    # Normalize every value that follows a quoted key, up to ',' or '}'.
    return re.sub(r'(".*?"):(.*?)(,|})', _normalize_value, quoted_keys)
35
+
36
+
37
def read_file_to_string(file_path):
    """Return the full contents of a UTF-8 text file, or ``None`` on failure.

    :param file_path: The path to the text file.
    :return: The file contents as a string, or ``None`` if the file could
        not be read (the problem is logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = handle.read()
    except FileNotFoundError:
        logger.info(f'The file {file_path} was not found.')
        return None
    except Exception as e:
        logger.info(f'An error occurred: {e}')
        return None
    return content
53
+
54
+
55
def read_files_to_string(file_paths):
    """Concatenate the contents of several UTF-8 text files.

    Files that cannot be read are skipped (the error is logged) and do not
    contribute to the result.

    :param file_paths: A list of paths to text files.
    :return: The readable files' contents joined by newline characters.
    """
    collected = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                collected.append(handle.read())
        except FileNotFoundError:
            logger.info(f'The file {file_path} was not found.')
        except Exception as e:
            logger.info(f'An error occurred while reading {file_path}: {e}')
    return '\n'.join(collected)
76
+
77
+
78
def get_file_path(filename: Union[str, os.PathLike], search_from: Union[str, os.PathLike] = '.'):
    """
    Search for a file across a directory tree and return its absolute path.

    The first match in ``os.walk`` order wins.

    Args:
        filename (Union[str, os.PathLike]): The name of the file to search for.
        search_from (Union[str, os.PathLike], optional): The directory from which to start the search. Defaults to ".".

    Returns:
        str: Absolute path to the found file.

    Raises:
        FileNotFoundError: If the file is not found.
    """
    for current_dir, _subdirs, filenames in os.walk(search_from):
        if filename in filenames:
            return os.path.abspath(os.path.join(current_dir, filename))
    raise FileNotFoundError(filename, 'not found.')
97
+
98
+
99
+ # +=========================================================================================
100
def verify(s, target_sequence):
    """Return True iff *target_sequence* occurs exactly twice in *s*."""
    return s.count(target_sequence) == 2
106
+
107
+
108
def is_int_between_0_and_10(s):
    """Return True iff *s* parses as an integer in the inclusive range [0, 10]."""
    try:
        value = int(s)
    except ValueError:
        return False
    return 0 <= value <= 10
114
+
115
+
116
def is_str_a_list_of_ints_0_to_10(s):
    """Return True iff *s* is a Python-literal list whose elements are all ints in [0, 10]."""
    try:
        parsed_value = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        # Not a parseable Python literal at all.
        return False
    if not isinstance(parsed_value, list):
        return False
    # Every element must be an integer within the score range.
    return all(isinstance(element, int) and 0 <= element <= 10 for element in parsed_value)
131
+
132
+
133
def is_str_valid_score_format_brackets(s):
    """Return True iff *s* looks like ``[score1: <0-10>, score2: <0-10>, ...]``.

    Every entry must be ``scoreN: value`` with ``value`` in [0, 10], and for
    K entries the keys must cover ``score1`` .. ``scoreK``.
    """
    try:
        entries = s.strip('[]').split(',')

        parsed = {}
        for entry in entries:
            # Each entry is 'key: value'; a missing colon raises ValueError.
            name, raw_score = entry.split(':')
            name = name.strip()
            score = int(raw_score.strip())

            # Reject keys not named 'score*' or values outside the range.
            if not name.startswith('score') or not 0 <= score <= 10:
                return False

            parsed[name] = score

        # Require a contiguous score1..scoreK key set.
        expected_keys = [f'score{index + 1}' for index in range(len(entries))]
        return all(key in parsed for key in expected_keys)

    except (ValueError, SyntaxError):
        # Any parsing error means the format is invalid.
        return False
160
+
161
+
162
+ # +=========================================================================================
163
def mllm_output_to_dict(input_string, give_up_parsing=False):
    """Parse an MLLM judge response into a dict with 'score' and 'reasoning'.

    The model is asked to wrap its JSON answer between '||V^=^V||'
    delimiters, but many models do not comply, so fallbacks are tried in
    order: delimited JSON, a bare JSON object, a bare two-score list like
    ``[6, 0]``, or a single integer score.

    Args:
        input_string (str): The raw output of the MLLM model to be parsed.
        give_up_parsing (bool): If True and no JSON content is found,
            substitute a random guessed score in [0, 10] instead of failing.

    Returns:
        dict: Parsed content; 'score' is always normalized to a list.
        str: The literal 'rate_limit_exceeded' sentinel, passed through.
        bool: False when parsing failed.
    """
    # Catch for gpt4v rate_limit_exceeded error
    if input_string == 'rate_limit_exceeded':
        return 'rate_limit_exceeded'

    # Define the delimiters
    delimiter = '||V^=^V||'

    if input_string.count(delimiter) == 2:
        if not verify(input_string, delimiter):
            logger.info('The required delimiters were not found correctly in the string.')
            return False
        # Extract the content between the delimiters
        start_index = input_string.find(delimiter) + len(delimiter)
        end_index = input_string.rfind(delimiter)
    else:
        # Find the JSON manually: some MLLMs omit the delimiters but still
        # output the JSON content.
        start_index = input_string.find('{')
        end_index = input_string.rfind('}') + 1
        if start_index == -1 or end_index == 0:
            # JSON object not found. Some MLLMs output only a list of scores
            # like [6, 0]; take the scores and ignore the reasoning.
            start_index = input_string.find('[')
            end_index = input_string.rfind(']') + 1
            if give_up_parsing:  # if we want to give up parsing
                guessed_value = random.randint(0, 10)
                logger.info(f'Failed to find the json content in the string. Guess a value : {guessed_value}.')
                json_content = {'score': [guessed_value], 'reasoning': f'guess_if_cannot_parse | {input_string}'}
                json_str = json.dumps(json_content)
                input_string = json_str
                start_index = 0
                end_index = len(json_str)
            elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
                scores = json.loads(input_string[start_index:end_index])
                if not isinstance(scores, list):
                    scores = [scores]
                json_content = {'score': scores, 'reasoning': 'System: output is simply a list of scores'}
                json_str = json.dumps(json_content)
                input_string = json_str
                start_index = 0
                end_index = len(json_str)
            elif is_int_between_0_and_10(input_string):  # if output is simply a number
                scores = [int(input_string)]
                json_content = {'score': scores, 'reasoning': 'System: output is simply a number'}
                json_str = json.dumps(json_content)
                input_string = json_str
                start_index = 0
                end_index = len(json_str)
            else:
                logger.info('Failed to find the json content in the string.')
                return False

    # Check that we found a usable, non-empty span
    if start_index != -1 and end_index != -1 and start_index != end_index:
        # Extract the JSON string
        json_str = input_string[start_index:end_index].strip()
        json_str = json_str.replace('\n', '')
        # Parse the JSON string into a dictionary
        try:
            new_data = json.loads(json_str)
            if not isinstance(new_data['score'], list):
                new_data['score'] = [new_data['score']]
        except Exception:
            # BUGFIX: pass json_str as a lazy %-arg. The original called
            # logger.info('Now fixing: ', json_str), which logging treats as a
            # format argument for a format string with no placeholder, so the
            # payload was never rendered.
            logger.info('Now fixing: %s', json_str)
            try:
                new_data = json.loads(fix_json(json_str))
                # BUGFIX (consistency): the main path always normalizes
                # 'score' to a list; do the same after a repair.
                if 'score' in new_data and not isinstance(new_data['score'], list):
                    new_data['score'] = [new_data['score']]
                return new_data
            except Exception:
                logger.info('Error: Cannot fix %s', json_str)
                return False
        return new_data
    else:
        logger.info('The required delimiters were not found correctly in the string.')
        return False
245
+
246
+
247
def write_entry_to_json_file(input_string, uid, prompt_input, vision_input, output_file_name, give_up_parsing=False):
    """
    Parse an MLLM judge response and merge it into a JSON results file.

    Parsing mirrors ``mllm_output_to_dict``: it looks for content between
    '||V^=^V||' delimiters, then falls back to a bare JSON object, a bare
    two-score list, or a single integer. The parsed entry is then written
    under *uid* in *output_file_name* (created if absent, updated if the
    uid already exists).

    Args:
        input_string (str): actually the output of the mllm model to be parsed
        uid (str): The unique identifier for the each item in the test data
        prompt_input (str): The prompt input for the entry. text prompt.
        vision_input (str): The vision input for the entry. image links.
        output_file_name (str): The name of the output file.
        give_up_parsing (bool): If True and no JSON is found, substitute a
            random guessed score instead of failing.

    Returns:
        True on success, False on any parse/IO failure, or the literal
        'rate_limit_exceeded' sentinel passed through.
    """
    # Catch for gpt4v rate_limit_exceeded error
    if input_string == 'rate_limit_exceeded':
        return 'rate_limit_exceeded'

    # Define the delimiters
    delimiter = '||V^=^V||'

    if input_string.count(delimiter) == 2:
        if not verify(input_string, delimiter):
            logger.info('The required delimiters were not found correctly in the string.')
            return False
        # Extract the content between the delimiters
        start_index = input_string.find(delimiter) + len(delimiter)
        end_index = input_string.rfind(delimiter)
    else:
        # Find the JSON manually: some MLLMs omit the delimiters but still
        # output the JSON content.
        start_index = input_string.find('{')
        end_index = input_string.rfind('}') + 1
        if start_index == -1 or end_index == 0:
            # JSON object not found. Some MLLMs output only a list of scores
            # like [6, 0]; take the scores and ignore the reasoning.
            start_index = input_string.find('[')
            end_index = input_string.rfind(']') + 1
            if give_up_parsing:  # if we want to give up parsing
                guessed_value = random.randint(0, 10)
                logger.info(f'Failed to find the json content in the string. Guess a value : {guessed_value}.')
                json_content = {'score': [guessed_value], 'reasoning': f'guess_if_cannot_parse | {input_string}'}
                json_str = json.dumps(json_content)
                input_string = json_str
                start_index = 0
                end_index = len(json_str)
            elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
                # NOTE(review): unlike mllm_output_to_dict, 'reasoning' is left
                # as None here and the list result is not re-wrapped — the two
                # parsers are intentionally(?) slightly different; confirm.
                scores = json.loads(input_string[start_index:end_index])
                json_content = {'score': scores, 'reasoning': None}
                json_str = json.dumps(json_content)
                input_string = json_str
                start_index = 0
                end_index = len(json_str)
            elif is_int_between_0_and_10(input_string):  # if output is simply a number
                scores = [int(input_string)]
                json_content = {'score': scores, 'reasoning': None}
                json_str = json.dumps(json_content)
                input_string = json_str
                start_index = 0
                end_index = len(json_str)
            else:
                logger.info('Failed to find the json content in the string.')
                return False

    # Check if we found two delimiters (i.e. a usable, non-empty span)
    if start_index != -1 and end_index != -1 and start_index != end_index:
        # Extract the JSON string
        json_str = input_string[start_index:end_index].strip()
        json_str = json_str.replace('\n', '')
        try:
            # Parse the JSON string into a dictionary
            new_data = json.loads(json_str)

            # Ensure the directory exists
            os.makedirs(os.path.dirname(output_file_name), exist_ok=True)

            # Initialize or load existing data (read-modify-write; not safe
            # under concurrent writers).
            if os.path.exists(output_file_name):
                with open(output_file_name, 'r') as json_file:
                    data = json.load(json_file)
            else:
                data = {}

            # If the uid is already in the data, merge the new fields in
            if uid in data:
                data[uid].update(new_data)  # Update with new data
                if prompt_input:  # If there are new notes, update or add them
                    data[uid]['prompt_input'] = prompt_input
                if vision_input:  # If there are new notes, update or add them
                    data[uid]['vision_input'] = vision_input
            else:
                # If it's a new key, add the entry to the dictionary
                data[uid] = new_data
                if prompt_input:
                    data[uid]['prompt_input'] = prompt_input
                if vision_input:
                    data[uid]['vision_input'] = vision_input

            # Write the updated data to the file
            with open(output_file_name, 'w') as json_file:
                json.dump(data, json_file, indent=4)

            logger.info(f'Data was successfully updated in {output_file_name}')
            return True
        except json.JSONDecodeError as e:
            logger.info(f'An error occurred while parsing the JSON content: {e}')
            return False
    else:
        logger.info('The required delimiters were not found correctly in the string.')
        return False
354
+
355
+
356
def check_key_in_json(file_path, key):
    """Return True iff *key* exists at the top level of the JSON file at *file_path*.

    Returns False when the file is missing, is not valid JSON, or any other
    error occurs (the problem is logged).

    :param file_path: Path to a JSON file.
    :param key: Key to look up at the top level of the parsed structure.
    :return: True if the key is present, otherwise False.
    """
    try:
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)

        # Idiom fix: return the membership test directly instead of an
        # if/else that returns literal booleans.
        return key in data
    except FileNotFoundError:
        logger.info(f'The file {file_path} was not found.')
    except json.JSONDecodeError as e:
        logger.info(f'Error reading {file_path}: {e}')
    except Exception as e:
        logger.info(f'An error occurred with {file_path}: {e}')
    return False