evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (214) hide show
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
1
- # Copyright LiveCodeBench @ 2024,
2
-
1
+ # flake8: noqa
3
2
  import ast
4
3
  import faulthandler
5
4
  import json
@@ -8,23 +7,30 @@ import platform
8
7
  # to run the solution files we're using a timing based approach
9
8
  import signal
10
9
  import sys
10
+ import time
11
11
  # used for debugging to time steps
12
12
  from datetime import datetime
13
+ from decimal import Decimal
13
14
  from enum import Enum
14
- # for capturing the stdout
15
+ from functools import partial
15
16
  from io import StringIO
16
-
17
- try:
18
- from pyext import RuntimeModule
19
- except Exception:
20
- print('pyext not found, please install with `pip install pyext`')
21
- pyext = None
17
+ # from pyext import RuntimeModule
18
+ from types import ModuleType
22
19
  # used for testing the code that reads from input
23
20
  from unittest.mock import mock_open, patch
24
21
 
22
+ from evalscope.utils.logger import get_logger
23
+
24
+ logger = get_logger()
25
+
26
+ import_string = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n'
27
+
25
28
 
26
29
  def truncatefn(s, length=300):
27
- assert isinstance(s, str)
30
+ if isinstance(s, str):
31
+ pass
32
+ else:
33
+ s = str(s)
28
34
  if len(s) <= length:
29
35
  return s
30
36
 
@@ -41,17 +47,12 @@ class TimeoutException(Exception):
41
47
  pass
42
48
 
43
49
 
44
- def timeout_handler(signum, frame):
45
- print('alarm went off')
46
- # return
50
+ def timeout_handler(debug, signum, frame):
51
+ if debug:
52
+ logger.info('timeout occured: alarm went off')
47
53
  raise TimeoutException
48
54
 
49
55
 
50
- signal.signal(signal.SIGALRM, timeout_handler)
51
-
52
- # timeout = 6 # seconds
53
-
54
-
55
56
  # used to capture stdout as a list
56
57
  # from https://stackoverflow.com/a/16571630/6416660
57
58
  # alternative use redirect_stdout() from contextlib
@@ -70,587 +71,405 @@ class Capturing(list):
70
71
  sys.stdout = self._stdout
71
72
 
72
73
 
73
- def only_int_check(val):
74
- return isinstance(val, int)
74
+ def clean_if_name(code: str) -> str:
75
+ try:
76
+ astree = ast.parse(code)
77
+ last_block = astree.body[-1]
78
+ if isinstance(last_block, ast.If):
79
+ condition = last_block.test
80
+ if ast.unparse(condition).strip() == "__name__ == '__main__'":
81
+ code = (
82
+ ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body) # type: ignore
83
+ )
84
+ except:
85
+ pass
75
86
 
87
+ return code
76
88
 
77
- def string_int_check(val):
78
- return isinstance(val, str) and val.isdigit()
79
89
 
90
+ def make_function(code: str) -> str:
91
+ try:
92
+ import_stmts = []
93
+ all_other_stmts = []
94
+ astree = ast.parse(code)
95
+ for stmt in astree.body:
96
+ if isinstance(stmt, (ast.Import, ast.ImportFrom)):
97
+ import_stmts.append(stmt)
98
+ else:
99
+ all_other_stmts.append(stmt)
100
+
101
+ function_ast = ast.FunctionDef(
102
+ name='wrapped_function',
103
+ args=ast.arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]),
104
+ body=all_other_stmts,
105
+ decorator_list=[],
106
+ lineno=-1,
107
+ )
108
+ main_code = (
109
+ import_string + '\n' + ast.unparse(import_stmts) # type: ignore
110
+ + '\n' + ast.unparse(function_ast) # type: ignore
111
+ )
112
+ return main_code
113
+ except Exception as e:
114
+ return code
80
115
 
81
- def combined_int_check(val):
82
- return only_int_check(val) or string_int_check(val)
83
116
 
117
+ def call_method(method, inputs):
84
118
 
85
- def run_test(sample, test=None, debug=False, timeout=6):
86
- """if test(generated_code) is not None it'll try to run the code.
119
+ if isinstance(inputs, list):
120
+ inputs = '\n'.join(inputs)
121
+
122
+ inputs_line_iterator = iter(inputs.split('\n'))
123
+
124
+ # sys.setrecursionlimit(10000)
125
+
126
+ # @patch('builtins.input', side_effect=inputs.split("\n"))
127
+ @patch('builtins.open', mock_open(read_data=inputs))
128
+ @patch('sys.stdin', StringIO(inputs))
129
+ @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
130
+ @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
131
+ @patch('sys.stdin.read', lambda *args: inputs)
132
+ # @patch('sys.stdout.write', print)
133
+ def _inner_call_method(_method):
134
+ try:
135
+ return _method()
136
+ except SystemExit as e:
137
+ pass
138
+ finally:
139
+ pass
140
+
141
+ return _inner_call_method(method)
142
+
143
+
144
+ def get_function(compiled_sol, fn_name: str): # type: ignore
145
+ try:
146
+ assert hasattr(compiled_sol, fn_name)
147
+ return getattr(compiled_sol, fn_name)
148
+ except Exception as e:
149
+ return
150
+
151
+
152
+ def compile_code(code: str, timeout: int):
153
+ signal.alarm(timeout)
154
+ try:
155
+ tmp_sol = ModuleType('tmp_sol', '')
156
+ exec(code, tmp_sol.__dict__)
157
+ if 'class Solution' in code:
158
+ # leetcode wraps solutions in `Solution`
159
+ # this is a hack to check if it is leetcode solution or not
160
+ # currently livecodebench only supports LeetCode but
161
+ # else condition allows future extensibility to other platforms
162
+ compiled_sol = tmp_sol.Solution()
163
+ else:
164
+ # do nothing in the other case since function is accesible
165
+ compiled_sol = tmp_sol
166
+
167
+ assert compiled_sol is not None
168
+ finally:
169
+ signal.alarm(0)
170
+
171
+ return compiled_sol
172
+
173
+
174
+ def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
175
+ try:
176
+ decimal_line = [Decimal(elem) for elem in line.split()]
177
+ except:
178
+ return False, []
179
+ return True, decimal_line
180
+
181
+
182
+ def get_stripped_lines(val: str):
183
+ ## you don't want empty lines to add empty list after splitlines!
184
+ val = val.strip()
185
+
186
+ return [val_line.strip() for val_line in val.split('\n')]
187
+
188
+
189
+ def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int):
190
+ # call-based clean up logic
191
+ # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
192
+ code = import_string + '\n\n' + code
193
+ compiled_sol = compile_code(code, timeout)
194
+
195
+ if compiled_sol is None:
196
+ return
197
+
198
+ method = get_function(compiled_sol, fn_name)
199
+
200
+ if method is None:
201
+ return
202
+
203
+ all_inputs = [[json.loads(line) for line in inputs.split('\n')] for inputs in all_inputs]
204
+
205
+ all_outputs = [json.loads(output) for output in all_outputs]
206
+
207
+ total_execution = 0
208
+ all_results = []
209
+ for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
210
+ signal.alarm(timeout)
211
+ # faulthandler.enable()
212
+ try:
213
+ # can lock here so time is useful
214
+ start = time.time()
215
+ prediction = method(*gt_inp)
216
+ total_execution += time.time() - start
217
+ signal.alarm(0)
218
+
219
+ # don't penalize model if it produces tuples instead of lists
220
+ # ground truth sequences are not tuples
221
+ if isinstance(prediction, tuple):
222
+ prediction = list(prediction)
223
+
224
+ tmp_result = prediction == gt_out
225
+
226
+ # handle floating point comparisons
227
+
228
+ all_results.append(tmp_result)
229
+
230
+ if not tmp_result:
231
+ return all_results, {
232
+ 'output': truncatefn(prediction),
233
+ 'inputs': truncatefn(gt_inp),
234
+ 'expected': truncatefn(gt_out),
235
+ 'error_code': -2,
236
+ 'error_message': 'Wrong Answer',
237
+ }
238
+ except Exception as e:
239
+ signal.alarm(0)
240
+ if 'timeoutexception' in repr(e).lower():
241
+ all_results.append(-3)
242
+ return all_results, {
243
+ 'error': repr(e),
244
+ 'error_code': -3,
245
+ 'error_message': 'Time Limit Exceeded',
246
+ 'inputs': truncatefn(gt_inp),
247
+ 'expected': truncatefn(gt_out),
248
+ }
249
+ else:
250
+ all_results.append(-4)
251
+ return all_results, {
252
+ 'error': repr(e),
253
+ 'error_code': -4,
254
+ 'error_message': 'Runtime Error',
255
+ 'inputs': truncatefn(gt_inp),
256
+ 'expected': truncatefn(gt_out),
257
+ }
258
+
259
+ finally:
260
+ signal.alarm(0)
261
+ # faulthandler.disable()
262
+
263
+ return all_results, {'execution time': total_execution}
264
+
265
+
266
+ def grade_stdio(
267
+ code: str,
268
+ all_inputs: list,
269
+ all_outputs: list,
270
+ timeout: int,
271
+ ):
272
+ ## runtime doesn't interact well with __name__ == '__main__'
273
+ code = clean_if_name(code)
274
+
275
+ ## we wrap the given code inside another function
276
+ code = make_function(code)
277
+
278
+ compiled_sol = compile_code(code, timeout)
279
+ if compiled_sol is None:
280
+ return
281
+
282
+ method = get_function(compiled_sol, 'wrapped_function')
87
283
 
284
+ if method is None:
285
+ return
286
+
287
+ all_results = []
288
+ total_execution_time = 0
289
+ for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
290
+ signal.alarm(timeout)
291
+ # faulthandler.enable()
292
+
293
+ with Capturing() as captured_output:
294
+ try:
295
+ start = time.time()
296
+ call_method(method, gt_inp)
297
+ total_execution_time += time.time() - start
298
+ # reset the alarm
299
+ signal.alarm(0)
300
+ except Exception as e:
301
+ signal.alarm(0)
302
+ if 'timeoutexception' in repr(e).lower():
303
+ all_results.append(-3)
304
+ return all_results, {
305
+ 'error': repr(e),
306
+ 'error_code': -3,
307
+ 'error_message': 'Time Limit Exceeded',
308
+ 'inputs': truncatefn(gt_inp),
309
+ 'expected': truncatefn(gt_out),
310
+ }
311
+ else:
312
+ all_results.append(-4)
313
+ return all_results, {
314
+ 'error': repr(e),
315
+ 'error_code': -4,
316
+ 'error_message': 'Runtime Error',
317
+ 'inputs': truncatefn(gt_inp),
318
+ 'expected': truncatefn(gt_out),
319
+ }
320
+
321
+ finally:
322
+ signal.alarm(0)
323
+ # faulthandler.disable()
324
+
325
+ prediction = captured_output[0]
326
+
327
+ stripped_prediction_lines = get_stripped_lines(prediction)
328
+ stripped_gt_out_lines = get_stripped_lines(gt_out)
329
+
330
+ ## WA happens in multiple circumstances
331
+ ## so cache the return to make it clean!
332
+ WA_send_args = {
333
+ 'output': truncatefn(prediction),
334
+ 'inputs': truncatefn(gt_inp),
335
+ 'expected': truncatefn(gt_out),
336
+ 'error_code': -2,
337
+ }
338
+
339
+ if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
340
+ all_results.append(-2)
341
+ WA_send_args['error_message'] = 'Wrong answer: mismatched output length'
342
+ return all_results, WA_send_args
343
+
344
+ for output_line_idx, (
345
+ stripped_prediction_line,
346
+ stripped_gt_out_line,
347
+ ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
348
+ WA_send_args['error_message'] = (
349
+ f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
350
+ )
351
+
352
+ ## CASE 1: exact match
353
+ if stripped_prediction_line == stripped_gt_out_line:
354
+ continue
355
+
356
+ ## CASE 2: element-wise comparision
357
+ ## if there are floating elements
358
+ ## use `decimal` library for good floating point comparision
359
+ ## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
360
+ ## note that we should always be able to convert to decimals
361
+
362
+ success, decimal_prediction_line = convert_line_to_decimals(stripped_prediction_line)
363
+ if not success:
364
+ all_results.append(-2)
365
+ return all_results, WA_send_args
366
+ success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
367
+ if not success:
368
+ all_results.append(-2)
369
+ return all_results, WA_send_args
370
+
371
+ if decimal_prediction_line == decimal_gtout_line:
372
+ continue
373
+
374
+ all_results.append(-2)
375
+ return all_results, WA_send_args
376
+ all_results.append(True)
377
+
378
+ return all_results, {'execution time': total_execution_time}
379
+
380
+
381
+ def run_test(sample, test=None, debug=False, timeout=6):
382
+ """
383
+ if test(generated_code) is not None it'll try to run the code.
88
384
  otherwise it'll just return an input and output pair.
89
385
  """
386
+ timeout_handler_wrapper = partial(timeout_handler, debug)
387
+ signal.signal(signal.SIGALRM, timeout_handler_wrapper)
388
+
90
389
  # Disable functionalities that can make destructive changes to the test.
390
+ # max memory is set to 4GB
91
391
  reliability_guard()
92
392
 
93
393
  if debug:
94
- print(f'start = {datetime.now().time()}')
394
+ logger.info(f'start = {datetime.now().time()}')
95
395
 
96
396
  try:
97
397
  in_outs = json.loads(sample['input_output'])
98
- except ValueError:
398
+ except ValueError as e:
399
+ raise e
99
400
  in_outs = None
401
+
100
402
  if in_outs:
101
403
  if in_outs.get('fn_name') is None:
102
404
  which_type = CODE_TYPE.standard_input # Standard input
103
405
  method_name = None
406
+
104
407
  else:
105
408
  which_type = CODE_TYPE.call_based # Call-based
106
409
  method_name = in_outs['fn_name']
107
410
 
108
411
  if debug:
109
- print(f'loaded input_output = {datetime.now().time()}')
412
+ logger.info(f'loaded input_output = {datetime.now().time()}')
110
413
 
111
414
  if test is None:
112
415
  assert False, 'should not happen: test code is none'
113
416
  return in_outs, {'error': 'no test code provided'}
114
417
  elif test is not None:
115
418
  results = []
116
- sol = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n' # noqa: E501
419
+ sol = import_string
117
420
  if debug:
118
- print(f'loading test code = {datetime.now().time()}')
421
+ logger.info(f'loading test code = {datetime.now().time()}')
119
422
 
120
423
  if which_type == CODE_TYPE.call_based:
121
-
122
- sol += test
123
- if debug:
124
- print(f'sol = {sol}')
125
424
  signal.alarm(timeout)
126
425
  try:
127
- tmp_sol = RuntimeModule.from_string('tmp_sol', '', sol)
128
- if 'class Solution' not in test:
129
- tmp = tmp_sol
130
- else:
131
- tmp = tmp_sol.Solution()
132
- signal.alarm(0)
426
+ results, metadata = grade_call_based(
427
+ code=test,
428
+ all_inputs=in_outs['inputs'],
429
+ all_outputs=in_outs['outputs'],
430
+ fn_name=method_name,
431
+ timeout=timeout,
432
+ )
433
+ return results, metadata
133
434
  except Exception as e:
134
- signal.alarm(0)
135
- if debug:
136
- print(f'type 0 compilation error = {e}')
137
- results.append(-2)
138
- return results, {
139
- 'error': repr(e),
140
- 'error_code': -1,
141
- 'error_message': 'Compilation Error',
435
+ return [-4], {
436
+ 'error_code': -4,
437
+ 'error_message': f'Error during testing: {e}',
142
438
  }
143
- signal.alarm(0)
144
-
439
+ finally:
440
+ signal.alarm(0)
145
441
  elif which_type == CODE_TYPE.standard_input:
146
442
  # sol
147
443
  # if code has if __name__ == "__main__": then remove it
148
- try:
149
- astree = ast.parse(test)
150
- last_block = astree.body[-1]
151
- if isinstance(last_block, ast.If):
152
- condition = last_block.test
153
- if ast.unparse(condition).strip() == "__name__ == '__main__'":
154
- test = (ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body))
155
- except Exception as e: # noqa:
156
- pass
157
-
158
- tmp_test = test.split('\n')
159
-
160
- new_test = []
161
- for x in tmp_test:
162
- if (not x.startswith('from ')) and (not x.startswith('import ')):
163
- new_test.append('\t' + x + '\n')
164
- else:
165
- new_test.append(x + '\n')
166
- tmp_test = new_test
167
-
168
- new_test = ''
169
- started = False
170
- for i in tmp_test:
171
- if i.startswith('\t') and not started:
172
- new_test += 'stdin = sys.stdin\nstdout = sys.stdout\n'
173
- new_test += 'def code():\n'
174
- new_test += i
175
- started = True
176
- elif started and ((i.startswith('from ')) or (i.startswith('import '))):
177
- new_test += '\t' + i
178
- else:
179
- new_test += i
180
- tmp_test = new_test
181
444
 
182
- sol += tmp_test
183
- if debug:
184
- print(f'sol = {sol}')
185
- method_name = 'code'
186
445
  signal.alarm(timeout)
187
446
  try:
188
- tmp_sol = RuntimeModule.from_string('tmp_sol', '', sol)
189
- tmp = tmp_sol
190
- signal.alarm(0)
447
+ results, metadata = grade_stdio(
448
+ code=test,
449
+ all_inputs=in_outs['inputs'],
450
+ all_outputs=in_outs['outputs'],
451
+ timeout=timeout,
452
+ )
453
+ return results, metadata
191
454
  except Exception as e:
192
- signal.alarm(0)
193
- if debug:
194
- print(f'type 1 compilation error = {e}')
195
- results.append(-2)
196
- return results, {
197
- 'error': repr(e),
198
- 'error_code': -1,
199
- 'error_message': 'Compilation Error',
455
+ return [-4], {
456
+ 'error_code': -4,
457
+ 'error_message': f'Error during testing: {e}',
200
458
  }
201
- signal.alarm(0)
202
- if debug:
203
- print(f'get method = {datetime.now().time()}')
204
-
205
- try:
206
- method = getattr(tmp, method_name) # get_attr second arg must be str
207
- except Exception as e:
208
- signal.alarm(0)
209
- e = sys.exc_info()
210
- print(f'unable to get function error = {e}')
211
- results.append(-2)
212
- return results, {
213
- 'error': repr(e),
214
- 'error_code': -1,
215
- 'error_message': 'Unable to extract code',
216
- }
217
-
218
- for index, inputs in enumerate(in_outs['inputs']):
219
- raw_inputs = inputs
220
- raw_outputs = in_outs['outputs'][index]
221
- if which_type == CODE_TYPE.call_based:
222
- inputs = [json.loads(line) for line in inputs.split('\n')]
223
- in_outs['outputs'][index] = json.loads(in_outs['outputs'][index])
224
-
225
- truncate_line_size = 300 // (raw_inputs.count('\n') + 1)
226
- raw_inputs = '\n'.join(
227
- [truncatefn(line, truncate_line_size) for line in raw_inputs.strip().split('\n')])
228
- raw_outputs = truncatefn(raw_outputs, 200)
229
- else:
230
- raw_inputs = truncatefn(raw_inputs)
231
- raw_outputs = truncatefn(raw_outputs, 200)
232
- # JSON forces dictionaries to have string keys; this undoes this
233
- # (assuming a singleton list)
234
- try:
235
- if isinstance(inputs[0], dict):
236
- inputs = [{int(k): v for k, v in inputs[0].items()}]
237
- except Exception as e: # noqa: F841
238
- True
239
- try:
240
- if isinstance(in_outs['outputs'][index], dict):
241
- in_outs['outputs'][index] = [{int(k): v for k, v in in_outs['outputs'][index].items()}]
242
- except Exception as e: # noqa: F841
243
- True
244
- try:
245
- if isinstance(in_outs['outputs'][index][0], dict):
246
- in_outs['outputs'][index] = [{int(k): v for k, v in in_outs['outputs'][index][0].items()}]
247
- except Exception as e: # noqa: F841
248
- True
249
-
250
- if debug:
251
- print(f'time: {datetime.now().time()} testing index = {index} '
252
- f'inputs = {inputs}, {type(inputs)}. type = {which_type}')
253
- if which_type == CODE_TYPE.call_based: # Call-based
254
- signal.alarm(timeout)
255
- faulthandler.enable()
256
- try:
257
- output = method(*inputs)
258
- raw_true_output = output
259
-
260
- raw_true_output_copy = json.dumps(output)
261
- raw_true_output_copy = truncatefn(raw_true_output_copy, 200)
262
-
263
- # ground truth sequences are not tuples
264
- if isinstance(output, tuple):
265
- output = list(output)
266
-
267
- tmp_result = output == in_outs['outputs'][index]
268
- if (isinstance(in_outs['outputs'][index], list) and in_outs['outputs'][index]):
269
- tmp_result = tmp_result or (output == in_outs['outputs'][index][0])
270
-
271
- # ground truth sequences are not tuples
272
- try:
273
- if isinstance(output[0], tuple):
274
- tmp_result = tmp_result or ([list(x) for x in output] == in_outs['outputs'][index][0])
275
- except Exception as e: # noqa: F841
276
- True
277
- results.append(tmp_result)
278
- if tmp_result is not True:
279
- return results, {
280
- 'output': raw_true_output_copy,
281
- 'expected': raw_outputs,
282
- 'inputs': raw_inputs,
283
- 'error_code': -2,
284
- 'error_message': 'Wrong Answer',
285
- }
286
- # reset the alarm
287
- signal.alarm(0)
288
- except Exception as e:
289
- signal.alarm(0)
290
- faulthandler.disable()
291
- if debug:
292
- print(f'Standard input runtime error or time limit exceeded error = {e}' # noqa: E501
293
- )
294
- results.append(-1)
295
- if 'timeoutexception' in repr(e).lower():
296
- return results, {
297
- 'error': repr(e),
298
- 'error_code': -3,
299
- 'error_message': 'Time Limit Exceeded',
300
- 'inputs': raw_inputs,
301
- 'expected': raw_outputs,
302
- }
303
- else:
304
- return results, {
305
- 'error': repr(e),
306
- 'error_code': -4,
307
- 'error_message': 'Runtime Error',
308
- 'inputs': raw_inputs,
309
- 'expected': raw_outputs,
310
- }
311
- faulthandler.disable()
459
+ finally:
312
460
  signal.alarm(0)
313
- if debug:
314
- print(
315
- f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
316
- )
317
- elif which_type == CODE_TYPE.standard_input: # Standard input
318
- faulthandler.enable()
319
- passed = False
320
-
321
- if isinstance(inputs, list):
322
- inputs = '\n'.join(inputs)
323
- if isinstance(in_outs['outputs'][index], list):
324
- in_outs['outputs'][index] = '\n'.join(in_outs['outputs'][index])
325
-
326
- signal.alarm(timeout)
327
- with Capturing() as output:
328
- try:
329
- call_method(method, inputs)
330
- # reset the alarm
331
- signal.alarm(0)
332
- passed = True
333
- except Exception as e:
334
- # runtime error or took too long
335
- signal.alarm(0)
336
- print(f'Call-based runtime error or time limit exceeded error = {repr(e)}{e}' # noqa: E501
337
- )
338
- results.append(-1)
339
- if 'timeoutexception' in repr(e).lower():
340
- return results, {
341
- 'error': repr(e),
342
- 'error_code': -3,
343
- 'error_message': 'Time Limit Exceeded',
344
- 'inputs': raw_inputs,
345
- 'expected': raw_outputs,
346
- }
347
- else:
348
- return results, {
349
- 'error': repr(e),
350
- 'error_code': -4,
351
- 'error_message': 'Runtime Error',
352
- 'inputs': raw_inputs,
353
- 'expected': raw_outputs,
354
- }
355
- signal.alarm(0)
356
- raw_true_output = output[0]
357
- raw_true_output_copy = truncatefn(raw_true_output, 200)
358
- output = raw_true_output.splitlines()
359
- if not passed:
360
- if debug:
361
- nl = '\n'
362
- if not isinstance(inputs, list):
363
- print(
364
- f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
365
- )
366
- else:
367
- print(
368
- f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
369
- )
370
- continue
371
-
372
- if passed and debug:
373
- print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}" # noqa: E501
374
- )
375
-
376
- if custom_compare_(output, in_outs['outputs'][index]):
377
- tmp_result = True
378
- results.append(tmp_result)
379
- continue
380
-
381
- # ground truth sequences are expressed as lists not tuples
382
- if isinstance(output, tuple):
383
- output = list(output)
384
-
385
- tmp_result = False
386
- try:
387
- tmp_result = output == [in_outs['outputs'][index]]
388
- if isinstance(in_outs['outputs'][index], list):
389
- tmp_result = tmp_result or (output == in_outs['outputs'][index])
390
- if isinstance(output[0], str):
391
- tmp_result = tmp_result or ([e.strip() for e in output] == in_outs['outputs'][index])
392
- except Exception as e:
393
- if debug:
394
- print(f'Failed check1 exception = {e}')
395
- pass
396
-
397
- if tmp_result is True:
398
- results.append(tmp_result)
399
- continue
400
-
401
- # try one more time without \n
402
- if isinstance(in_outs['outputs'][index], list):
403
- for tmp_index, i in enumerate(in_outs['outputs'][index]):
404
- in_outs['outputs'][index][tmp_index] = i.split('\n')
405
- in_outs['outputs'][index][tmp_index] = [
406
- x.strip() for x in in_outs['outputs'][index][tmp_index] if x
407
- ]
408
- else:
409
- in_outs['outputs'][index] = in_outs['outputs'][index].split('\n')
410
- in_outs['outputs'][index] = list(filter(len, in_outs['outputs'][index]))
411
- in_outs['outputs'][index] = list(map(lambda x: x.strip(), in_outs['outputs'][index]))
412
-
413
- try:
414
- tmp_result = output == [in_outs['outputs'][index]]
415
- if isinstance(in_outs['outputs'][index], list):
416
- tmp_result = tmp_result or (output == in_outs['outputs'][index])
417
- except Exception as e:
418
- if debug:
419
- print(f'Failed check2 exception = {e}')
420
- pass
421
-
422
- if tmp_result is True:
423
- results.append(tmp_result)
424
- continue
425
-
426
- # try by converting the output into a split up list too
427
- if isinstance(output, list):
428
- output = list(filter(len, output))
429
-
430
- if debug:
431
- nl = '\n'
432
- if not isinstance(inputs, list):
433
- print(
434
- f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" # noqa: E501
435
- )
436
- else:
437
- print(
438
- f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" # noqa: E501
439
- )
440
-
441
- if tmp_result is True:
442
- results.append(tmp_result)
443
- continue
444
-
445
- if debug:
446
- print(f'{tmp_result=} @a')
447
-
448
- try:
449
- tmp_result = output == [in_outs['outputs'][index]]
450
- if isinstance(in_outs['outputs'][index], list):
451
- tmp_result = tmp_result or (output == in_outs['outputs'][index])
452
- except Exception as e:
453
- if debug:
454
- print(f'Failed check3 exception = {e}')
455
- pass
456
-
457
- if debug:
458
- print(f'{tmp_result=} @b')
459
-
460
- try:
461
- all_ints = all(
462
- combined_int_check(e1) and combined_int_check(e2)
463
- for e1, e2 in zip(output, in_outs['outputs'][index]))
464
- if not all_ints:
465
- if debug:
466
- print([
467
- combined_int_check(e1) and combined_int_check(e2)
468
- for e1, e2 in zip(output, in_outs['outputs'][index])
469
- ])
470
- output_float = [float(e) for e in output]
471
- gt_float = [float(e) for e in in_outs['outputs'][index]]
472
- tmp_result = tmp_result or ((len(output_float) == len(gt_float))
473
- and np.allclose(output_float, gt_float))
474
- except Exception as e: # noqa: F841
475
- pass
476
-
477
- if debug:
478
- print(f'{tmp_result=} @c')
479
-
480
- try:
481
- if isinstance(output[0], list):
482
- all_ints = all(
483
- combined_int_check(e1) and combined_int_check(e2)
484
- for e1, e2 in zip(output[0], in_outs['outputs'][index]))
485
- if not all_ints:
486
- output_float = [float(e) for e in output[0]]
487
- gt_float = [float(e) for e in in_outs['outputs'][index][0]]
488
- tmp_result = tmp_result or ((len(output_float) == len(gt_float))
489
- and np.allclose(output_float, gt_float))
490
- except Exception as e: # noqa: F841
491
- pass
492
-
493
- if tmp_result is True:
494
- results.append(tmp_result)
495
- continue
496
-
497
- if debug:
498
- print(f'{tmp_result=} @d')
499
- # try by converting the stuff into split up list
500
- if isinstance(in_outs['outputs'][index], list):
501
- for tmp_index, i in enumerate(in_outs['outputs'][index]):
502
- in_outs['outputs'][index][tmp_index] = set(i.split())
503
- else:
504
- in_outs['outputs'][index] = set(in_outs['outputs'][index].split())
505
-
506
- if debug:
507
- print(f'{tmp_result=} @e')
508
-
509
- try:
510
- tmp_result = output == in_outs['outputs'][index]
511
- except Exception as e:
512
- if debug:
513
- print(f'Failed check4 exception = {e}')
514
- continue
515
-
516
- if tmp_result is True:
517
- results.append(tmp_result)
518
- continue
519
-
520
- if debug:
521
- print(f'{tmp_result=} @f')
522
-
523
- # try by converting the output into a split up list too
524
- if isinstance(output, list):
525
- for tmp_index, i in enumerate(output):
526
- output[tmp_index] = i.split()
527
- output = list(filter(len, output))
528
- for tmp_index, i in enumerate(output):
529
- output[tmp_index] = set(i)
530
- else:
531
- output = output.split()
532
- output = list(filter(len, output))
533
- output = set(output)
534
-
535
- if debug:
536
- print(f'{tmp_result=} @g')
537
- # try:
538
- # tmp_result = set(frozenset(s) for s in output) == set(
539
- # frozenset(s) for s in in_outs["outputs"][index]
540
- # )
541
- # except Exception as e:
542
- # if debug:
543
- # print(f"Failed check5 exception = {e}")
544
-
545
- # if they are all numbers, round so that similar numbers are
546
- # treated as identical
547
- # try:
548
- # all_ints = all(
549
- # combined_int_check(e1) and combined_int_check(e2)
550
- # for e1, e2 in zip(output, in_outs["outputs"][index])
551
- # )
552
- # tmp_result = tmp_result or (
553
- # set(
554
- # frozenset(round(float(t), 3) for t in s) for s in output)
555
- # == set(
556
- # frozenset(round(float(t), 3) for t in s)
557
- # for s in in_outs["outputs"][index]
558
- # )
559
- # )
560
- # except Exception as e:
561
- # if debug:
562
- # print(f"Failed check6 exception = {e}")
563
-
564
- if debug:
565
- print(f'{tmp_result=} @h')
566
-
567
- if tmp_result is True and debug:
568
- print('PASSED')
569
-
570
- results.append(tmp_result)
571
- if tmp_result is not True:
572
- return results, {
573
- 'output': raw_true_output_copy,
574
- 'expected': raw_outputs,
575
- 'inputs': raw_inputs,
576
- 'error_code': -2,
577
- 'error_message': 'Wrong Answer',
578
- }
579
-
580
- if debug:
581
- nl = '\n'
582
- if not isinstance(inputs, list):
583
- print(
584
- f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
585
- )
586
- else:
587
- print(
588
- f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
589
- )
590
-
591
- print(f'results = {results}')
592
-
593
- return results, {}
594
-
595
-
596
- def custom_compare_(output, ground_truth):
597
-
598
- if isinstance(output, list):
599
- output_1 = '\n'.join(output)
600
- if stripped_string_compare(output_1, ground_truth):
601
- return True
602
-
603
- if isinstance(output, list):
604
- output_2 = [o.lstrip().rstrip() for o in output]
605
- output_2 = '\n'.join(output_2)
606
- if stripped_string_compare(output_2, ground_truth):
607
- return True
608
-
609
- return False
610
-
611
-
612
- def stripped_string_compare(s1, s2):
613
- s1 = s1.lstrip().rstrip()
614
- s2 = s2.lstrip().rstrip()
615
- return s1 == s2
616
-
617
-
618
- def call_method(method, inputs):
619
-
620
- if isinstance(inputs, list):
621
- inputs = '\n'.join(inputs)
622
-
623
- inputs_line_iterator = iter(inputs.split('\n'))
624
-
625
- # sys.setrecursionlimit(10000)
626
-
627
- # @patch('builtins.input', side_effect=inputs.split("\n"))
628
- @patch('builtins.open', mock_open(read_data=inputs))
629
- @patch('sys.stdin', StringIO(inputs))
630
- @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
631
- @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
632
- @patch('sys.stdin.read', lambda *args: inputs)
633
- # @patch('sys.stdout.write', print)
634
- def _inner_call_method(_method):
635
- try:
636
- return _method()
637
- except SystemExit as e: # noqa: F841
638
- pass
639
- finally:
640
- pass
641
-
642
- return _inner_call_method(method)
643
461
 
644
462
 
645
463
  def reliability_guard(maximum_memory_bytes=None):
646
- """This disables various destructive functions and prevents the generated
647
- code from interfering with the test (e.g. fork bomb, killing other
648
- processes, removing filesystem files, etc.) WARNING This function is NOT a
649
- security sandbox.
650
-
651
- Untrusted code, including, model- generated code, should not be blindly
652
- executed outside of one. See the Codex paper for more information about
653
- OpenAI's code sandbox, and proceed with caution.
464
+ """
465
+ This disables various destructive functions and prevents the generated code
466
+ from interfering with the test (e.g. fork bomb, killing other processes,
467
+ removing filesystem files, etc.)
468
+ WARNING
469
+ This function is NOT a security sandbox. Untrusted code, including, model-
470
+ generated code, should not be blindly executed outside of one. See the
471
+ Codex paper for more information about OpenAI's code sandbox, and proceed
472
+ with caution.
654
473
  """
655
474
 
656
475
  if maximum_memory_bytes is not None:
@@ -661,11 +480,11 @@ def reliability_guard(maximum_memory_bytes=None):
661
480
  if not platform.uname().system == 'Darwin':
662
481
  resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
663
482
 
664
- faulthandler.disable()
483
+ # faulthandler.disable()
665
484
 
666
485
  import builtins
667
486
 
668
- builtins.exit = None
487
+ # builtins.exit = None
669
488
  builtins.quit = None
670
489
 
671
490
  import os