evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
1
|
+
# flake8: noqa
|
|
3
2
|
import ast
|
|
4
3
|
import faulthandler
|
|
5
4
|
import json
|
|
@@ -8,23 +7,30 @@ import platform
|
|
|
8
7
|
# to run the solution files we're using a timing based approach
|
|
9
8
|
import signal
|
|
10
9
|
import sys
|
|
10
|
+
import time
|
|
11
11
|
# used for debugging to time steps
|
|
12
12
|
from datetime import datetime
|
|
13
|
+
from decimal import Decimal
|
|
13
14
|
from enum import Enum
|
|
14
|
-
|
|
15
|
+
from functools import partial
|
|
15
16
|
from io import StringIO
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
from pyext import RuntimeModule
|
|
19
|
-
except Exception:
|
|
20
|
-
print('pyext not found, please install with `pip install pyext`')
|
|
21
|
-
pyext = None
|
|
17
|
+
# from pyext import RuntimeModule
|
|
18
|
+
from types import ModuleType
|
|
22
19
|
# used for testing the code that reads from input
|
|
23
20
|
from unittest.mock import mock_open, patch
|
|
24
21
|
|
|
22
|
+
from evalscope.utils.logger import get_logger
|
|
23
|
+
|
|
24
|
+
logger = get_logger()
|
|
25
|
+
|
|
26
|
+
import_string = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n'
|
|
27
|
+
|
|
25
28
|
|
|
26
29
|
def truncatefn(s, length=300):
|
|
27
|
-
|
|
30
|
+
if isinstance(s, str):
|
|
31
|
+
pass
|
|
32
|
+
else:
|
|
33
|
+
s = str(s)
|
|
28
34
|
if len(s) <= length:
|
|
29
35
|
return s
|
|
30
36
|
|
|
@@ -41,17 +47,12 @@ class TimeoutException(Exception):
|
|
|
41
47
|
pass
|
|
42
48
|
|
|
43
49
|
|
|
44
|
-
def timeout_handler(signum, frame):
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
def timeout_handler(debug, signum, frame):
|
|
51
|
+
if debug:
|
|
52
|
+
logger.info('timeout occured: alarm went off')
|
|
47
53
|
raise TimeoutException
|
|
48
54
|
|
|
49
55
|
|
|
50
|
-
signal.signal(signal.SIGALRM, timeout_handler)
|
|
51
|
-
|
|
52
|
-
# timeout = 6 # seconds
|
|
53
|
-
|
|
54
|
-
|
|
55
56
|
# used to capture stdout as a list
|
|
56
57
|
# from https://stackoverflow.com/a/16571630/6416660
|
|
57
58
|
# alternative use redirect_stdout() from contextlib
|
|
@@ -70,587 +71,405 @@ class Capturing(list):
|
|
|
70
71
|
sys.stdout = self._stdout
|
|
71
72
|
|
|
72
73
|
|
|
73
|
-
def
|
|
74
|
-
|
|
74
|
+
def clean_if_name(code: str) -> str:
|
|
75
|
+
try:
|
|
76
|
+
astree = ast.parse(code)
|
|
77
|
+
last_block = astree.body[-1]
|
|
78
|
+
if isinstance(last_block, ast.If):
|
|
79
|
+
condition = last_block.test
|
|
80
|
+
if ast.unparse(condition).strip() == "__name__ == '__main__'":
|
|
81
|
+
code = (
|
|
82
|
+
ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body) # type: ignore
|
|
83
|
+
)
|
|
84
|
+
except:
|
|
85
|
+
pass
|
|
75
86
|
|
|
87
|
+
return code
|
|
76
88
|
|
|
77
|
-
def string_int_check(val):
|
|
78
|
-
return isinstance(val, str) and val.isdigit()
|
|
79
89
|
|
|
90
|
+
def make_function(code: str) -> str:
|
|
91
|
+
try:
|
|
92
|
+
import_stmts = []
|
|
93
|
+
all_other_stmts = []
|
|
94
|
+
astree = ast.parse(code)
|
|
95
|
+
for stmt in astree.body:
|
|
96
|
+
if isinstance(stmt, (ast.Import, ast.ImportFrom)):
|
|
97
|
+
import_stmts.append(stmt)
|
|
98
|
+
else:
|
|
99
|
+
all_other_stmts.append(stmt)
|
|
100
|
+
|
|
101
|
+
function_ast = ast.FunctionDef(
|
|
102
|
+
name='wrapped_function',
|
|
103
|
+
args=ast.arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]),
|
|
104
|
+
body=all_other_stmts,
|
|
105
|
+
decorator_list=[],
|
|
106
|
+
lineno=-1,
|
|
107
|
+
)
|
|
108
|
+
main_code = (
|
|
109
|
+
import_string + '\n' + ast.unparse(import_stmts) # type: ignore
|
|
110
|
+
+ '\n' + ast.unparse(function_ast) # type: ignore
|
|
111
|
+
)
|
|
112
|
+
return main_code
|
|
113
|
+
except Exception as e:
|
|
114
|
+
return code
|
|
80
115
|
|
|
81
|
-
def combined_int_check(val):
|
|
82
|
-
return only_int_check(val) or string_int_check(val)
|
|
83
116
|
|
|
117
|
+
def call_method(method, inputs):
|
|
84
118
|
|
|
85
|
-
|
|
86
|
-
|
|
119
|
+
if isinstance(inputs, list):
|
|
120
|
+
inputs = '\n'.join(inputs)
|
|
121
|
+
|
|
122
|
+
inputs_line_iterator = iter(inputs.split('\n'))
|
|
123
|
+
|
|
124
|
+
# sys.setrecursionlimit(10000)
|
|
125
|
+
|
|
126
|
+
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
|
127
|
+
@patch('builtins.open', mock_open(read_data=inputs))
|
|
128
|
+
@patch('sys.stdin', StringIO(inputs))
|
|
129
|
+
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
|
130
|
+
@patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
|
|
131
|
+
@patch('sys.stdin.read', lambda *args: inputs)
|
|
132
|
+
# @patch('sys.stdout.write', print)
|
|
133
|
+
def _inner_call_method(_method):
|
|
134
|
+
try:
|
|
135
|
+
return _method()
|
|
136
|
+
except SystemExit as e:
|
|
137
|
+
pass
|
|
138
|
+
finally:
|
|
139
|
+
pass
|
|
140
|
+
|
|
141
|
+
return _inner_call_method(method)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_function(compiled_sol, fn_name: str): # type: ignore
|
|
145
|
+
try:
|
|
146
|
+
assert hasattr(compiled_sol, fn_name)
|
|
147
|
+
return getattr(compiled_sol, fn_name)
|
|
148
|
+
except Exception as e:
|
|
149
|
+
return
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def compile_code(code: str, timeout: int):
|
|
153
|
+
signal.alarm(timeout)
|
|
154
|
+
try:
|
|
155
|
+
tmp_sol = ModuleType('tmp_sol', '')
|
|
156
|
+
exec(code, tmp_sol.__dict__)
|
|
157
|
+
if 'class Solution' in code:
|
|
158
|
+
# leetcode wraps solutions in `Solution`
|
|
159
|
+
# this is a hack to check if it is leetcode solution or not
|
|
160
|
+
# currently livecodebench only supports LeetCode but
|
|
161
|
+
# else condition allows future extensibility to other platforms
|
|
162
|
+
compiled_sol = tmp_sol.Solution()
|
|
163
|
+
else:
|
|
164
|
+
# do nothing in the other case since function is accesible
|
|
165
|
+
compiled_sol = tmp_sol
|
|
166
|
+
|
|
167
|
+
assert compiled_sol is not None
|
|
168
|
+
finally:
|
|
169
|
+
signal.alarm(0)
|
|
170
|
+
|
|
171
|
+
return compiled_sol
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
|
|
175
|
+
try:
|
|
176
|
+
decimal_line = [Decimal(elem) for elem in line.split()]
|
|
177
|
+
except:
|
|
178
|
+
return False, []
|
|
179
|
+
return True, decimal_line
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_stripped_lines(val: str):
|
|
183
|
+
## you don't want empty lines to add empty list after splitlines!
|
|
184
|
+
val = val.strip()
|
|
185
|
+
|
|
186
|
+
return [val_line.strip() for val_line in val.split('\n')]
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int):
|
|
190
|
+
# call-based clean up logic
|
|
191
|
+
# need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
|
|
192
|
+
code = import_string + '\n\n' + code
|
|
193
|
+
compiled_sol = compile_code(code, timeout)
|
|
194
|
+
|
|
195
|
+
if compiled_sol is None:
|
|
196
|
+
return
|
|
197
|
+
|
|
198
|
+
method = get_function(compiled_sol, fn_name)
|
|
199
|
+
|
|
200
|
+
if method is None:
|
|
201
|
+
return
|
|
202
|
+
|
|
203
|
+
all_inputs = [[json.loads(line) for line in inputs.split('\n')] for inputs in all_inputs]
|
|
204
|
+
|
|
205
|
+
all_outputs = [json.loads(output) for output in all_outputs]
|
|
206
|
+
|
|
207
|
+
total_execution = 0
|
|
208
|
+
all_results = []
|
|
209
|
+
for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
|
|
210
|
+
signal.alarm(timeout)
|
|
211
|
+
# faulthandler.enable()
|
|
212
|
+
try:
|
|
213
|
+
# can lock here so time is useful
|
|
214
|
+
start = time.time()
|
|
215
|
+
prediction = method(*gt_inp)
|
|
216
|
+
total_execution += time.time() - start
|
|
217
|
+
signal.alarm(0)
|
|
218
|
+
|
|
219
|
+
# don't penalize model if it produces tuples instead of lists
|
|
220
|
+
# ground truth sequences are not tuples
|
|
221
|
+
if isinstance(prediction, tuple):
|
|
222
|
+
prediction = list(prediction)
|
|
223
|
+
|
|
224
|
+
tmp_result = prediction == gt_out
|
|
225
|
+
|
|
226
|
+
# handle floating point comparisons
|
|
227
|
+
|
|
228
|
+
all_results.append(tmp_result)
|
|
229
|
+
|
|
230
|
+
if not tmp_result:
|
|
231
|
+
return all_results, {
|
|
232
|
+
'output': truncatefn(prediction),
|
|
233
|
+
'inputs': truncatefn(gt_inp),
|
|
234
|
+
'expected': truncatefn(gt_out),
|
|
235
|
+
'error_code': -2,
|
|
236
|
+
'error_message': 'Wrong Answer',
|
|
237
|
+
}
|
|
238
|
+
except Exception as e:
|
|
239
|
+
signal.alarm(0)
|
|
240
|
+
if 'timeoutexception' in repr(e).lower():
|
|
241
|
+
all_results.append(-3)
|
|
242
|
+
return all_results, {
|
|
243
|
+
'error': repr(e),
|
|
244
|
+
'error_code': -3,
|
|
245
|
+
'error_message': 'Time Limit Exceeded',
|
|
246
|
+
'inputs': truncatefn(gt_inp),
|
|
247
|
+
'expected': truncatefn(gt_out),
|
|
248
|
+
}
|
|
249
|
+
else:
|
|
250
|
+
all_results.append(-4)
|
|
251
|
+
return all_results, {
|
|
252
|
+
'error': repr(e),
|
|
253
|
+
'error_code': -4,
|
|
254
|
+
'error_message': 'Runtime Error',
|
|
255
|
+
'inputs': truncatefn(gt_inp),
|
|
256
|
+
'expected': truncatefn(gt_out),
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
finally:
|
|
260
|
+
signal.alarm(0)
|
|
261
|
+
# faulthandler.disable()
|
|
262
|
+
|
|
263
|
+
return all_results, {'execution time': total_execution}
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def grade_stdio(
|
|
267
|
+
code: str,
|
|
268
|
+
all_inputs: list,
|
|
269
|
+
all_outputs: list,
|
|
270
|
+
timeout: int,
|
|
271
|
+
):
|
|
272
|
+
## runtime doesn't interact well with __name__ == '__main__'
|
|
273
|
+
code = clean_if_name(code)
|
|
274
|
+
|
|
275
|
+
## we wrap the given code inside another function
|
|
276
|
+
code = make_function(code)
|
|
277
|
+
|
|
278
|
+
compiled_sol = compile_code(code, timeout)
|
|
279
|
+
if compiled_sol is None:
|
|
280
|
+
return
|
|
281
|
+
|
|
282
|
+
method = get_function(compiled_sol, 'wrapped_function')
|
|
87
283
|
|
|
284
|
+
if method is None:
|
|
285
|
+
return
|
|
286
|
+
|
|
287
|
+
all_results = []
|
|
288
|
+
total_execution_time = 0
|
|
289
|
+
for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
|
|
290
|
+
signal.alarm(timeout)
|
|
291
|
+
# faulthandler.enable()
|
|
292
|
+
|
|
293
|
+
with Capturing() as captured_output:
|
|
294
|
+
try:
|
|
295
|
+
start = time.time()
|
|
296
|
+
call_method(method, gt_inp)
|
|
297
|
+
total_execution_time += time.time() - start
|
|
298
|
+
# reset the alarm
|
|
299
|
+
signal.alarm(0)
|
|
300
|
+
except Exception as e:
|
|
301
|
+
signal.alarm(0)
|
|
302
|
+
if 'timeoutexception' in repr(e).lower():
|
|
303
|
+
all_results.append(-3)
|
|
304
|
+
return all_results, {
|
|
305
|
+
'error': repr(e),
|
|
306
|
+
'error_code': -3,
|
|
307
|
+
'error_message': 'Time Limit Exceeded',
|
|
308
|
+
'inputs': truncatefn(gt_inp),
|
|
309
|
+
'expected': truncatefn(gt_out),
|
|
310
|
+
}
|
|
311
|
+
else:
|
|
312
|
+
all_results.append(-4)
|
|
313
|
+
return all_results, {
|
|
314
|
+
'error': repr(e),
|
|
315
|
+
'error_code': -4,
|
|
316
|
+
'error_message': 'Runtime Error',
|
|
317
|
+
'inputs': truncatefn(gt_inp),
|
|
318
|
+
'expected': truncatefn(gt_out),
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
finally:
|
|
322
|
+
signal.alarm(0)
|
|
323
|
+
# faulthandler.disable()
|
|
324
|
+
|
|
325
|
+
prediction = captured_output[0]
|
|
326
|
+
|
|
327
|
+
stripped_prediction_lines = get_stripped_lines(prediction)
|
|
328
|
+
stripped_gt_out_lines = get_stripped_lines(gt_out)
|
|
329
|
+
|
|
330
|
+
## WA happens in multiple circumstances
|
|
331
|
+
## so cache the return to make it clean!
|
|
332
|
+
WA_send_args = {
|
|
333
|
+
'output': truncatefn(prediction),
|
|
334
|
+
'inputs': truncatefn(gt_inp),
|
|
335
|
+
'expected': truncatefn(gt_out),
|
|
336
|
+
'error_code': -2,
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
|
|
340
|
+
all_results.append(-2)
|
|
341
|
+
WA_send_args['error_message'] = 'Wrong answer: mismatched output length'
|
|
342
|
+
return all_results, WA_send_args
|
|
343
|
+
|
|
344
|
+
for output_line_idx, (
|
|
345
|
+
stripped_prediction_line,
|
|
346
|
+
stripped_gt_out_line,
|
|
347
|
+
) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
|
|
348
|
+
WA_send_args['error_message'] = (
|
|
349
|
+
f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
|
|
350
|
+
)
|
|
351
|
+
|
|
352
|
+
## CASE 1: exact match
|
|
353
|
+
if stripped_prediction_line == stripped_gt_out_line:
|
|
354
|
+
continue
|
|
355
|
+
|
|
356
|
+
## CASE 2: element-wise comparision
|
|
357
|
+
## if there are floating elements
|
|
358
|
+
## use `decimal` library for good floating point comparision
|
|
359
|
+
## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
|
|
360
|
+
## note that we should always be able to convert to decimals
|
|
361
|
+
|
|
362
|
+
success, decimal_prediction_line = convert_line_to_decimals(stripped_prediction_line)
|
|
363
|
+
if not success:
|
|
364
|
+
all_results.append(-2)
|
|
365
|
+
return all_results, WA_send_args
|
|
366
|
+
success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
|
|
367
|
+
if not success:
|
|
368
|
+
all_results.append(-2)
|
|
369
|
+
return all_results, WA_send_args
|
|
370
|
+
|
|
371
|
+
if decimal_prediction_line == decimal_gtout_line:
|
|
372
|
+
continue
|
|
373
|
+
|
|
374
|
+
all_results.append(-2)
|
|
375
|
+
return all_results, WA_send_args
|
|
376
|
+
all_results.append(True)
|
|
377
|
+
|
|
378
|
+
return all_results, {'execution time': total_execution_time}
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def run_test(sample, test=None, debug=False, timeout=6):
    """
    Grade the generated code ``test`` against the test cases stored in ``sample``.

    ``sample['input_output']`` is a JSON document. When it carries a
    ``fn_name`` entry the code is graded through ``grade_call_based``;
    otherwise it is graded through ``grade_stdio``. Its ``inputs`` and
    ``outputs`` entries are forwarded to the grader unchanged.

    Args:
        sample: mapping with an ``'input_output'`` JSON string.
        test: source code of the solution to grade; must not be None.
        debug: when true, log timestamps of the main steps.
        timeout: seconds passed to ``signal.alarm`` around the grading call
            and forwarded to the grader.

    Returns:
        The ``(results, metadata)`` pair produced by the grader, or
        ``([-4], {'error_code': -4, 'error_message': ...})`` when the grading
        call raises an ``Exception``.

    Raises:
        ValueError: if ``sample['input_output']`` is not valid JSON, or if it
            decodes to an empty value (no test cases to run).
        AssertionError: if ``test`` is None.
    """
    timeout_handler_wrapper = partial(timeout_handler, debug)
    signal.signal(signal.SIGALRM, timeout_handler_wrapper)

    # Disable functionalities that can make destructive changes to the test.
    # NOTE(review): no memory cap is passed here, so reliability_guard() runs
    # with its default maximum_memory_bytes=None.
    reliability_guard()

    if debug:
        logger.info(f'start = {datetime.now().time()}')

    # Malformed JSON is a data problem, not a solution failure: let the
    # ValueError raised by json.loads propagate to the caller untouched.
    in_outs = json.loads(sample['input_output'])

    which_type = None
    method_name = None
    if in_outs:
        method_name = in_outs.get('fn_name')
        if method_name is None:
            which_type = CODE_TYPE.standard_input  # Standard input
        else:
            which_type = CODE_TYPE.call_based  # Call-based

    if debug:
        logger.info(f'loaded input_output = {datetime.now().time()}')

    if test is None:
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError('should not happen: test code is none')

    if which_type is None:
        # Previously this case surfaced later as an UnboundLocalError.
        raise ValueError('input_output contains no test cases')

    if debug:
        logger.info(f'loading test code = {datetime.now().time()}')

    # Outer guard around the whole grading call; always disarmed in `finally`.
    signal.alarm(timeout)
    try:
        if which_type == CODE_TYPE.call_based:
            results, metadata = grade_call_based(
                code=test,
                all_inputs=in_outs['inputs'],
                all_outputs=in_outs['outputs'],
                fn_name=method_name,
                timeout=timeout,
            )
        else:
            results, metadata = grade_stdio(
                code=test,
                all_inputs=in_outs['inputs'],
                all_outputs=in_outs['outputs'],
                timeout=timeout,
            )
        return results, metadata
    except Exception as e:
        return [-4], {
            'error_code': -4,
            'error_message': f'Error during testing: {e}',
        }
    finally:
        signal.alarm(0)
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
def custom_compare_(output, ground_truth):
    """
    Compare captured output lines against the expected text.

    Only list outputs can match. The lines are joined with newlines and
    compared to ``ground_truth`` via ``stripped_string_compare``, first as-is
    and then with every individual line trimmed. Any other ``output`` type
    yields False.
    """
    if not isinstance(output, list):
        return False

    # Attempt 1: lines exactly as captured.
    joined_raw = '\n'.join(output)
    if stripped_string_compare(joined_raw, ground_truth):
        return True

    # Attempt 2: trim each line before joining.
    joined_trimmed = '\n'.join([line.strip() for line in output])
    if stripped_string_compare(joined_trimmed, ground_truth):
        return True

    return False
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
def stripped_string_compare(s1, s2):
    """Return True when both strings are equal once surrounding whitespace is removed."""
    return s1.strip() == s2.strip()
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
def call_method(method, inputs):
    """
    Invoke ``method()`` with standard input and ``open`` redirected to ``inputs``.

    ``inputs`` may be a string or a list of lines (joined with newlines).
    While ``method`` runs, ``read``/``readlines``/``readline`` on the current
    ``sys.stdin`` object are overridden, ``sys.stdin`` itself is replaced by a
    ``StringIO`` over the text, and ``builtins.open`` is replaced by a
    ``mock_open`` serving the same text.

    Returns whatever ``method`` returns, or None if it raises ``SystemExit``.
    """
    if isinstance(inputs, list):
        inputs = '\n'.join(inputs)

    pending_lines = iter(inputs.split('\n'))

    # Enter order matters: the three method overrides must attach to the
    # stdin object that exists *before* sys.stdin itself is swapped out.
    with patch('sys.stdin.read', lambda *args: inputs), \
            patch('sys.stdin.readlines', lambda *args: inputs.split('\n')), \
            patch('sys.stdin.readline', lambda *args: next(pending_lines)), \
            patch('sys.stdin', StringIO(inputs)), \
            patch('builtins.open', mock_open(read_data=inputs)):
        try:
            return method()
        except SystemExit:
            # A solution calling exit() counts as a normal finish.
            return None
|
|
643
461
|
|
|
644
462
|
|
|
645
463
|
def reliability_guard(maximum_memory_bytes=None):
|
|
646
|
-
"""
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
Untrusted code, including, model-
|
|
652
|
-
executed outside of one. See the
|
|
653
|
-
OpenAI's code sandbox, and proceed
|
|
464
|
+
"""
|
|
465
|
+
This disables various destructive functions and prevents the generated code
|
|
466
|
+
from interfering with the test (e.g. fork bomb, killing other processes,
|
|
467
|
+
removing filesystem files, etc.)
|
|
468
|
+
WARNING
|
|
469
|
+
This function is NOT a security sandbox. Untrusted code, including, model-
|
|
470
|
+
generated code, should not be blindly executed outside of one. See the
|
|
471
|
+
Codex paper for more information about OpenAI's code sandbox, and proceed
|
|
472
|
+
with caution.
|
|
654
473
|
"""
|
|
655
474
|
|
|
656
475
|
if maximum_memory_bytes is not None:
|
|
@@ -661,11 +480,11 @@ def reliability_guard(maximum_memory_bytes=None):
|
|
|
661
480
|
if not platform.uname().system == 'Darwin':
|
|
662
481
|
resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
|
|
663
482
|
|
|
664
|
-
faulthandler.disable()
|
|
483
|
+
# faulthandler.disable()
|
|
665
484
|
|
|
666
485
|
import builtins
|
|
667
486
|
|
|
668
|
-
builtins.exit = None
|
|
487
|
+
# builtins.exit = None
|
|
669
488
|
builtins.quit = None
|
|
670
489
|
|
|
671
490
|
import os
|