evalscope-0.13.2-py3-none-any.whl → evalscope-0.14.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (57)
  1. evalscope/backend/rag_eval/__init__.py +1 -1
  2. evalscope/backend/rag_eval/backend_manager.py +21 -5
  3. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  4. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  5. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  6. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  7. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  8. evalscope/backend/rag_eval/utils/llm.py +4 -4
  9. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  10. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  11. evalscope/benchmarks/data_adapter.py +6 -2
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  13. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  14. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  15. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  16. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  17. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  18. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  19. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  20. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  21. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  22. evalscope/collections/evaluator.py +4 -2
  23. evalscope/config.py +1 -1
  24. evalscope/perf/arguments.py +24 -5
  25. evalscope/perf/benchmark.py +28 -42
  26. evalscope/perf/http_client.py +2 -3
  27. evalscope/perf/plugin/api/custom_api.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +2 -2
  29. evalscope/perf/plugin/datasets/custom.py +4 -1
  30. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  31. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  32. evalscope/perf/plugin/datasets/openqa.py +4 -1
  33. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  34. evalscope/perf/utils/benchmark_util.py +12 -6
  35. evalscope/perf/utils/db_util.py +1 -1
  36. evalscope/perf/utils/log_utils.py +41 -0
  37. evalscope/report/app.py +11 -11
  38. evalscope/run.py +7 -0
  39. evalscope/summarizer.py +2 -1
  40. evalscope/utils/utils.py +36 -25
  41. evalscope/version.py +2 -2
  42. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
  43. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
  44. tests/cli/test_all.py +4 -4
  45. tests/cli/test_collection.py +2 -1
  46. tests/cli/test_run.py +9 -8
  47. tests/perf/test_perf.py +1 -2
  48. tests/rag/test_clip_benchmark.py +0 -1
  49. tests/rag/test_mteb.py +37 -8
  50. tests/rag/test_ragas.py +29 -26
  51. tests/vlm/test_vlmeval.py +37 -1
  52. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  53. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  54. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  55. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  56. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  57. {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
1
- # Copyright LiveCodeBench @ 2024,
2
-
1
+ # flake8: noqa
3
2
  import ast
4
3
  import faulthandler
5
4
  import json
@@ -8,23 +7,29 @@ import platform
8
7
  # to run the solution files we're using a timing based approach
9
8
  import signal
10
9
  import sys
10
+ import time
11
11
  # used for debugging to time steps
12
12
  from datetime import datetime
13
+ from decimal import Decimal
13
14
  from enum import Enum
14
- # for capturing the stdout
15
15
  from io import StringIO
16
-
17
- try:
18
- from pyext import RuntimeModule
19
- except Exception:
20
- print('pyext not found, please install with `pip install pyext`')
21
- pyext = None
16
+ # from pyext import RuntimeModule
17
+ from types import ModuleType
22
18
  # used for testing the code that reads from input
23
19
  from unittest.mock import mock_open, patch
24
20
 
21
+ from evalscope.utils.logger import get_logger
22
+
23
+ logger = get_logger()
24
+
25
+ import_string = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n'
26
+
25
27
 
26
28
  def truncatefn(s, length=300):
27
- assert isinstance(s, str)
29
+ if isinstance(s, str):
30
+ pass
31
+ else:
32
+ s = str(s)
28
33
  if len(s) <= length:
29
34
  return s
30
35
 
@@ -42,16 +47,10 @@ class TimeoutException(Exception):
42
47
 
43
48
 
44
49
  def timeout_handler(signum, frame):
45
- print('alarm went off')
46
- # return
50
+ logger.info('timeout occured: alarm went off')
47
51
  raise TimeoutException
48
52
 
49
53
 
50
- signal.signal(signal.SIGALRM, timeout_handler)
51
-
52
- # timeout = 6 # seconds
53
-
54
-
55
54
  # used to capture stdout as a list
56
55
  # from https://stackoverflow.com/a/16571630/6416660
57
56
  # alternative use redirect_stdout() from contextlib
@@ -70,587 +69,404 @@ class Capturing(list):
70
69
  sys.stdout = self._stdout
71
70
 
72
71
 
73
- def only_int_check(val):
74
- return isinstance(val, int)
72
+ def clean_if_name(code: str) -> str:
73
+ try:
74
+ astree = ast.parse(code)
75
+ last_block = astree.body[-1]
76
+ if isinstance(last_block, ast.If):
77
+ condition = last_block.test
78
+ if ast.unparse(condition).strip() == "__name__ == '__main__'":
79
+ code = (
80
+ ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body) # type: ignore
81
+ )
82
+ except:
83
+ pass
75
84
 
85
+ return code
76
86
 
77
- def string_int_check(val):
78
- return isinstance(val, str) and val.isdigit()
79
87
 
88
+ def make_function(code: str) -> str:
89
+ try:
90
+ import_stmts = []
91
+ all_other_stmts = []
92
+ astree = ast.parse(code)
93
+ for stmt in astree.body:
94
+ if isinstance(stmt, (ast.Import, ast.ImportFrom)):
95
+ import_stmts.append(stmt)
96
+ else:
97
+ all_other_stmts.append(stmt)
98
+
99
+ function_ast = ast.FunctionDef(
100
+ name='wrapped_function',
101
+ args=ast.arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]),
102
+ body=all_other_stmts,
103
+ decorator_list=[],
104
+ lineno=-1,
105
+ )
106
+ main_code = (
107
+ import_string + '\n' + ast.unparse(import_stmts) # type: ignore
108
+ + '\n' + ast.unparse(function_ast) # type: ignore
109
+ )
110
+ return main_code
111
+ except Exception as e:
112
+ return code
80
113
 
81
- def combined_int_check(val):
82
- return only_int_check(val) or string_int_check(val)
83
114
 
115
+ def call_method(method, inputs):
84
116
 
85
- def run_test(sample, test=None, debug=False, timeout=6):
86
- """if test(generated_code) is not None it'll try to run the code.
117
+ if isinstance(inputs, list):
118
+ inputs = '\n'.join(inputs)
119
+
120
+ inputs_line_iterator = iter(inputs.split('\n'))
121
+
122
+ # sys.setrecursionlimit(10000)
123
+
124
+ # @patch('builtins.input', side_effect=inputs.split("\n"))
125
+ @patch('builtins.open', mock_open(read_data=inputs))
126
+ @patch('sys.stdin', StringIO(inputs))
127
+ @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
128
+ @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
129
+ @patch('sys.stdin.read', lambda *args: inputs)
130
+ # @patch('sys.stdout.write', print)
131
+ def _inner_call_method(_method):
132
+ try:
133
+ return _method()
134
+ except SystemExit as e:
135
+ pass
136
+ finally:
137
+ pass
138
+
139
+ return _inner_call_method(method)
140
+
141
+
142
+ def get_function(compiled_sol, fn_name: str): # type: ignore
143
+ try:
144
+ assert hasattr(compiled_sol, fn_name)
145
+ return getattr(compiled_sol, fn_name)
146
+ except Exception as e:
147
+ return
148
+
149
+
150
+ def compile_code(code: str, timeout: int):
151
+ signal.alarm(timeout)
152
+ try:
153
+ tmp_sol = ModuleType('tmp_sol', '')
154
+ exec(code, tmp_sol.__dict__)
155
+ if 'class Solution' in code:
156
+ # leetcode wraps solutions in `Solution`
157
+ # this is a hack to check if it is leetcode solution or not
158
+ # currently livecodebench only supports LeetCode but
159
+ # else condition allows future extensibility to other platforms
160
+ compiled_sol = tmp_sol.Solution()
161
+ else:
162
+ # do nothing in the other case since function is accesible
163
+ compiled_sol = tmp_sol
164
+
165
+ assert compiled_sol is not None
166
+ finally:
167
+ signal.alarm(0)
168
+
169
+ return compiled_sol
170
+
171
+
172
+ def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
173
+ try:
174
+ decimal_line = [Decimal(elem) for elem in line.split()]
175
+ except:
176
+ return False, []
177
+ return True, decimal_line
178
+
179
+
180
+ def get_stripped_lines(val: str):
181
+ ## you don't want empty lines to add empty list after splitlines!
182
+ val = val.strip()
183
+
184
+ return [val_line.strip() for val_line in val.split('\n')]
185
+
186
+
187
+ def grade_call_based(code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int):
188
+ # call-based clean up logic
189
+ # need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
190
+ code = import_string + '\n\n' + code
191
+ compiled_sol = compile_code(code, timeout)
192
+
193
+ if compiled_sol is None:
194
+ return
195
+
196
+ method = get_function(compiled_sol, fn_name)
197
+
198
+ if method is None:
199
+ return
200
+
201
+ all_inputs = [[json.loads(line) for line in inputs.split('\n')] for inputs in all_inputs]
202
+
203
+ all_outputs = [json.loads(output) for output in all_outputs]
204
+
205
+ total_execution = 0
206
+ all_results = []
207
+ for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
208
+ signal.alarm(timeout)
209
+ # faulthandler.enable()
210
+ try:
211
+ # can lock here so time is useful
212
+ start = time.time()
213
+ prediction = method(*gt_inp)
214
+ total_execution += time.time() - start
215
+ signal.alarm(0)
216
+
217
+ # don't penalize model if it produces tuples instead of lists
218
+ # ground truth sequences are not tuples
219
+ if isinstance(prediction, tuple):
220
+ prediction = list(prediction)
221
+
222
+ tmp_result = prediction == gt_out
223
+
224
+ # handle floating point comparisons
225
+
226
+ all_results.append(tmp_result)
227
+
228
+ if not tmp_result:
229
+ return all_results, {
230
+ 'output': truncatefn(prediction),
231
+ 'inputs': truncatefn(gt_inp),
232
+ 'expected': truncatefn(gt_out),
233
+ 'error_code': -2,
234
+ 'error_message': 'Wrong Answer',
235
+ }
236
+ except Exception as e:
237
+ signal.alarm(0)
238
+ if 'timeoutexception' in repr(e).lower():
239
+ all_results.append(-3)
240
+ return all_results, {
241
+ 'error': repr(e),
242
+ 'error_code': -3,
243
+ 'error_message': 'Time Limit Exceeded',
244
+ 'inputs': truncatefn(gt_inp),
245
+ 'expected': truncatefn(gt_out),
246
+ }
247
+ else:
248
+ all_results.append(-4)
249
+ return all_results, {
250
+ 'error': repr(e),
251
+ 'error_code': -4,
252
+ 'error_message': 'Runtime Error',
253
+ 'inputs': truncatefn(gt_inp),
254
+ 'expected': truncatefn(gt_out),
255
+ }
256
+
257
+ finally:
258
+ signal.alarm(0)
259
+ # faulthandler.disable()
260
+
261
+ return all_results, {'execution time': total_execution}
262
+
263
+
264
+ def grade_stdio(
265
+ code: str,
266
+ all_inputs: list,
267
+ all_outputs: list,
268
+ timeout: int,
269
+ ):
270
+ ## runtime doesn't interact well with __name__ == '__main__'
271
+ code = clean_if_name(code)
272
+
273
+ ## we wrap the given code inside another function
274
+ code = make_function(code)
275
+
276
+ compiled_sol = compile_code(code, timeout)
277
+ if compiled_sol is None:
278
+ return
279
+
280
+ method = get_function(compiled_sol, 'wrapped_function')
281
+
282
+ if method is None:
283
+ return
87
284
 
285
+ all_results = []
286
+ total_execution_time = 0
287
+ for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
288
+ signal.alarm(timeout)
289
+ # faulthandler.enable()
290
+
291
+ with Capturing() as captured_output:
292
+ try:
293
+ start = time.time()
294
+ call_method(method, gt_inp)
295
+ total_execution_time += time.time() - start
296
+ # reset the alarm
297
+ signal.alarm(0)
298
+ except Exception as e:
299
+ signal.alarm(0)
300
+ if 'timeoutexception' in repr(e).lower():
301
+ all_results.append(-3)
302
+ return all_results, {
303
+ 'error': repr(e),
304
+ 'error_code': -3,
305
+ 'error_message': 'Time Limit Exceeded',
306
+ 'inputs': truncatefn(gt_inp),
307
+ 'expected': truncatefn(gt_out),
308
+ }
309
+ else:
310
+ all_results.append(-4)
311
+ return all_results, {
312
+ 'error': repr(e),
313
+ 'error_code': -4,
314
+ 'error_message': 'Runtime Error',
315
+ 'inputs': truncatefn(gt_inp),
316
+ 'expected': truncatefn(gt_out),
317
+ }
318
+
319
+ finally:
320
+ signal.alarm(0)
321
+ # faulthandler.disable()
322
+
323
+ prediction = captured_output[0]
324
+
325
+ stripped_prediction_lines = get_stripped_lines(prediction)
326
+ stripped_gt_out_lines = get_stripped_lines(gt_out)
327
+
328
+ ## WA happens in multiple circumstances
329
+ ## so cache the return to make it clean!
330
+ WA_send_args = {
331
+ 'output': truncatefn(prediction),
332
+ 'inputs': truncatefn(gt_inp),
333
+ 'expected': truncatefn(gt_out),
334
+ 'error_code': -2,
335
+ }
336
+
337
+ if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
338
+ all_results.append(-2)
339
+ WA_send_args['error_message'] = 'Wrong answer: mismatched output length'
340
+ return all_results, WA_send_args
341
+
342
+ for output_line_idx, (
343
+ stripped_prediction_line,
344
+ stripped_gt_out_line,
345
+ ) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
346
+ WA_send_args['error_message'] = (
347
+ f'Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}'
348
+ )
349
+
350
+ ## CASE 1: exact match
351
+ if stripped_prediction_line == stripped_gt_out_line:
352
+ continue
353
+
354
+ ## CASE 2: element-wise comparision
355
+ ## if there are floating elements
356
+ ## use `decimal` library for good floating point comparision
357
+ ## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
358
+ ## note that we should always be able to convert to decimals
359
+
360
+ success, decimal_prediction_line = convert_line_to_decimals(stripped_prediction_line)
361
+ if not success:
362
+ all_results.append(-2)
363
+ return all_results, WA_send_args
364
+ success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
365
+ if not success:
366
+ all_results.append(-2)
367
+ return all_results, WA_send_args
368
+
369
+ if decimal_prediction_line == decimal_gtout_line:
370
+ continue
371
+
372
+ all_results.append(-2)
373
+ return all_results, WA_send_args
374
+ all_results.append(True)
375
+
376
+ return all_results, {'execution time': total_execution_time}
377
+
378
+
379
+ def run_test(sample, test=None, debug=False, timeout=6):
380
+ """
381
+ if test(generated_code) is not None it'll try to run the code.
88
382
  otherwise it'll just return an input and output pair.
89
383
  """
384
+ signal.signal(signal.SIGALRM, timeout_handler)
385
+
90
386
  # Disable functionalities that can make destructive changes to the test.
387
+ # max memory is set to 4GB
91
388
  reliability_guard()
92
389
 
93
390
  if debug:
94
- print(f'start = {datetime.now().time()}')
391
+ logger.info(f'start = {datetime.now().time()}')
95
392
 
96
393
  try:
97
394
  in_outs = json.loads(sample['input_output'])
98
- except ValueError:
395
+ except ValueError as e:
396
+ raise e
99
397
  in_outs = None
398
+
100
399
  if in_outs:
101
400
  if in_outs.get('fn_name') is None:
102
401
  which_type = CODE_TYPE.standard_input # Standard input
103
402
  method_name = None
403
+
104
404
  else:
105
405
  which_type = CODE_TYPE.call_based # Call-based
106
406
  method_name = in_outs['fn_name']
107
407
 
108
408
  if debug:
109
- print(f'loaded input_output = {datetime.now().time()}')
409
+ logger.info(f'loaded input_output = {datetime.now().time()}')
110
410
 
111
411
  if test is None:
112
412
  assert False, 'should not happen: test code is none'
113
413
  return in_outs, {'error': 'no test code provided'}
114
414
  elif test is not None:
115
415
  results = []
116
- sol = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n' # noqa: E501
416
+ sol = import_string
117
417
  if debug:
118
- print(f'loading test code = {datetime.now().time()}')
418
+ logger.info(f'loading test code = {datetime.now().time()}')
119
419
 
120
420
  if which_type == CODE_TYPE.call_based:
121
-
122
- sol += test
123
- if debug:
124
- print(f'sol = {sol}')
125
421
  signal.alarm(timeout)
126
422
  try:
127
- tmp_sol = RuntimeModule.from_string('tmp_sol', '', sol)
128
- if 'class Solution' not in test:
129
- tmp = tmp_sol
130
- else:
131
- tmp = tmp_sol.Solution()
132
- signal.alarm(0)
423
+ results, metadata = grade_call_based(
424
+ code=test,
425
+ all_inputs=in_outs['inputs'],
426
+ all_outputs=in_outs['outputs'],
427
+ fn_name=method_name,
428
+ timeout=timeout,
429
+ )
430
+ return results, metadata
133
431
  except Exception as e:
134
- signal.alarm(0)
135
- if debug:
136
- print(f'type 0 compilation error = {e}')
137
- results.append(-2)
138
- return results, {
139
- 'error': repr(e),
140
- 'error_code': -1,
141
- 'error_message': 'Compilation Error',
432
+ return [-4], {
433
+ 'error_code': -4,
434
+ 'error_message': f'Error during testing: {e}',
142
435
  }
143
- signal.alarm(0)
144
-
436
+ finally:
437
+ signal.alarm(0)
145
438
  elif which_type == CODE_TYPE.standard_input:
146
439
  # sol
147
440
  # if code has if __name__ == "__main__": then remove it
148
- try:
149
- astree = ast.parse(test)
150
- last_block = astree.body[-1]
151
- if isinstance(last_block, ast.If):
152
- condition = last_block.test
153
- if ast.unparse(condition).strip() == "__name__ == '__main__'":
154
- test = (ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body))
155
- except Exception as e: # noqa:
156
- pass
157
-
158
- tmp_test = test.split('\n')
159
-
160
- new_test = []
161
- for x in tmp_test:
162
- if (not x.startswith('from ')) and (not x.startswith('import ')):
163
- new_test.append('\t' + x + '\n')
164
- else:
165
- new_test.append(x + '\n')
166
- tmp_test = new_test
167
-
168
- new_test = ''
169
- started = False
170
- for i in tmp_test:
171
- if i.startswith('\t') and not started:
172
- new_test += 'stdin = sys.stdin\nstdout = sys.stdout\n'
173
- new_test += 'def code():\n'
174
- new_test += i
175
- started = True
176
- elif started and ((i.startswith('from ')) or (i.startswith('import '))):
177
- new_test += '\t' + i
178
- else:
179
- new_test += i
180
- tmp_test = new_test
181
441
 
182
- sol += tmp_test
183
- if debug:
184
- print(f'sol = {sol}')
185
- method_name = 'code'
186
442
  signal.alarm(timeout)
187
443
  try:
188
- tmp_sol = RuntimeModule.from_string('tmp_sol', '', sol)
189
- tmp = tmp_sol
190
- signal.alarm(0)
444
+ results, metadata = grade_stdio(
445
+ code=test,
446
+ all_inputs=in_outs['inputs'],
447
+ all_outputs=in_outs['outputs'],
448
+ timeout=timeout,
449
+ )
450
+ return results, metadata
191
451
  except Exception as e:
192
- signal.alarm(0)
193
- if debug:
194
- print(f'type 1 compilation error = {e}')
195
- results.append(-2)
196
- return results, {
197
- 'error': repr(e),
198
- 'error_code': -1,
199
- 'error_message': 'Compilation Error',
452
+ return [-4], {
453
+ 'error_code': -4,
454
+ 'error_message': f'Error during testing: {e}',
200
455
  }
201
- signal.alarm(0)
202
- if debug:
203
- print(f'get method = {datetime.now().time()}')
204
-
205
- try:
206
- method = getattr(tmp, method_name) # get_attr second arg must be str
207
- except Exception as e:
208
- signal.alarm(0)
209
- e = sys.exc_info()
210
- print(f'unable to get function error = {e}')
211
- results.append(-2)
212
- return results, {
213
- 'error': repr(e),
214
- 'error_code': -1,
215
- 'error_message': 'Unable to extract code',
216
- }
217
-
218
- for index, inputs in enumerate(in_outs['inputs']):
219
- raw_inputs = inputs
220
- raw_outputs = in_outs['outputs'][index]
221
- if which_type == CODE_TYPE.call_based:
222
- inputs = [json.loads(line) for line in inputs.split('\n')]
223
- in_outs['outputs'][index] = json.loads(in_outs['outputs'][index])
224
-
225
- truncate_line_size = 300 // (raw_inputs.count('\n') + 1)
226
- raw_inputs = '\n'.join(
227
- [truncatefn(line, truncate_line_size) for line in raw_inputs.strip().split('\n')])
228
- raw_outputs = truncatefn(raw_outputs, 200)
229
- else:
230
- raw_inputs = truncatefn(raw_inputs)
231
- raw_outputs = truncatefn(raw_outputs, 200)
232
- # JSON forces dictionaries to have string keys; this undoes this
233
- # (assuming a singleton list)
234
- try:
235
- if isinstance(inputs[0], dict):
236
- inputs = [{int(k): v for k, v in inputs[0].items()}]
237
- except Exception as e: # noqa: F841
238
- True
239
- try:
240
- if isinstance(in_outs['outputs'][index], dict):
241
- in_outs['outputs'][index] = [{int(k): v for k, v in in_outs['outputs'][index].items()}]
242
- except Exception as e: # noqa: F841
243
- True
244
- try:
245
- if isinstance(in_outs['outputs'][index][0], dict):
246
- in_outs['outputs'][index] = [{int(k): v for k, v in in_outs['outputs'][index][0].items()}]
247
- except Exception as e: # noqa: F841
248
- True
249
-
250
- if debug:
251
- print(f'time: {datetime.now().time()} testing index = {index} '
252
- f'inputs = {inputs}, {type(inputs)}. type = {which_type}')
253
- if which_type == CODE_TYPE.call_based: # Call-based
254
- signal.alarm(timeout)
255
- faulthandler.enable()
256
- try:
257
- output = method(*inputs)
258
- raw_true_output = output
259
-
260
- raw_true_output_copy = json.dumps(output)
261
- raw_true_output_copy = truncatefn(raw_true_output_copy, 200)
262
-
263
- # ground truth sequences are not tuples
264
- if isinstance(output, tuple):
265
- output = list(output)
266
-
267
- tmp_result = output == in_outs['outputs'][index]
268
- if (isinstance(in_outs['outputs'][index], list) and in_outs['outputs'][index]):
269
- tmp_result = tmp_result or (output == in_outs['outputs'][index][0])
270
-
271
- # ground truth sequences are not tuples
272
- try:
273
- if isinstance(output[0], tuple):
274
- tmp_result = tmp_result or ([list(x) for x in output] == in_outs['outputs'][index][0])
275
- except Exception as e: # noqa: F841
276
- True
277
- results.append(tmp_result)
278
- if tmp_result is not True:
279
- return results, {
280
- 'output': raw_true_output_copy,
281
- 'expected': raw_outputs,
282
- 'inputs': raw_inputs,
283
- 'error_code': -2,
284
- 'error_message': 'Wrong Answer',
285
- }
286
- # reset the alarm
287
- signal.alarm(0)
288
- except Exception as e:
289
- signal.alarm(0)
290
- faulthandler.disable()
291
- if debug:
292
- print(f'Standard input runtime error or time limit exceeded error = {e}' # noqa: E501
293
- )
294
- results.append(-1)
295
- if 'timeoutexception' in repr(e).lower():
296
- return results, {
297
- 'error': repr(e),
298
- 'error_code': -3,
299
- 'error_message': 'Time Limit Exceeded',
300
- 'inputs': raw_inputs,
301
- 'expected': raw_outputs,
302
- }
303
- else:
304
- return results, {
305
- 'error': repr(e),
306
- 'error_code': -4,
307
- 'error_message': 'Runtime Error',
308
- 'inputs': raw_inputs,
309
- 'expected': raw_outputs,
310
- }
311
- faulthandler.disable()
456
+ finally:
312
457
  signal.alarm(0)
313
- if debug:
314
- print(
315
- f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
316
- )
317
- elif which_type == CODE_TYPE.standard_input: # Standard input
318
- faulthandler.enable()
319
- passed = False
320
-
321
- if isinstance(inputs, list):
322
- inputs = '\n'.join(inputs)
323
- if isinstance(in_outs['outputs'][index], list):
324
- in_outs['outputs'][index] = '\n'.join(in_outs['outputs'][index])
325
-
326
- signal.alarm(timeout)
327
- with Capturing() as output:
328
- try:
329
- call_method(method, inputs)
330
- # reset the alarm
331
- signal.alarm(0)
332
- passed = True
333
- except Exception as e:
334
- # runtime error or took too long
335
- signal.alarm(0)
336
- print(f'Call-based runtime error or time limit exceeded error = {repr(e)}{e}' # noqa: E501
337
- )
338
- results.append(-1)
339
- if 'timeoutexception' in repr(e).lower():
340
- return results, {
341
- 'error': repr(e),
342
- 'error_code': -3,
343
- 'error_message': 'Time Limit Exceeded',
344
- 'inputs': raw_inputs,
345
- 'expected': raw_outputs,
346
- }
347
- else:
348
- return results, {
349
- 'error': repr(e),
350
- 'error_code': -4,
351
- 'error_message': 'Runtime Error',
352
- 'inputs': raw_inputs,
353
- 'expected': raw_outputs,
354
- }
355
- signal.alarm(0)
356
- raw_true_output = output[0]
357
- raw_true_output_copy = truncatefn(raw_true_output, 200)
358
- output = raw_true_output.splitlines()
359
- if not passed:
360
- if debug:
361
- nl = '\n'
362
- if not isinstance(inputs, list):
363
- print(
364
- f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
365
- )
366
- else:
367
- print(
368
- f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
369
- )
370
- continue
371
-
372
- if passed and debug:
373
- print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}" # noqa: E501
374
- )
375
-
376
- if custom_compare_(output, in_outs['outputs'][index]):
377
- tmp_result = True
378
- results.append(tmp_result)
379
- continue
380
-
381
- # ground truth sequences are expressed as lists not tuples
382
- if isinstance(output, tuple):
383
- output = list(output)
384
-
385
- tmp_result = False
386
- try:
387
- tmp_result = output == [in_outs['outputs'][index]]
388
- if isinstance(in_outs['outputs'][index], list):
389
- tmp_result = tmp_result or (output == in_outs['outputs'][index])
390
- if isinstance(output[0], str):
391
- tmp_result = tmp_result or ([e.strip() for e in output] == in_outs['outputs'][index])
392
- except Exception as e:
393
- if debug:
394
- print(f'Failed check1 exception = {e}')
395
- pass
396
-
397
- if tmp_result is True:
398
- results.append(tmp_result)
399
- continue
400
-
401
- # try one more time without \n
402
- if isinstance(in_outs['outputs'][index], list):
403
- for tmp_index, i in enumerate(in_outs['outputs'][index]):
404
- in_outs['outputs'][index][tmp_index] = i.split('\n')
405
- in_outs['outputs'][index][tmp_index] = [
406
- x.strip() for x in in_outs['outputs'][index][tmp_index] if x
407
- ]
408
- else:
409
- in_outs['outputs'][index] = in_outs['outputs'][index].split('\n')
410
- in_outs['outputs'][index] = list(filter(len, in_outs['outputs'][index]))
411
- in_outs['outputs'][index] = list(map(lambda x: x.strip(), in_outs['outputs'][index]))
412
-
413
- try:
414
- tmp_result = output == [in_outs['outputs'][index]]
415
- if isinstance(in_outs['outputs'][index], list):
416
- tmp_result = tmp_result or (output == in_outs['outputs'][index])
417
- except Exception as e:
418
- if debug:
419
- print(f'Failed check2 exception = {e}')
420
- pass
421
-
422
- if tmp_result is True:
423
- results.append(tmp_result)
424
- continue
425
-
426
- # try by converting the output into a split up list too
427
- if isinstance(output, list):
428
- output = list(filter(len, output))
429
-
430
- if debug:
431
- nl = '\n'
432
- if not isinstance(inputs, list):
433
- print(
434
- f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" # noqa: E501
435
- )
436
- else:
437
- print(
438
- f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" # noqa: E501
439
- )
440
-
441
- if tmp_result is True:
442
- results.append(tmp_result)
443
- continue
444
-
445
- if debug:
446
- print(f'{tmp_result=} @a')
447
-
448
- try:
449
- tmp_result = output == [in_outs['outputs'][index]]
450
- if isinstance(in_outs['outputs'][index], list):
451
- tmp_result = tmp_result or (output == in_outs['outputs'][index])
452
- except Exception as e:
453
- if debug:
454
- print(f'Failed check3 exception = {e}')
455
- pass
456
-
457
- if debug:
458
- print(f'{tmp_result=} @b')
459
-
460
- try:
461
- all_ints = all(
462
- combined_int_check(e1) and combined_int_check(e2)
463
- for e1, e2 in zip(output, in_outs['outputs'][index]))
464
- if not all_ints:
465
- if debug:
466
- print([
467
- combined_int_check(e1) and combined_int_check(e2)
468
- for e1, e2 in zip(output, in_outs['outputs'][index])
469
- ])
470
- output_float = [float(e) for e in output]
471
- gt_float = [float(e) for e in in_outs['outputs'][index]]
472
- tmp_result = tmp_result or ((len(output_float) == len(gt_float))
473
- and np.allclose(output_float, gt_float))
474
- except Exception as e: # noqa: F841
475
- pass
476
-
477
- if debug:
478
- print(f'{tmp_result=} @c')
479
-
480
- try:
481
- if isinstance(output[0], list):
482
- all_ints = all(
483
- combined_int_check(e1) and combined_int_check(e2)
484
- for e1, e2 in zip(output[0], in_outs['outputs'][index]))
485
- if not all_ints:
486
- output_float = [float(e) for e in output[0]]
487
- gt_float = [float(e) for e in in_outs['outputs'][index][0]]
488
- tmp_result = tmp_result or ((len(output_float) == len(gt_float))
489
- and np.allclose(output_float, gt_float))
490
- except Exception as e: # noqa: F841
491
- pass
492
-
493
- if tmp_result is True:
494
- results.append(tmp_result)
495
- continue
496
-
497
- if debug:
498
- print(f'{tmp_result=} @d')
499
- # try by converting the stuff into split up list
500
- if isinstance(in_outs['outputs'][index], list):
501
- for tmp_index, i in enumerate(in_outs['outputs'][index]):
502
- in_outs['outputs'][index][tmp_index] = set(i.split())
503
- else:
504
- in_outs['outputs'][index] = set(in_outs['outputs'][index].split())
505
-
506
- if debug:
507
- print(f'{tmp_result=} @e')
508
-
509
- try:
510
- tmp_result = output == in_outs['outputs'][index]
511
- except Exception as e:
512
- if debug:
513
- print(f'Failed check4 exception = {e}')
514
- continue
515
-
516
- if tmp_result is True:
517
- results.append(tmp_result)
518
- continue
519
-
520
- if debug:
521
- print(f'{tmp_result=} @f')
522
-
523
- # try by converting the output into a split up list too
524
- if isinstance(output, list):
525
- for tmp_index, i in enumerate(output):
526
- output[tmp_index] = i.split()
527
- output = list(filter(len, output))
528
- for tmp_index, i in enumerate(output):
529
- output[tmp_index] = set(i)
530
- else:
531
- output = output.split()
532
- output = list(filter(len, output))
533
- output = set(output)
534
-
535
- if debug:
536
- print(f'{tmp_result=} @g')
537
- # try:
538
- # tmp_result = set(frozenset(s) for s in output) == set(
539
- # frozenset(s) for s in in_outs["outputs"][index]
540
- # )
541
- # except Exception as e:
542
- # if debug:
543
- # print(f"Failed check5 exception = {e}")
544
-
545
- # if they are all numbers, round so that similar numbers are
546
- # treated as identical
547
- # try:
548
- # all_ints = all(
549
- # combined_int_check(e1) and combined_int_check(e2)
550
- # for e1, e2 in zip(output, in_outs["outputs"][index])
551
- # )
552
- # tmp_result = tmp_result or (
553
- # set(
554
- # frozenset(round(float(t), 3) for t in s) for s in output)
555
- # == set(
556
- # frozenset(round(float(t), 3) for t in s)
557
- # for s in in_outs["outputs"][index]
558
- # )
559
- # )
560
- # except Exception as e:
561
- # if debug:
562
- # print(f"Failed check6 exception = {e}")
563
-
564
- if debug:
565
- print(f'{tmp_result=} @h')
566
-
567
- if tmp_result is True and debug:
568
- print('PASSED')
569
-
570
- results.append(tmp_result)
571
- if tmp_result is not True:
572
- return results, {
573
- 'output': raw_true_output_copy,
574
- 'expected': raw_outputs,
575
- 'inputs': raw_inputs,
576
- 'error_code': -2,
577
- 'error_message': 'Wrong Answer',
578
- }
579
-
580
- if debug:
581
- nl = '\n'
582
- if not isinstance(inputs, list):
583
- print(
584
- f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
585
- )
586
- else:
587
- print(
588
- f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
589
- )
590
-
591
- print(f'results = {results}')
592
-
593
- return results, {}
594
-
595
-
596
- def custom_compare_(output, ground_truth):
597
-
598
- if isinstance(output, list):
599
- output_1 = '\n'.join(output)
600
- if stripped_string_compare(output_1, ground_truth):
601
- return True
602
-
603
- if isinstance(output, list):
604
- output_2 = [o.lstrip().rstrip() for o in output]
605
- output_2 = '\n'.join(output_2)
606
- if stripped_string_compare(output_2, ground_truth):
607
- return True
608
-
609
- return False
610
-
611
-
612
- def stripped_string_compare(s1, s2):
613
- s1 = s1.lstrip().rstrip()
614
- s2 = s2.lstrip().rstrip()
615
- return s1 == s2
616
-
617
-
618
- def call_method(method, inputs):
619
-
620
- if isinstance(inputs, list):
621
- inputs = '\n'.join(inputs)
622
-
623
- inputs_line_iterator = iter(inputs.split('\n'))
624
-
625
- # sys.setrecursionlimit(10000)
626
-
627
- # @patch('builtins.input', side_effect=inputs.split("\n"))
628
- @patch('builtins.open', mock_open(read_data=inputs))
629
- @patch('sys.stdin', StringIO(inputs))
630
- @patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
631
- @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
632
- @patch('sys.stdin.read', lambda *args: inputs)
633
- # @patch('sys.stdout.write', print)
634
- def _inner_call_method(_method):
635
- try:
636
- return _method()
637
- except SystemExit as e: # noqa: F841
638
- pass
639
- finally:
640
- pass
641
-
642
- return _inner_call_method(method)
643
458
 
644
459
 
645
460
  def reliability_guard(maximum_memory_bytes=None):
646
- """This disables various destructive functions and prevents the generated
647
- code from interfering with the test (e.g. fork bomb, killing other
648
- processes, removing filesystem files, etc.) WARNING This function is NOT a
649
- security sandbox.
650
-
651
- Untrusted code, including, model- generated code, should not be blindly
652
- executed outside of one. See the Codex paper for more information about
653
- OpenAI's code sandbox, and proceed with caution.
461
+ """
462
+ This disables various destructive functions and prevents the generated code
463
+ from interfering with the test (e.g. fork bomb, killing other processes,
464
+ removing filesystem files, etc.)
465
+ WARNING
466
+ This function is NOT a security sandbox. Untrusted code, including, model-
467
+ generated code, should not be blindly executed outside of one. See the
468
+ Codex paper for more information about OpenAI's code sandbox, and proceed
469
+ with caution.
654
470
  """
655
471
 
656
472
  if maximum_memory_bytes is not None:
@@ -661,11 +477,11 @@ def reliability_guard(maximum_memory_bytes=None):
661
477
  if not platform.uname().system == 'Darwin':
662
478
  resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
663
479
 
664
- faulthandler.disable()
480
+ # faulthandler.disable()
665
481
 
666
482
  import builtins
667
483
 
668
- builtins.exit = None
484
+ # builtins.exit = None
669
485
  builtins.quit = None
670
486
 
671
487
  import os