evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (50) hide show
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -3
  3. evalscope/benchmarks/benchmark.py +3 -2
  4. evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
  5. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  6. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  7. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
  8. evalscope/benchmarks/data_adapter.py +32 -4
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
  10. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
  11. evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
  12. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  13. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  14. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  15. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  16. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  17. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  18. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  19. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  20. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
  22. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
  23. evalscope/benchmarks/super_gpqa/utils.py +0 -5
  24. evalscope/collections/evaluator.py +4 -4
  25. evalscope/config.py +11 -3
  26. evalscope/constants.py +8 -0
  27. evalscope/evaluator/evaluator.py +56 -17
  28. evalscope/metrics/llm_judge.py +104 -0
  29. evalscope/models/custom_adapter.py +1 -1
  30. evalscope/perf/arguments.py +11 -40
  31. evalscope/perf/benchmark.py +39 -28
  32. evalscope/perf/http_client.py +9 -1
  33. evalscope/perf/main.py +2 -1
  34. evalscope/perf/plugin/datasets/__init__.py +1 -0
  35. evalscope/perf/plugin/datasets/openqa.py +6 -11
  36. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  37. evalscope/perf/utils/db_util.py +3 -0
  38. evalscope/run.py +15 -3
  39. evalscope/third_party/longbench_write/infer.py +1 -1
  40. evalscope/version.py +2 -2
  41. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
  42. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
  43. tests/cli/test_all.py +144 -0
  44. tests/cli/test_collection.py +27 -1
  45. tests/cli/test_run.py +103 -11
  46. tests/perf/test_perf.py +23 -0
  47. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
  48. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
  49. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
  50. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,721 @@
1
+ # Copyright LiveCodeBench @ 2024,
2
+
3
+ import ast
4
+ import faulthandler
5
+ import json
6
+ import numpy as np
7
+ import platform
8
+ # to run the solution files we're using a timing based approach
9
+ import signal
10
+ import sys
11
+ # used for debugging to time steps
12
+ from datetime import datetime
13
+ from enum import Enum
14
+ # for capturing the stdout
15
+ from io import StringIO
16
+
17
try:
    from pyext import RuntimeModule
except Exception:
    # Broad catch kept from the original: any failure while importing pyext
    # is reported the same way.
    print('pyext not found, please install with `pip install pyext`')
    pyext = None
    # Bind the name that run_test() actually references, so that a missing
    # pyext surfaces as an attribute error on None instead of a NameError on
    # an undefined global.
    RuntimeModule = None
22
+ # used for testing the code that reads from input
23
+ from unittest.mock import mock_open, patch
24
+
25
+
26
def truncatefn(s, length=300):
    """Shorten ``s`` by keeping its head and tail around a truncation marker.

    Args:
        s: The string to shorten. Must be a ``str``.
        length: Maximum number of characters of ``s`` to keep. Values below
            zero are treated as zero.

    Returns:
        ``s`` unchanged when ``len(s) <= length``; otherwise the first
        ``length // 2`` characters, the marker ``'...(truncated) ...'`` and
        the remaining ``length - length // 2`` characters taken from the end.

    Raises:
        AssertionError: If ``s`` is not a string.
    """
    assert isinstance(s, str)
    if len(s) <= length:
        return s

    length = max(length, 0)
    head_len = length // 2
    tail_len = length - head_len
    # Slice from an explicit start index: a negative-index slice such as
    # ``s[-0:]`` would return the whole string when nothing should be kept.
    return s[:head_len] + '...(truncated) ...' + s[len(s) - tail_len:]
32
+
33
+
34
class CODE_TYPE(Enum):
    """How ``run_test`` invokes the solution under test."""
    # Selected when the sample's input_output JSON names a function
    # (``fn_name``); that function is called with the parsed inputs.
    call_based = 0
    # Selected when no ``fn_name`` is given; the program is fed the inputs on
    # stdin and its stdout is compared with the expected output.
    standard_input = 1
37
+
38
+
39
+ # stuff for setting up signal timer
40
class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the alarm signal is delivered."""
    pass
42
+
43
+
44
def timeout_handler(signum, frame):
    """Signal handler: announce the alarm and abort via TimeoutException.

    Both arguments are the standard signal-handler parameters and are unused.
    """
    print('alarm went off')
    raise TimeoutException()
48
+
49
+
50
# SIGALRM is not defined on every platform (it is absent on Windows). Guard
# the registration so that merely importing this module does not raise there.
# NOTE: run_test() still relies on signal.alarm and will not work without it.
if hasattr(signal, 'SIGALRM'):
    signal.signal(signal.SIGALRM, timeout_handler)
51
+
52
+ # timeout = 6 # seconds
53
+
54
+
55
+ # used to capture stdout as a list
56
+ # from https://stackoverflow.com/a/16571630/6416660
57
+ # alternative use redirect_stdout() from contextlib
58
class Capturing(list):
    """Context manager that captures everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is replaced by an in-memory ``StringIO``. On exit
    the captured text is appended to this list as a single string and the
    previous ``sys.stdout`` is restored.
    """

    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        # Make closing the StringIO a no-op so code that calls
        # ``sys.stdout.close()`` cannot destroy the captured text. The
        # replacement is stored on the instance and is therefore called
        # without ``self``; it must accept zero arguments.
        self._stringio.close = lambda *args, **kwargs: 1
        return self

    def __exit__(self, *args):
        try:
            self.append(self._stringio.getvalue())
        finally:
            # Always put the real stdout back, even if reading the buffer
            # failed.
            del self._stringio  # free up some memory
            sys.stdout = self._stdout
71
+
72
+
73
def only_int_check(val):
    """Return True when ``val`` is an ``int`` instance (``bool`` included)."""
    is_integer = isinstance(val, int)
    return is_integer
75
+
76
+
77
def string_int_check(val):
    """Return True when ``val`` is a string made only of digit characters.

    A leading sign, a decimal point or an empty string makes this False.
    """
    if not isinstance(val, str):
        return False
    return val.isdigit()
79
+
80
+
81
def combined_int_check(val):
    """Return True for an ``int`` instance or an all-digit string."""
    if isinstance(val, int):
        return True
    return isinstance(val, str) and val.isdigit()
83
+
84
+
85
def run_test(sample, test=None, debug=False, timeout=6):
    """Run generated code against the test cases stored in ``sample``.

    Args:
        sample: Mapping whose ``'input_output'`` entry is a JSON string with
            ``'inputs'`` and ``'outputs'`` lists and an optional
            ``'fn_name'``. With ``'fn_name'`` the solution is call-based (the
            named function, or ``Solution().<fn_name>``, is called with the
            parsed inputs); without it the solution is run as a program that
            reads stdin and writes stdout.
        test: Source code of the solution to evaluate. Must not be ``None``.
        debug: When true, print progress and comparison details.
        timeout: Seconds passed to ``signal.alarm`` while loading the code and
            while running each test case.

    Returns:
        A ``(results, metadata)`` tuple. ``results`` holds one entry per
        executed test case: ``True``/``False`` for a comparison outcome,
        ``-1`` for a runtime error or timeout and ``-2`` when the code could
        not be loaded. Evaluation stops at the first failing case.
        ``metadata`` is ``{}`` when every case passed; otherwise it describes
        the failure, with ``error_code`` -1 (compilation / extraction error),
        -2 (wrong answer), -3 (time limit exceeded) or -4 (runtime error).

    Raises:
        AssertionError: If ``test`` is ``None``.
        ValueError: If ``sample['input_output']`` is empty or not valid JSON.
    """
    # Disable functionalities that can make destructive changes to the test.
    reliability_guard()

    if debug:
        print(f'start = {datetime.now().time()}')

    try:
        in_outs = json.loads(sample['input_output'])
    except ValueError:
        in_outs = None

    if debug:
        print(f'loaded input_output = {datetime.now().time()}')

    if test is None:
        # Explicit raise instead of ``assert False`` so the check survives -O.
        raise AssertionError('should not happen: test code is none')
    if not in_outs:
        # Without test data neither the execution mode nor the cases are
        # known; fail with a clear message instead of an unbound local later.
        raise ValueError("sample['input_output'] is empty or not valid JSON")

    if in_outs.get('fn_name') is None:
        which_type = CODE_TYPE.standard_input  # Standard input
        method_name = None
    else:
        which_type = CODE_TYPE.call_based  # Call-based
        method_name = in_outs['fn_name']

    results = []
    # Prelude prepended to every solution: common imports plus a high
    # recursion limit.
    sol = 'from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n'  # noqa: E501
    if debug:
        print(f'loading test code = {datetime.now().time()}')

    if which_type == CODE_TYPE.call_based:
        sol += test
        if debug:
            print(f'sol = {sol}')
        signal.alarm(timeout)
        try:
            tmp_sol = RuntimeModule.from_string('tmp_sol', '', sol)
            if 'class Solution' not in test:
                tmp = tmp_sol
            else:
                tmp = tmp_sol.Solution()
            signal.alarm(0)
        except Exception as e:
            signal.alarm(0)
            if debug:
                print(f'type 0 compilation error = {e}')
            results.append(-2)
            return results, {
                'error': repr(e),
                'error_code': -1,
                'error_message': 'Compilation Error',
            }
        signal.alarm(0)
    else:
        # Standard input: wrap the program body in a ``code()`` function so
        # it can be invoked once per test case.
        # If the code ends with ``if __name__ == "__main__":`` unwrap that
        # block so its body runs when ``code()`` is called. Best effort: code
        # that does not parse is left untouched and fails when loaded below.
        try:
            astree = ast.parse(test)
            last_block = astree.body[-1]
            if isinstance(last_block, ast.If):
                condition = last_block.test
                if ast.unparse(condition).strip() == "__name__ == '__main__'":
                    test = (ast.unparse(astree.body[:-1]) + '\n' + ast.unparse(last_block.body))
        except Exception:
            pass

        tmp_test = test.split('\n')

        # Indent every line except top-level imports.
        new_test = []
        for x in tmp_test:
            if (not x.startswith('from ')) and (not x.startswith('import ')):
                new_test.append('\t' + x + '\n')
            else:
                new_test.append(x + '\n')
        tmp_test = new_test

        # Open ``def code():`` before the first indented line; imports that
        # appear after that point are moved inside the function as well.
        new_test = ''
        started = False
        for i in tmp_test:
            if i.startswith('\t') and not started:
                new_test += 'stdin = sys.stdin\nstdout = sys.stdout\n'
                new_test += 'def code():\n'
                new_test += i
                started = True
            elif started and ((i.startswith('from ')) or (i.startswith('import '))):
                new_test += '\t' + i
            else:
                new_test += i
        tmp_test = new_test

        sol += tmp_test
        if debug:
            print(f'sol = {sol}')
        method_name = 'code'
        signal.alarm(timeout)
        try:
            tmp_sol = RuntimeModule.from_string('tmp_sol', '', sol)
            tmp = tmp_sol
            signal.alarm(0)
        except Exception as e:
            signal.alarm(0)
            if debug:
                print(f'type 1 compilation error = {e}')
            results.append(-2)
            return results, {
                'error': repr(e),
                'error_code': -1,
                'error_message': 'Compilation Error',
            }
        signal.alarm(0)

    if debug:
        print(f'get method = {datetime.now().time()}')

    try:
        method = getattr(tmp, method_name)  # get_attr second arg must be str
    except Exception:
        signal.alarm(0)
        exc_info = sys.exc_info()
        print(f'unable to get function error = {exc_info}')
        results.append(-2)
        return results, {
            'error': repr(exc_info),
            'error_code': -1,
            'error_message': 'Unable to extract code',
        }

    for index, inputs in enumerate(in_outs['inputs']):
        raw_inputs = inputs
        raw_outputs = in_outs['outputs'][index]
        if which_type == CODE_TYPE.call_based:
            # One JSON value per line, each becoming a positional argument.
            inputs = [json.loads(line) for line in inputs.split('\n')]
            in_outs['outputs'][index] = json.loads(in_outs['outputs'][index])

            truncate_line_size = 300 // (raw_inputs.count('\n') + 1)
            raw_inputs = '\n'.join(
                [truncatefn(line, truncate_line_size) for line in raw_inputs.strip().split('\n')])
            raw_outputs = truncatefn(raw_outputs, 200)
        else:
            raw_inputs = truncatefn(raw_inputs)
            raw_outputs = truncatefn(raw_outputs, 200)
        # JSON forces dictionaries to have string keys; this undoes this
        # (assuming a singleton list). Each conversion is best effort.
        try:
            if isinstance(inputs[0], dict):
                inputs = [{int(k): v for k, v in inputs[0].items()}]
        except Exception:
            pass
        try:
            if isinstance(in_outs['outputs'][index], dict):
                in_outs['outputs'][index] = [{int(k): v for k, v in in_outs['outputs'][index].items()}]
        except Exception:
            pass
        try:
            if isinstance(in_outs['outputs'][index][0], dict):
                in_outs['outputs'][index] = [{int(k): v for k, v in in_outs['outputs'][index][0].items()}]
        except Exception:
            pass

        if debug:
            print(f'time: {datetime.now().time()} testing index = {index} '
                  f'inputs = {inputs}, {type(inputs)}. type = {which_type}')

        if which_type == CODE_TYPE.call_based:  # Call-based
            signal.alarm(timeout)
            faulthandler.enable()
            try:
                output = method(*inputs)
                raw_true_output = output

                raw_true_output_copy = json.dumps(output)
                raw_true_output_copy = truncatefn(raw_true_output_copy, 200)

                # ground truth sequences are not tuples
                if isinstance(output, tuple):
                    output = list(output)

                tmp_result = output == in_outs['outputs'][index]
                if (isinstance(in_outs['outputs'][index], list) and in_outs['outputs'][index]):
                    tmp_result = tmp_result or (output == in_outs['outputs'][index][0])

                # ground truth sequences are not tuples
                try:
                    if isinstance(output[0], tuple):
                        tmp_result = tmp_result or ([list(x) for x in output] == in_outs['outputs'][index][0])
                except Exception:
                    pass
                results.append(tmp_result)
                if tmp_result is not True:
                    # Cancel the pending alarm before returning; otherwise it
                    # would fire later inside the caller.
                    signal.alarm(0)
                    faulthandler.disable()
                    return results, {
                        'output': raw_true_output_copy,
                        'expected': raw_outputs,
                        'inputs': raw_inputs,
                        'error_code': -2,
                        'error_message': 'Wrong Answer',
                    }
                # reset the alarm
                signal.alarm(0)
            except Exception as e:
                signal.alarm(0)
                faulthandler.disable()
                if debug:
                    print(f'Standard input runtime error or time limit exceeded error = {e}'  # noqa: E501
                          )
                results.append(-1)
                if 'timeoutexception' in repr(e).lower():
                    return results, {
                        'error': repr(e),
                        'error_code': -3,
                        'error_message': 'Time Limit Exceeded',
                        'inputs': raw_inputs,
                        'expected': raw_outputs,
                    }
                else:
                    return results, {
                        'error': repr(e),
                        'error_code': -4,
                        'error_message': 'Runtime Error',
                        'inputs': raw_inputs,
                        'expected': raw_outputs,
                    }
            faulthandler.disable()
            signal.alarm(0)
            if debug:
                print(
                    f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}"  # noqa: E501
                )
        else:  # Standard input
            faulthandler.enable()

            if isinstance(inputs, list):
                inputs = '\n'.join(inputs)
            if isinstance(in_outs['outputs'][index], list):
                in_outs['outputs'][index] = '\n'.join(in_outs['outputs'][index])

            signal.alarm(timeout)
            with Capturing() as output:
                try:
                    call_method(method, inputs)
                    # reset the alarm
                    signal.alarm(0)
                except Exception as e:
                    # runtime error or took too long
                    signal.alarm(0)
                    faulthandler.disable()
                    print(f'Call-based runtime error or time limit exceeded error = {repr(e)}{e}'  # noqa: E501
                          )
                    results.append(-1)
                    if 'timeoutexception' in repr(e).lower():
                        return results, {
                            'error': repr(e),
                            'error_code': -3,
                            'error_message': 'Time Limit Exceeded',
                            'inputs': raw_inputs,
                            'expected': raw_outputs,
                        }
                    else:
                        return results, {
                            'error': repr(e),
                            'error_code': -4,
                            'error_message': 'Runtime Error',
                            'inputs': raw_inputs,
                            'expected': raw_outputs,
                        }
                signal.alarm(0)
            faulthandler.disable()
            raw_true_output = output[0]
            raw_true_output_copy = truncatefn(raw_true_output, 200)
            output = raw_true_output.splitlines()

            if debug:
                print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}"  # noqa: E501
                      )

            # The checks below get progressively more lenient; the first one
            # that succeeds marks the case as passed.
            if custom_compare_(output, in_outs['outputs'][index]):
                tmp_result = True
                results.append(tmp_result)
                continue

            # ground truth sequences are expressed as lists not tuples
            if isinstance(output, tuple):
                output = list(output)

            tmp_result = False
            try:
                tmp_result = output == [in_outs['outputs'][index]]
                if isinstance(in_outs['outputs'][index], list):
                    tmp_result = tmp_result or (output == in_outs['outputs'][index])
                    if isinstance(output[0], str):
                        tmp_result = tmp_result or ([e.strip() for e in output] == in_outs['outputs'][index])
            except Exception as e:
                if debug:
                    print(f'Failed check1 exception = {e}')

            if tmp_result is True:
                results.append(tmp_result)
                continue

            # try one more time without \n
            if isinstance(in_outs['outputs'][index], list):
                for tmp_index, i in enumerate(in_outs['outputs'][index]):
                    in_outs['outputs'][index][tmp_index] = i.split('\n')
                    in_outs['outputs'][index][tmp_index] = [
                        x.strip() for x in in_outs['outputs'][index][tmp_index] if x
                    ]
            else:
                in_outs['outputs'][index] = in_outs['outputs'][index].split('\n')
                in_outs['outputs'][index] = list(filter(len, in_outs['outputs'][index]))
                in_outs['outputs'][index] = list(map(lambda x: x.strip(), in_outs['outputs'][index]))

            try:
                tmp_result = output == [in_outs['outputs'][index]]
                if isinstance(in_outs['outputs'][index], list):
                    tmp_result = tmp_result or (output == in_outs['outputs'][index])
            except Exception as e:
                if debug:
                    print(f'Failed check2 exception = {e}')

            if tmp_result is True:
                results.append(tmp_result)
                continue

            # try by converting the output into a split up list too
            if isinstance(output, list):
                output = list(filter(len, output))

            if debug:
                nl = '\n'
                if not isinstance(inputs, list):
                    print(
                        f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}"  # noqa: E501
                    )
                else:
                    print(
                        f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}"  # noqa: E501
                    )

            if debug:
                print(f'{tmp_result=} @a')

            try:
                tmp_result = output == [in_outs['outputs'][index]]
                if isinstance(in_outs['outputs'][index], list):
                    tmp_result = tmp_result or (output == in_outs['outputs'][index])
            except Exception as e:
                if debug:
                    print(f'Failed check3 exception = {e}')

            if debug:
                print(f'{tmp_result=} @b')

            # Numeric comparison: when the values are not all integers,
            # compare them as floats with a tolerance.
            try:
                all_ints = all(
                    combined_int_check(e1) and combined_int_check(e2)
                    for e1, e2 in zip(output, in_outs['outputs'][index]))
                if not all_ints:
                    if debug:
                        print([
                            combined_int_check(e1) and combined_int_check(e2)
                            for e1, e2 in zip(output, in_outs['outputs'][index])
                        ])
                    output_float = [float(e) for e in output]
                    gt_float = [float(e) for e in in_outs['outputs'][index]]
                    tmp_result = tmp_result or ((len(output_float) == len(gt_float))
                                                and np.allclose(output_float, gt_float))
            except Exception:
                # Non-numeric values: this check simply does not apply.
                pass

            if debug:
                print(f'{tmp_result=} @c')

            try:
                if isinstance(output[0], list):
                    all_ints = all(
                        combined_int_check(e1) and combined_int_check(e2)
                        for e1, e2 in zip(output[0], in_outs['outputs'][index]))
                    if not all_ints:
                        output_float = [float(e) for e in output[0]]
                        gt_float = [float(e) for e in in_outs['outputs'][index][0]]
                        tmp_result = tmp_result or ((len(output_float) == len(gt_float))
                                                    and np.allclose(output_float, gt_float))
            except Exception:
                pass

            if tmp_result is True:
                results.append(tmp_result)
                continue

            if debug:
                print(f'{tmp_result=} @d')
            # try by converting the stuff into split up list
            if isinstance(in_outs['outputs'][index], list):
                for tmp_index, i in enumerate(in_outs['outputs'][index]):
                    in_outs['outputs'][index][tmp_index] = set(i.split())
            else:
                in_outs['outputs'][index] = set(in_outs['outputs'][index].split())

            if debug:
                print(f'{tmp_result=} @e')

            try:
                tmp_result = output == in_outs['outputs'][index]
            except Exception as e:
                if debug:
                    print(f'Failed check4 exception = {e}')
                continue

            if tmp_result is True:
                results.append(tmp_result)
                continue

            if debug:
                print(f'{tmp_result=} @f')

            # try by converting the output into a split up list too
            # NOTE: nothing below re-compares ``output``, so this conversion
            # does not change the verdict; it is kept from the original.
            if isinstance(output, list):
                for tmp_index, i in enumerate(output):
                    output[tmp_index] = i.split()
                output = list(filter(len, output))
                for tmp_index, i in enumerate(output):
                    output[tmp_index] = set(i)
            else:
                output = output.split()
                output = list(filter(len, output))
                output = set(output)

            if debug:
                print(f'{tmp_result=} @g')

            if debug:
                print(f'{tmp_result=} @h')

            if tmp_result is True and debug:
                print('PASSED')

            results.append(tmp_result)
            if tmp_result is not True:
                return results, {
                    'output': raw_true_output_copy,
                    'expected': raw_outputs,
                    'inputs': raw_inputs,
                    'error_code': -2,
                    'error_message': 'Wrong Answer',
                }

    return results, {}
594
+
595
+
596
def custom_compare_(output, ground_truth):
    """Compare a list of output lines with the expected text.

    The lines are joined with newlines and compared with ``ground_truth``
    ignoring surrounding whitespace, first as they are and then with every
    individual line stripped. Returns False when ``output`` is not a list.
    """
    if not isinstance(output, list):
        return False

    if stripped_string_compare('\n'.join(output), ground_truth):
        return True

    trimmed_lines = '\n'.join(line.lstrip().rstrip() for line in output)
    return stripped_string_compare(trimmed_lines, ground_truth)
610
+
611
+
612
def stripped_string_compare(s1, s2):
    """Return True when both strings are equal after trimming whitespace at both ends."""
    return s1.lstrip().rstrip() == s2.lstrip().rstrip()
616
+
617
+
618
def call_method(method, inputs):
    """Call ``method()`` while ``inputs`` is served as the program's input.

    ``inputs`` may be a string or a list of lines (joined with newlines).
    During the call ``sys.stdin`` is replaced by a ``StringIO`` over the
    input and ``builtins.open`` by a ``mock_open`` returning the same data.

    Returns whatever ``method`` returns, or ``None`` when it exits through
    ``SystemExit``.
    """
    if isinstance(inputs, list):
        inputs = '\n'.join(inputs)

    pending_lines = iter(inputs.split('\n'))

    # The decorator stack is kept exactly as in the original; do not reorder
    # it, since the order in which the patches are applied depends on it.
    @patch('builtins.open', mock_open(read_data=inputs))
    @patch('sys.stdin', StringIO(inputs))
    @patch('sys.stdin.readline', lambda *args: next(pending_lines))
    @patch('sys.stdin.readlines', lambda *args: inputs.split('\n'))
    @patch('sys.stdin.read', lambda *args: inputs)
    def _run_patched(target):
        try:
            return target()
        except SystemExit:
            # A program that calls exit() has simply finished.
            return None

    return _run_patched(method)
643
+
644
+
645
def reliability_guard(maximum_memory_bytes=None):
    """Disable destructive functions so generated code cannot harm the test.

    Replaces a number of process-, filesystem- and subprocess-related
    functions with ``None`` for the whole interpreter (e.g. fork bomb,
    killing other processes, removing filesystem files, etc.). The change is
    global and is not undone.

    WARNING: This function is NOT a security sandbox. Untrusted code,
    including model-generated code, should not be blindly executed outside of
    one. See the Codex paper for more information about OpenAI's code
    sandbox, and proceed with caution.

    Args:
        maximum_memory_bytes: When given, the address-space and data-segment
            limits (and, except on Darwin, the stack limit) are set to this
            many bytes via ``resource.setrlimit``.
    """
    if maximum_memory_bytes is not None:
        import resource

        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    import builtins

    builtins.exit = None
    builtins.quit = None

    import os

    os.environ['OMP_NUM_THREADS'] = '1'

    for name in (
            'kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir',
            'fchdir', 'setuid', 'fork', 'forkpty', 'killpg', 'rename',
            'renames', 'truncate', 'replace', 'unlink', 'fchmod', 'fchown',
            'chmod', 'chown', 'chroot', 'lchflags', 'lchmod', 'lchown',
            'getcwd', 'chdir',
    ):
        setattr(os, name, None)

    import shutil

    for name in ('rmtree', 'move', 'chown'):
        setattr(shutil, name, None)

    import subprocess

    subprocess.Popen = None  # type: ignore

    # Go through the builtins module rather than ``__builtins__``: the latter
    # is a dict in imported modules but a module in ``__main__``, where item
    # assignment on it fails.
    builtins.help = None

    import sys

    for name in ('ipdb', 'joblib', 'resource', 'psutil', 'tkinter'):
        # A ``None`` entry makes any later import of that module fail.
        sys.modules[name] = None