ai2-olmo-eval 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai2_olmo_eval-0.8.1.dist-info → ai2_olmo_eval-0.8.3.dist-info}/METADATA +1 -1
- {ai2_olmo_eval-0.8.1.dist-info → ai2_olmo_eval-0.8.3.dist-info}/RECORD +46 -42
- {ai2_olmo_eval-0.8.1.dist-info → ai2_olmo_eval-0.8.3.dist-info}/WHEEL +1 -1
- olmo_eval/metrics.py +3 -3
- olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/codex_mbpp/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/codex_mbpp/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/config.json +1 -1
- olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/tasks.py +8 -0
- olmo_eval/version.py +1 -1
- {ai2_olmo_eval-0.8.1.dist-info → ai2_olmo_eval-0.8.3.dist-info}/licenses/LICENSE +0 -0
- {ai2_olmo_eval-0.8.1.dist-info → ai2_olmo_eval-0.8.3.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
ai2_olmo_eval-0.8.
|
|
1
|
+
ai2_olmo_eval-0.8.3.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
|
|
2
2
|
olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
|
|
3
|
-
olmo_eval/metrics.py,sha256=
|
|
4
|
-
olmo_eval/tasks.py,sha256=
|
|
3
|
+
olmo_eval/metrics.py,sha256=xUnFUGho1Y99595G79chqv2iFZU6LU5KVACHRYcUI1k,20046
|
|
4
|
+
olmo_eval/tasks.py,sha256=eecUt07ww7lDuh9w974QXMIykV7RX6GhsI5iVoG4eQk,96636
|
|
5
5
|
olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
|
|
6
6
|
olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
|
|
7
|
-
olmo_eval/version.py,sha256=
|
|
7
|
+
olmo_eval/version.py,sha256=2WwAQD_9rfYlFOdUcW7n-z_8LFN-v_CznrmwPxkrjbQ,308
|
|
8
8
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
|
|
9
9
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
|
|
10
10
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
|
|
@@ -624,8 +624,12 @@ olmo_eval/oe_eval_tasks/boolq/val_rc_5shot/config.json,sha256=eR9oyZMCIckUVP9FoQ
|
|
|
624
624
|
olmo_eval/oe_eval_tasks/boolq/val_rc_5shot/requests.jsonl.gz,sha256=1e0uURX0nSTDfsF_t7Sm7GvFjGXOFUEL7StANC8I-8k,1282935
|
|
625
625
|
olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_0shot/config.json,sha256=7qlpbRzhEZPN8kTAMsA-pG3BniapN8KdeaT1Xutjaao,1795
|
|
626
626
|
olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_0shot/requests.jsonl.gz,sha256=RAloKzNczG1GQ5rYIsypeBH2J9gMJJh7F9ryKphtNbY,55008
|
|
627
|
+
olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/config.json,sha256=KmoTn3onOUOuWRJoaOjSv5Mzsfl7llNJQEoLB8NFeSY,1795
|
|
628
|
+
olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/requests.jsonl.gz,sha256=ACQZ5zrUnTArHzsMCRJXgTIHKyiV53i6b1M3TX0igZ4,63989
|
|
627
629
|
olmo_eval/oe_eval_tasks/codex_mbpp/gold_bpb_0shot/config.json,sha256=NN2j87atOpst1kjVO7opBKdcggTCfpyw9PqDccEP1t8,1909
|
|
628
630
|
olmo_eval/oe_eval_tasks/codex_mbpp/gold_bpb_0shot/requests.jsonl.gz,sha256=5XVIcNjo7QzRfEbBKVybmfDmshgNqvKj24WUJdvg050,99950
|
|
631
|
+
olmo_eval/oe_eval_tasks/codex_mbpp/gold_bpb_3shot/config.json,sha256=5t6k8xKcci6OJHDCIwypOh_JxZO2tQkH_sOMm1XHfFI,1856
|
|
632
|
+
olmo_eval/oe_eval_tasks/codex_mbpp/gold_bpb_3shot/requests.jsonl.gz,sha256=KXjtB1gmSfOBJFJeFOSmDPFT4BBdKcpuxGnP4FHdGQY,109863
|
|
629
633
|
olmo_eval/oe_eval_tasks/copa/rc_0shot/config.json,sha256=dkNiFgoOz56d8IPeQkJTQlR6rJbO2Iogdp1wHKp5ztw,531
|
|
630
634
|
olmo_eval/oe_eval_tasks/copa/rc_0shot/requests.jsonl.gz,sha256=4XnXOK5tIEN-XBUQmQ9TAffnp1TfgwJFYpKZo9-6CFE,18841
|
|
631
635
|
olmo_eval/oe_eval_tasks/copycolors/10way/config.json,sha256=LcKu28kWu9TphtNk5BJXo44z1deqUZ0boK_lt-F0Ils,1064
|
|
@@ -670,40 +674,40 @@ olmo_eval/oe_eval_tasks/minerva_math_prealgebra/gold_bpb_0shot/config.json,sha25
|
|
|
670
674
|
olmo_eval/oe_eval_tasks/minerva_math_prealgebra/gold_bpb_0shot/requests.jsonl.gz,sha256=YoiSOp8JH-I0y0ppgnTcLNbF7sLVHrHFbWI8K6cBj_M,267414
|
|
671
675
|
olmo_eval/oe_eval_tasks/minerva_math_precalculus/gold_bpb_0shot/config.json,sha256=-ex7VPSuyCoVHEkGw___E9gv8Jcfr8BWMEMO0Ber3H8,1750
|
|
672
676
|
olmo_eval/oe_eval_tasks/minerva_math_precalculus/gold_bpb_0shot/requests.jsonl.gz,sha256=A7b4WGo5qLCl3s1z5HBbEUpoJwva12doE_WRc3un6Fs,223616
|
|
673
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/config.json,sha256=
|
|
674
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
675
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/config.json,sha256=
|
|
676
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
677
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/config.json,sha256=
|
|
678
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
679
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/config.json,sha256=
|
|
680
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
681
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/config.json,sha256=
|
|
682
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
683
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/config.json,sha256=
|
|
684
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
685
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/config.json,sha256=
|
|
686
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
687
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/config.json,sha256=
|
|
688
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
689
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/config.json,sha256=
|
|
690
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
691
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/config.json,sha256=
|
|
692
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
693
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/config.json,sha256=
|
|
694
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
695
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/config.json,sha256=
|
|
696
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
697
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/config.json,sha256=
|
|
698
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
699
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/config.json,sha256=
|
|
700
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
701
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/config.json,sha256=
|
|
702
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
703
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/config.json,sha256=
|
|
704
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
705
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/config.json,sha256=
|
|
706
|
-
olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/requests.jsonl.gz,sha256=
|
|
677
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/config.json,sha256=d7iO2JFOqAixBwblBMjJ-9dF-23d1N9OFaE61r0RxEU,1740
|
|
678
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/requests.jsonl.gz,sha256=ETpUbwN6y7URqHWtyTKPYOt7QUOfKrbSNzhKMOsqR9E,86539
|
|
679
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/config.json,sha256=6kX7Bvyvg-6NdbmLsIX_Ov4QHANnzRc44OMzUOjH95Y,1729
|
|
680
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/requests.jsonl.gz,sha256=Y4uLmyClONrqCcmqM-G9yLDxq6p10AFlbAsP3tiD6h8,114583
|
|
681
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/config.json,sha256=15cMJYiWIvDM0CqkJmvVDcbrVwrREXi2_-LcpCey2pc,1736
|
|
682
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/requests.jsonl.gz,sha256=004fqdUAFc2U1pJTPcxhKqiXC8tkZfbIV6B_xL3aiV4,95046
|
|
683
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/config.json,sha256=5a0LS47oF4Y-pkaR-A0MOnTa4SmIyx9GZO1xyOqVwQ0,1749
|
|
684
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/requests.jsonl.gz,sha256=VrW7MQEtw772H5do2d3BBg50W-CQieEiBuLhjrywQo4,95741
|
|
685
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/config.json,sha256=7G-VzM8iIgZJt-U6QL9edXL0nxELb1-kQh7w6SRai7o,1733
|
|
686
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/requests.jsonl.gz,sha256=Dz6NoBnjsmSkwQB_Jo5NxDCXvOBR_iyv-dAcFAO4Nzg,93754
|
|
687
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/config.json,sha256=UyjPYcGXdKZsKDAVsiRLFM9NHj40DXHmfOUyFXM12JM,1752
|
|
688
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/requests.jsonl.gz,sha256=oYow1wz5YlKUGOUPDrmcfTZMWT-TgoGk9KAo7jQitfw,77203
|
|
689
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/config.json,sha256=H20LX5VRZroEp3-RLiiZa0GrWHhEaYpvODUxcsTKEu0,1740
|
|
690
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/requests.jsonl.gz,sha256=TClz1VEYO3fCFXEdN4WbBXcRUgs0TVlU_-21LbR5CbE,98602
|
|
691
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/config.json,sha256=J2ogiOMRT0WzCpZ6yvRN6j47LzQJwIWX0xPXqlW3Pyg,1764
|
|
692
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/requests.jsonl.gz,sha256=Xr6QPU3Xfn0psBJ-ononPqbhl3v1xtZXtyREqID2mnQ,88707
|
|
693
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/config.json,sha256=3rQleYInwFB2okgVKMxmjcs3c8nZBybD3069h5i-ht4,1748
|
|
694
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/requests.jsonl.gz,sha256=JVWGSqIp94JtcPOcTQF3uJXBNfymke50G4KgSmO05UQ,74857
|
|
695
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/config.json,sha256=RKgy9dHGP6XkzX9JcrqnY9yqprrQ3XAe7yrD7dIOBt8,1736
|
|
696
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/requests.jsonl.gz,sha256=MR_N0Qpe9wKTGvoolyInQC9EbXsvrHVTzbIbEcX8ly8,80215
|
|
697
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/config.json,sha256=viAjwwneXp3PDD_mmfpr9jSrCrTFrXUssol45BcZ7N4,1748
|
|
698
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/requests.jsonl.gz,sha256=u7ltKtCeiMarhDAPXH18L3Q7Tndi__4rEDpA1b6ahtQ,72874
|
|
699
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/config.json,sha256=dlll0Ik6Z7ForFQMAcTPb51v-WIzP70TgHgE7s8a2Gs,1729
|
|
700
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/requests.jsonl.gz,sha256=6K3EAXPWI8uIHnlKt2DB544dJg8RjCLpmxGPWkBuVbQ,73142
|
|
701
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/config.json,sha256=wnLcmFpvYHVLJWu6cEhccsCkY09HzZyc1CKuMSIEHEo,1741
|
|
702
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/requests.jsonl.gz,sha256=XqNgCYWSq8zXxB63mHZOEGE3m0vkGP6f30edqVMWPak,65901
|
|
703
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/config.json,sha256=1MBjYE141GU9gyZjxQj0U3JQM10R3YZRxDiOxeNK0HI,1740
|
|
704
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/requests.jsonl.gz,sha256=WxxInVCS8uv7mPBWvUcJkJcMJPPeMBhgCwJoZr9weNE,86877
|
|
705
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/config.json,sha256=FQ4LuNRyZAz5XBgs7FquL0t76weZI2gzZz7FtO-Jcfw,1744
|
|
706
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/requests.jsonl.gz,sha256=e9eyNLwYx5YC1f89haxuCRGMB6Va_tOz-D_2KZ8ESFk,78583
|
|
707
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/config.json,sha256=yASn9JorzLV8SPiJPsUO3T5MT5pheHTgGLPO0kc4LwI,1744
|
|
708
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/requests.jsonl.gz,sha256=rV8AjQKOMtmtYQO8pwH5FUgEBaKYGLUChn42C5i1CY4,101903
|
|
709
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/config.json,sha256=BpLurpx35sWIoyR-bJXB7exabtpSJunLnwkjGbm28Ig,1765
|
|
710
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/requests.jsonl.gz,sha256=uzpMReBbsv5Fx56CXU3cPRabu987cnuaqImgCHgE0zM,99334
|
|
707
711
|
olmo_eval/oe_eval_tasks/openbookqa/mc_5shot/config.json,sha256=RfZ4pfvROPvMIMmVhlr_3xWGLz5mVOqsdWCHTdu1eIY,612
|
|
708
712
|
olmo_eval/oe_eval_tasks/openbookqa/mc_5shot/requests.jsonl.gz,sha256=oQ4gkIyQ5ZkZa-ceBvETqlrRs8GJCgsn6djAV6ubFgU,75731
|
|
709
713
|
olmo_eval/oe_eval_tasks/openbookqa/rc_0shot/config.json,sha256=bOBTOqksTpGGJYJ2419HJxd__pbOpetokEJBzN6yRKw,547
|
|
@@ -752,7 +756,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
|
|
|
752
756
|
olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
|
|
753
757
|
olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
|
|
754
758
|
olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
|
|
755
|
-
ai2_olmo_eval-0.8.
|
|
756
|
-
ai2_olmo_eval-0.8.
|
|
757
|
-
ai2_olmo_eval-0.8.
|
|
758
|
-
ai2_olmo_eval-0.8.
|
|
759
|
+
ai2_olmo_eval-0.8.3.dist-info/METADATA,sha256=yEIyjzmw8MXnBMMpXEYy2N8WDwoQajTyZpiJiBvlPzM,14398
|
|
760
|
+
ai2_olmo_eval-0.8.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
761
|
+
ai2_olmo_eval-0.8.3.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
|
|
762
|
+
ai2_olmo_eval-0.8.3.dist-info/RECORD,,
|
olmo_eval/metrics.py
CHANGED
|
@@ -103,7 +103,7 @@ class ICLMetric(Metric):
|
|
|
103
103
|
choice_ids = batch["choice_ids"][idx]
|
|
104
104
|
else:
|
|
105
105
|
fast_mc = False
|
|
106
|
-
choice_ids = cont_tokens
|
|
106
|
+
choice_ids = [cont_tokens]
|
|
107
107
|
|
|
108
108
|
# For each choice token, calculate metrics and append as separate entries
|
|
109
109
|
for choice_idx, choice_token in enumerate(choice_ids):
|
|
@@ -410,8 +410,8 @@ class ICLMetric(Metric):
|
|
|
410
410
|
}
|
|
411
411
|
elif self.metric_type == "bpb":
|
|
412
412
|
return {
|
|
413
|
-
"bpb_v1":
|
|
414
|
-
"bpb_v2":
|
|
413
|
+
"bpb_v1": (sum(bpb_no_leading_space) / len(bpb_no_leading_space)).clone().detach(),
|
|
414
|
+
"bpb_v2": (sum(bpb) / len(bpb)).clone().detach(),
|
|
415
415
|
}
|
|
416
416
|
else:
|
|
417
417
|
return {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "codex_humaneval", "task_hash": "b271b0f127ae71cf79a80d6463f0c877", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "codex_humaneval", "task_core": "codex_humaneval", "limit": null, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {"answer_prefix": ""}, "generation_kwargs": {"max_gen_toks": 512, "do_sample": false, "temperature": 0.0, "stop_sequences": [], "repeats": 1}, "metric_kwargs": {"pass_at_ks": [1]}, "native_id_field": "task_id", "fewshot_source": null, "dataset_path": "openai_humaneval", "dataset_name": null, "use_chat_format": false, "version": 0.1, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "codex_humaneval:3shot:bpb::none"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.107417345046997, "current_date": "2025-05-19 20:42:07 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mbpp", "task_hash": "73a5dd2f855a4dcfec59e486a6381c7d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mbpp", "task_core": "mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {"assistant_prefix": "Here is the completed function:\n\n```python\n", "prompt_variant": "inloop_bpb"}, "generation_kwargs": {"max_gen_toks": 512, "do_sample": false, "temperature": 0.0, "stop_sequences": [], "repeats": 1}, "metric_kwargs": {"pass_at_ks": [1]}, "native_id_field": "task_id", "fewshot_source": "Original:MBPP", "dataset_path": "google-research-datasets/mbpp", "dataset_name": null, "use_chat_format": false, "version": 0.1, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mbpp:3shot:bpb::none"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.7871758937835693, "current_date": "2025-05-19 19:41:15 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:bash", "task_hash": "12bf5ff314ab6e3b192fdb28a364b610", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:bash", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "bash", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:bash"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:bash", "task_hash": "12bf5ff314ab6e3b192fdb28a364b610", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:bash", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "bash", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:bash"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.915968418121338, "current_date": "2025-05-19 19:41:18 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:c", "task_hash": "a61c21b0fd7fa57512e11b2c624dec05", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:c", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "c", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:c"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:c", "task_hash": "a61c21b0fd7fa57512e11b2c624dec05", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:c", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "c", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:c"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.5405921936035156, "current_date": "2025-05-19 19:41:20 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:cpp", "task_hash": "51069b2a5f1bf7fe9d54b54a37128b1d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:cpp", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "cpp", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:cpp"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.
|
|
1
|
+
{"task_name": "mt_mbpp:cpp", "task_hash": "51069b2a5f1bf7fe9d54b54a37128b1d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:cpp", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "cpp", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:cpp"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.374018430709839, "current_date": "2025-05-19 19:41:22 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:csharp", "task_hash": "1bd53de5a3c6987e174dc031e5496975", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:csharp", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "csharp", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:csharp"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:csharp", "task_hash": "1bd53de5a3c6987e174dc031e5496975", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:csharp", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "csharp", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:csharp"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.5499913692474365, "current_date": "2025-05-19 19:41:25 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:go", "task_hash": "ad42237d305a14bf48d22fbd7275d533", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:go", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "go", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:go"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:go", "task_hash": "ad42237d305a14bf48d22fbd7275d533", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:go", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "go", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:go"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.4711477756500244, "current_date": "2025-05-19 19:41:28 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:haskell", "task_hash": "fb523f2ace6fa704fe5ac33cf8d57c26", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:haskell", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "haskell", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:haskell"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.
|
|
1
|
+
{"task_name": "mt_mbpp:haskell", "task_hash": "fb523f2ace6fa704fe5ac33cf8d57c26", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:haskell", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "haskell", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:haskell"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.714506149291992, "current_date": "2025-05-19 19:41:31 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:java", "task_hash": "09dca3d5dc08e5549be48c7c840d4a87", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:java", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "java", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:java"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:java", "task_hash": "09dca3d5dc08e5549be48c7c840d4a87", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:java", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "java", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:java"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.388763427734375, "current_date": "2025-05-19 19:41:32 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:javascript", "task_hash": "e02e668d2bb8b66897858b7ce39eb8ea", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:javascript", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "javascript", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:javascript"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:javascript", "task_hash": "e02e668d2bb8b66897858b7ce39eb8ea", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:javascript", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "javascript", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:javascript"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.969928741455078, "current_date": "2025-05-19 19:41:36 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:matlab", "task_hash": "8d2c28b2bc33eb546714fdb3a72a8f50", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:matlab", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "matlab", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:matlab"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.
|
|
1
|
+
{"task_name": "mt_mbpp:matlab", "task_hash": "8d2c28b2bc33eb546714fdb3a72a8f50", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:matlab", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "matlab", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:matlab"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.147547721862793, "current_date": "2025-05-19 19:41:38 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:php", "task_hash": "d6319dd39349460d65796302a83f7d31", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:php", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "php", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:php"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:php", "task_hash": "d6319dd39349460d65796302a83f7d31", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:php", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "php", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:php"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.145324945449829, "current_date": "2025-05-19 19:41:39 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:python", "task_hash": "fee56e18d38a80c1118f60e81a72d442", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:python", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "python", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:python"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:python", "task_hash": "fee56e18d38a80c1118f60e81a72d442", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:python", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "python", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:python"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.339555263519287, "current_date": "2025-05-19 19:41:43 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:r", "task_hash": "d81e15e102450362af2d7171e33a40d0", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:r", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "r", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:r"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:r", "task_hash": "d81e15e102450362af2d7171e33a40d0", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:r", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "r", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:r"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.4658513069152832, "current_date": "2025-05-19 19:41:44 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:ruby", "task_hash": "295088b5bf617929bc5f6c50c3c8e178", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:ruby", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "ruby", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:ruby"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:ruby", "task_hash": "295088b5bf617929bc5f6c50c3c8e178", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:ruby", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "ruby", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:ruby"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.9486355781555176, "current_date": "2025-05-19 19:41:47 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:rust", "task_hash": "c4e090ab96af1f8b427bbf55e2f15a92", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:rust", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "rust", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:rust"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:rust", "task_hash": "c4e090ab96af1f8b427bbf55e2f15a92", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:rust", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "rust", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:rust"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.233504056930542, "current_date": "2025-05-19 19:41:48 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:scala", "task_hash": "69a440383704f4474586c6642ad58c22", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:scala", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "scala", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:scala"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:scala", "task_hash": "69a440383704f4474586c6642ad58c22", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:scala", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "scala", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:scala"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.882420539855957, "current_date": "2025-05-19 19:41:53 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:swift", "task_hash": "20d99f047a4973c156ee030770a02d10", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:swift", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "swift", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:swift"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.
|
|
1
|
+
{"task_name": "mt_mbpp:swift", "task_hash": "20d99f047a4973c156ee030770a02d10", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:swift", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "swift", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:swift"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.460864543914795, "current_date": "2025-05-19 19:41:55 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"task_name": "mt_mbpp:typescript", "task_hash": "0124047c8167d9c7b97d38642efc1c5d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:typescript", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "typescript", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:typescript"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time":
|
|
1
|
+
{"task_name": "mt_mbpp:typescript", "task_hash": "0124047c8167d9c7b97d38642efc1c5d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:typescript", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "typescript", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:typescript"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.8711166381835938, "current_date": "2025-05-19 19:41:57 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
|
|
Binary file
|
olmo_eval/tasks.py
CHANGED
|
@@ -2620,10 +2620,18 @@ LABEL_TO_TASK_MAP_EXPANDED = {
|
|
|
2620
2620
|
OEEvalTask,
|
|
2621
2621
|
{"dataset_path": "codex_humaneval", "dataset_name": "gold_bpb_0shot", "metric_type": "bpb"},
|
|
2622
2622
|
),
|
|
2623
|
+
"codex_humaneval_gold_bpb_3shot": (
|
|
2624
|
+
OEEvalTask,
|
|
2625
|
+
{"dataset_path": "codex_humaneval", "dataset_name": "gold_bpb_3shot", "metric_type": "bpb"},
|
|
2626
|
+
),
|
|
2623
2627
|
"codex_mbpp_gold_bpb_0shot": (
|
|
2624
2628
|
OEEvalTask,
|
|
2625
2629
|
{"dataset_path": "codex_mbpp", "dataset_name": "gold_bpb_0shot", "metric_type": "bpb"},
|
|
2626
2630
|
),
|
|
2631
|
+
"codex_mbpp_gold_bpb_3shot": (
|
|
2632
|
+
OEEvalTask,
|
|
2633
|
+
{"dataset_path": "codex_mbpp", "dataset_name": "gold_bpb_3shot", "metric_type": "bpb"},
|
|
2634
|
+
),
|
|
2627
2635
|
"minerva_math_algebra_gold_bpb_0shot": (
|
|
2628
2636
|
OEEvalTask,
|
|
2629
2637
|
{
|
olmo_eval/version.py
CHANGED
|
File without changes
|
|
File without changes
|