ai2-olmo-eval 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ai2_olmo_eval-0.7.2.dist-info → ai2_olmo_eval-0.8.1.dist-info}/METADATA +1 -1
- {ai2_olmo_eval-0.7.2.dist-info → ai2_olmo_eval-0.8.1.dist-info}/RECORD +44 -8
- olmo_eval/metrics.py +112 -87
- olmo_eval/oe_eval_tasks/minerva_math_500/gold_bpb_0shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/minerva_math_500/gold_bpb_0shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/config.json +1 -0
- olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/requests.jsonl.gz +0 -0
- olmo_eval/tasks.py +514 -2
- olmo_eval/version.py +2 -2
- {ai2_olmo_eval-0.7.2.dist-info → ai2_olmo_eval-0.8.1.dist-info}/WHEEL +0 -0
- {ai2_olmo_eval-0.7.2.dist-info → ai2_olmo_eval-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {ai2_olmo_eval-0.7.2.dist-info → ai2_olmo_eval-0.8.1.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
ai2_olmo_eval-0.
|
|
1
|
+
ai2_olmo_eval-0.8.1.dist-info/licenses/LICENSE,sha256=YvuKOpYh3COIF0yqq-nCMXtpS7mh1GyYvPVlW2j1G-M,11359
|
|
2
2
|
olmo_eval/__init__.py,sha256=49RxnAaJNk8U9XP3SF5MjyFIxLSkxH0vXQuZgnEOi44,283
|
|
3
|
-
olmo_eval/metrics.py,sha256=
|
|
4
|
-
olmo_eval/tasks.py,sha256=
|
|
3
|
+
olmo_eval/metrics.py,sha256=zc4IOZ8rUhxPyXVk6fOYzVKjJ4Lzq4tYeoyurxYWqY0,20034
|
|
4
|
+
olmo_eval/tasks.py,sha256=CDdDs2FMPYNQJ3nH4QhgH6iCflGVwCP7rDylBAD2ZoA,96310
|
|
5
5
|
olmo_eval/tokenizer.py,sha256=PnkidE0nAtEA1QZjuQpE_bIwgAsHxodnaJRALAPqrJQ,5127
|
|
6
6
|
olmo_eval/util.py,sha256=ARmZmRQl8VOvnKQoUprb3cOunzcApeNhRdV4BMXZuvo,3856
|
|
7
|
-
olmo_eval/version.py,sha256=
|
|
7
|
+
olmo_eval/version.py,sha256=m8LhsK0Q8YOvgVGanSxnr9xFRnJGqoISa243HGBZMfQ,308
|
|
8
8
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/data-00000-of-00001.arrow,sha256=TPWbMhBmticWjYp7TA3etcKbXbaoCDBWhxuqlD1bDJA,98080
|
|
9
9
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/dataset_info.json,sha256=iZumP5Udu8LD7cbew3o7nNpnGu-o9jPaMxUrNDDNIVY,1795
|
|
10
10
|
olmo_eval/hf_datasets/ai2_arc/ARC-Challenge/validation/state.json,sha256=6Q1XhM-HMZcymuGAKBC_8RjMBKgJSaR_6lLUO9Z8XwE,255
|
|
@@ -654,6 +654,8 @@ olmo_eval/oe_eval_tasks/hellaswag/val_mc_5shot/config.json,sha256=DaDjfSSvXBE0Nk
|
|
|
654
654
|
olmo_eval/oe_eval_tasks/hellaswag/val_mc_5shot/requests.jsonl.gz,sha256=XNpBEoMjg397dEuapS9r3UxOOPQAxmVUrYM92YBxzDs,5298017
|
|
655
655
|
olmo_eval/oe_eval_tasks/hellaswag/val_rc_5shot/config.json,sha256=5QzTjpfq260Gn-uudEcnwtNQhu5cgXDppDb4xwVKaXg,697
|
|
656
656
|
olmo_eval/oe_eval_tasks/hellaswag/val_rc_5shot/requests.jsonl.gz,sha256=eqT2QbCuP_dnNHVI-QLSl7GANmSJ97vc6cW6CNaxpOk,8715020
|
|
657
|
+
olmo_eval/oe_eval_tasks/minerva_math_500/gold_bpb_0shot/config.json,sha256=knOyrHZQID7U0MHMU-pY5-yH8wAFQfyKfe6TOKVzj1Q,1820
|
|
658
|
+
olmo_eval/oe_eval_tasks/minerva_math_500/gold_bpb_0shot/requests.jsonl.gz,sha256=Ijap4745EB4KAkRQYGv5O2E7u9U4bvQegKc_88kTfVw,187400
|
|
657
659
|
olmo_eval/oe_eval_tasks/minerva_math_algebra/gold_bpb_0shot/config.json,sha256=dnEBrtnEVI1xwxrnXhRwAY1D9TgkBTO-2FDuGnDZlyo,1729
|
|
658
660
|
olmo_eval/oe_eval_tasks/minerva_math_algebra/gold_bpb_0shot/requests.jsonl.gz,sha256=aaMtZp8Pdm_lsf0K9lVGowD-3622GgWh8gPB57PVypk,354806
|
|
659
661
|
olmo_eval/oe_eval_tasks/minerva_math_counting_and_probability/gold_bpb_0shot/config.json,sha256=ma_4PuBS0_HmU4DJlktEp-YtZSDjBKn_n5BuIaUns0U,1815
|
|
@@ -668,6 +670,40 @@ olmo_eval/oe_eval_tasks/minerva_math_prealgebra/gold_bpb_0shot/config.json,sha25
|
|
|
668
670
|
olmo_eval/oe_eval_tasks/minerva_math_prealgebra/gold_bpb_0shot/requests.jsonl.gz,sha256=YoiSOp8JH-I0y0ppgnTcLNbF7sLVHrHFbWI8K6cBj_M,267414
|
|
669
671
|
olmo_eval/oe_eval_tasks/minerva_math_precalculus/gold_bpb_0shot/config.json,sha256=-ex7VPSuyCoVHEkGw___E9gv8Jcfr8BWMEMO0Ber3H8,1750
|
|
670
672
|
olmo_eval/oe_eval_tasks/minerva_math_precalculus/gold_bpb_0shot/requests.jsonl.gz,sha256=A7b4WGo5qLCl3s1z5HBbEUpoJwva12doE_WRc3un6Fs,223616
|
|
673
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/config.json,sha256=7atQztNncbIzRR1KIGKIUFVTKTEh7Qzq0BCI1wX5MbM,1741
|
|
674
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_bash/gold_bpb_3shot/requests.jsonl.gz,sha256=vVfvG9JH07yLnUQS7GL08Fk6qssLLYf9ILRDkNdaCMU,86297
|
|
675
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/config.json,sha256=9nrxe2A8-C6Sgoili94D9m5CrswYKcMujSBh0YXf2Xk,1729
|
|
676
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_c/gold_bpb_3shot/requests.jsonl.gz,sha256=ioL2fQn_Xgj85cJ0Oovv0Yc0rN5nMfXBm-Wy4D3JpOs,114627
|
|
677
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/config.json,sha256=nrQeUhyBk7haqKGkM5McOnKgAtgDPy54VwgRaYAF-WM,1736
|
|
678
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_cpp/gold_bpb_3shot/requests.jsonl.gz,sha256=Ki-kaNCTKXNqEC3_91nQxcgOWo88ckDN-xasq4E-Chk,94905
|
|
679
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/config.json,sha256=UYLjhduEO6SMiM45aHxNTfFiuH3fchtumrrbd3TTjng,1749
|
|
680
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_csharp/gold_bpb_3shot/requests.jsonl.gz,sha256=eEt6c0kO9GvnE8mu97479o8dtm40Qz44e3MAR3LkJMA,95599
|
|
681
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/config.json,sha256=G76DhavPWRsZV0ysHyLmbT6pULrRj3ItRoAcc46bruE,1733
|
|
682
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_go/gold_bpb_3shot/requests.jsonl.gz,sha256=Smy_aJvmpP7zmSp56dobOZ9e0YVyniWPvczZgK6KLrk,93557
|
|
683
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/config.json,sha256=jNlAdeY8xk6QpyfWR3aaqmZisnRO1ctG-aJet3hOvic,1753
|
|
684
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_haskell/gold_bpb_3shot/requests.jsonl.gz,sha256=4sdH9VyO8o5tUR0UCcZ-Wd7oGGxlx2GR5dZc6vtHulM,77028
|
|
685
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/config.json,sha256=zZCOL6qVMAJN7OtSVa8EQCQ8wjEzzjZYyPqxRZogxvQ,1741
|
|
686
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_java/gold_bpb_3shot/requests.jsonl.gz,sha256=_5D1kfdHd18A-04FgVZu3rSsYUJ0U_pNQ2umRShXIl0,98419
|
|
687
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/config.json,sha256=kNRgtsUx5d1wRfAybiQm2ZeCBitrNn9bD3i8kUNc8uI,1765
|
|
688
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_javascript/gold_bpb_3shot/requests.jsonl.gz,sha256=BR-FGU3ya2ydzZJg9CaONaSZA2XZeHsbEL719YAE4vY,88546
|
|
689
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/config.json,sha256=FV3bj1KJXEior3ARIcOBgDTSg2lTT2J2ffk8BU0uvXM,1747
|
|
690
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_matlab/gold_bpb_3shot/requests.jsonl.gz,sha256=si7mTPuxIWZWV_ZpIoTWPAqT0grY9Iz_0nFUXfATyYE,74734
|
|
691
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/config.json,sha256=244aDI1B5OY7xlEeZFiagT95VvUdYsXmhW4OUzU4HK8,1737
|
|
692
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_php/gold_bpb_3shot/requests.jsonl.gz,sha256=jZe3jXzIMPIrBnkS4Br73S9J6B3KGqIlY_G3fN7HAnE,80071
|
|
693
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/config.json,sha256=N3k5lq8ZKlFxQADDYLAEZNFO4Ih36P68OR2n6V-jvOU,1749
|
|
694
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_python/gold_bpb_3shot/requests.jsonl.gz,sha256=V-BMGYbJ8Pg3Yhn8_69FkY9Ie9-FPK7U7RkMqX1HR34,72745
|
|
695
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/config.json,sha256=DOygII5OdqWrL4V6-pgZGyi2eJeQFbiPSu5PEp8T2qg,1729
|
|
696
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_r/gold_bpb_3shot/requests.jsonl.gz,sha256=sl59osruluqKuB9c-1Aw7frYyx9h7oDHLYfJHrdGzfQ,72929
|
|
697
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/config.json,sha256=C9QLr7Gt3Fj4pBsGCN_G76nOrsWOvKdsxLe_bV11xA0,1741
|
|
698
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_ruby/gold_bpb_3shot/requests.jsonl.gz,sha256=BTClvjaMUc2Zoee4mGz5V4_nFlqrJ9RBHwjcQhCTwKg,65784
|
|
699
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/config.json,sha256=RDY_cNdm-SJ5c2dqeS6HaXpuzDsqiTWL2k6_nfyH2y8,1741
|
|
700
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_rust/gold_bpb_3shot/requests.jsonl.gz,sha256=8-elUeFVXXUq1jdtOM_JyqBh6wsaBwDOzsi3TYuSP-c,86683
|
|
701
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/config.json,sha256=jw2Mtff9RziYyJ-ANRUIm1gdiBTdjH7_lWUb87pbCRs,1744
|
|
702
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_scala/gold_bpb_3shot/requests.jsonl.gz,sha256=pGY_jkOqSMwP1vo0-pxu-bldKsyEOuEnUQcCr3vcBhQ,78441
|
|
703
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/config.json,sha256=EtDGZxog6XDwtgaIihMphqLWSYOA9wI6EosT7ahMJr8,1745
|
|
704
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_swift/gold_bpb_3shot/requests.jsonl.gz,sha256=wuToc6AXf769kBG4-jwo1i0RW4aVmoyO-TOT7GgBCs0,101695
|
|
705
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/config.json,sha256=xFVh4sB8EXecyGyRJIqjiJxvXdvn7ihq2JQ4o5fJgvg,1765
|
|
706
|
+
olmo_eval/oe_eval_tasks/mt_mbpp_typescript/gold_bpb_3shot/requests.jsonl.gz,sha256=1FvsI8wBPMzsPQTE-Fou-EMupLVBr4u6x0sSiDsjAi8,99206
|
|
671
707
|
olmo_eval/oe_eval_tasks/openbookqa/mc_5shot/config.json,sha256=RfZ4pfvROPvMIMmVhlr_3xWGLz5mVOqsdWCHTdu1eIY,612
|
|
672
708
|
olmo_eval/oe_eval_tasks/openbookqa/mc_5shot/requests.jsonl.gz,sha256=oQ4gkIyQ5ZkZa-ceBvETqlrRs8GJCgsn6djAV6ubFgU,75731
|
|
673
709
|
olmo_eval/oe_eval_tasks/openbookqa/rc_0shot/config.json,sha256=bOBTOqksTpGGJYJ2419HJxd__pbOpetokEJBzN6yRKw,547
|
|
@@ -716,7 +752,7 @@ olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/config.json,sha256=ySjEVqTOj5GwC
|
|
|
716
752
|
olmo_eval/oe_eval_tasks/winogrande/val_rc_5shot/requests.jsonl.gz,sha256=knTzcqigWCfdYLN1Pl0TfCm0Fi1lRASWAo_SC6KtXsc,115262
|
|
717
753
|
olmo_eval/tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json,sha256=yjXYcnpTO7Zjm_R4Gucrn9oA5paadiYM-ZZER5q_EXc,2114319
|
|
718
754
|
olmo_eval/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json,sha256=mtM7Szmp-Dlzw_jEKgGUjdW4d6KKyaU1aVbE_07QtxQ,2115113
|
|
719
|
-
ai2_olmo_eval-0.
|
|
720
|
-
ai2_olmo_eval-0.
|
|
721
|
-
ai2_olmo_eval-0.
|
|
722
|
-
ai2_olmo_eval-0.
|
|
755
|
+
ai2_olmo_eval-0.8.1.dist-info/METADATA,sha256=w5VIenUlcbF_DlzzrQD4XZ-2rfAvemdioDBFPUyd0WM,14398
|
|
756
|
+
ai2_olmo_eval-0.8.1.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
|
757
|
+
ai2_olmo_eval-0.8.1.dist-info/top_level.txt,sha256=Pryk28JTb89-j624Uy1gRZiE0YXI3czgbNIfJCl9-x0,10
|
|
758
|
+
ai2_olmo_eval-0.8.1.dist-info/RECORD,,
|
olmo_eval/metrics.py
CHANGED
|
@@ -98,96 +98,121 @@ class ICLMetric(Metric):
|
|
|
98
98
|
batch["ctx_len"][idx] - 1 : batch["ctx_len"][idx] + batch["cont_len"][idx] - 1
|
|
99
99
|
]
|
|
100
100
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
/ batch["cont_byte_len"][idx]
|
|
135
|
-
* LOG_2_OF_E
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
log_likelihood_no_leading_space = torch.gather(
|
|
139
|
-
lm_cont_logits, 1, cont_tokens.unsqueeze(-1)
|
|
140
|
-
).sum()
|
|
141
|
-
celoss_no_leading_space = (
|
|
142
|
-
-torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
|
|
143
|
-
/ batch["cont_str_len_no_leading_space"][idx]
|
|
144
|
-
)
|
|
145
|
-
bpb_no_leading_space = (
|
|
146
|
-
-torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
|
|
147
|
-
/ batch["cont_byte_len_no_leading_space"][idx]
|
|
148
|
-
* LOG_2_OF_E
|
|
149
|
-
)
|
|
150
|
-
elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
|
|
151
|
-
log_likelihood = (
|
|
152
|
-
torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
|
|
153
|
-
/ batch["cont_str_len"][idx]
|
|
154
|
-
)
|
|
155
|
-
celoss = (
|
|
156
|
-
-torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
|
|
157
|
-
/ batch["cont_str_len"][idx]
|
|
158
|
-
)
|
|
159
|
-
bpb = (
|
|
160
|
-
-torch.gather(lm_cont_logits, 1, cont_tokens.unsqueeze(-1)).sum()
|
|
161
|
-
/ batch["cont_byte_len"][idx]
|
|
162
|
-
* LOG_2_OF_E
|
|
163
|
-
)
|
|
101
|
+
if "choice_ids" in batch:
|
|
102
|
+
fast_mc = True
|
|
103
|
+
choice_ids = batch["choice_ids"][idx]
|
|
104
|
+
else:
|
|
105
|
+
fast_mc = False
|
|
106
|
+
choice_ids = cont_tokens
|
|
107
|
+
|
|
108
|
+
# For each choice token, calculate metrics and append as separate entries
|
|
109
|
+
for choice_idx, choice_token in enumerate(choice_ids):
|
|
110
|
+
if fast_mc:
|
|
111
|
+
_cont_id = choice_idx
|
|
112
|
+
_cont_tokens = choice_token.unsqueeze(-1)
|
|
113
|
+
else:
|
|
114
|
+
_cont_id = cont_id
|
|
115
|
+
_cont_tokens = cont_tokens
|
|
116
|
+
|
|
117
|
+
# Skip choices for Qs with less than the max choices (for questions w/ different numbers of choices)
|
|
118
|
+
is_empty_choice = (choice_token.unsqueeze(-1).unsqueeze(-1) == -1).all().item()
|
|
119
|
+
if is_empty_choice:
|
|
120
|
+
continue
|
|
121
|
+
|
|
122
|
+
log_likelihood: torch.Tensor
|
|
123
|
+
celoss: torch.Tensor
|
|
124
|
+
bpb: torch.Tensor
|
|
125
|
+
log_likelihood_no_leading_space: torch.Tensor
|
|
126
|
+
celoss_no_leading_space: torch.Tensor
|
|
127
|
+
bpb_no_leading_space: torch.Tensor
|
|
128
|
+
if self.metric_type == "pmi_dc":
|
|
129
|
+
assert dc_lm_logits is not None
|
|
130
|
+
# get domain conditional continuation logits: [cont_len, vocab]
|
|
131
|
+
dc_lm_cont_logits = dc_lm_logits[idx][
|
|
132
|
+
batch["dc_len"][idx] - 1 : batch["dc_len"][idx] + batch["cont_len"][idx] - 1
|
|
133
|
+
]
|
|
164
134
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
135
|
+
# gather log-probs at continuation token indices but divide by domain conditional prob
|
|
136
|
+
log_likelihood = (
|
|
137
|
+
torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
138
|
+
/ torch.gather(dc_lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
139
|
+
)
|
|
140
|
+
celoss = -log_likelihood
|
|
141
|
+
bpb = -log_likelihood # the normalization factors cancel out
|
|
142
|
+
|
|
143
|
+
log_likelihood_no_leading_space = log_likelihood
|
|
144
|
+
celoss_no_leading_space = celoss
|
|
145
|
+
bpb_no_leading_space = bpb
|
|
146
|
+
elif self.metric_type == "acc" or self.metric_type == "f1":
|
|
147
|
+
# gather log-probs at continuation token indices
|
|
148
|
+
log_likelihood = torch.gather(
|
|
149
|
+
lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)
|
|
150
|
+
).sum()
|
|
151
|
+
celoss = (
|
|
152
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
153
|
+
/ batch["cont_str_len"][idx]
|
|
154
|
+
)
|
|
155
|
+
bpb = (
|
|
156
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
157
|
+
/ batch["cont_byte_len"][idx]
|
|
158
|
+
* LOG_2_OF_E
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
log_likelihood_no_leading_space = torch.gather(
|
|
162
|
+
lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)
|
|
163
|
+
).sum()
|
|
164
|
+
celoss_no_leading_space = (
|
|
165
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
166
|
+
/ batch["cont_str_len_no_leading_space"][idx]
|
|
167
|
+
)
|
|
168
|
+
bpb_no_leading_space = (
|
|
169
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
170
|
+
/ batch["cont_byte_len_no_leading_space"][idx]
|
|
171
|
+
* LOG_2_OF_E
|
|
172
|
+
)
|
|
173
|
+
elif self.metric_type in ["len_norm", "ce_loss", "bpb"]:
|
|
174
|
+
log_likelihood = (
|
|
175
|
+
torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
176
|
+
/ batch["cont_str_len"][idx]
|
|
177
|
+
)
|
|
178
|
+
celoss = (
|
|
179
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
180
|
+
/ batch["cont_str_len"][idx]
|
|
181
|
+
)
|
|
182
|
+
bpb = (
|
|
183
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
184
|
+
/ batch["cont_byte_len"][idx]
|
|
185
|
+
* LOG_2_OF_E
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
log_likelihood_no_leading_space = (
|
|
189
|
+
torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
190
|
+
/ batch["cont_str_len_no_leading_space"][idx]
|
|
191
|
+
)
|
|
192
|
+
celoss_no_leading_space = (
|
|
193
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
194
|
+
/ batch["cont_str_len_no_leading_space"][idx]
|
|
195
|
+
)
|
|
196
|
+
bpb_no_leading_space = (
|
|
197
|
+
-torch.gather(lm_cont_logits, 1, _cont_tokens.unsqueeze(-1)).sum()
|
|
198
|
+
/ batch["cont_byte_len_no_leading_space"][idx]
|
|
199
|
+
* LOG_2_OF_E
|
|
200
|
+
)
|
|
201
|
+
else:
|
|
202
|
+
raise ValueError(self.metric_type)
|
|
203
|
+
|
|
204
|
+
self.labels.append((doc_id, _cont_id, int(batch["label_id"][idx])))
|
|
205
|
+
self.loglikelihoods.append((doc_id, _cont_id, float(log_likelihood)))
|
|
206
|
+
self.celosses.append((doc_id, _cont_id, float(celoss)))
|
|
207
|
+
self.bpbs.append((doc_id, _cont_id, float(bpb)))
|
|
208
|
+
|
|
209
|
+
self.loglikelihoods_no_leading_space.append(
|
|
210
|
+
(doc_id, _cont_id, float(log_likelihood_no_leading_space))
|
|
172
211
|
)
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
/ batch["cont_byte_len_no_leading_space"][idx]
|
|
176
|
-
* LOG_2_OF_E
|
|
212
|
+
self.celosses_no_leading_space.append(
|
|
213
|
+
(doc_id, _cont_id, float(celoss_no_leading_space))
|
|
177
214
|
)
|
|
178
|
-
|
|
179
|
-
raise ValueError(self.metric_type)
|
|
180
|
-
|
|
181
|
-
self.labels.append((doc_id, cont_id, int(batch["label_id"][idx])))
|
|
182
|
-
self.loglikelihoods.append((doc_id, cont_id, float(log_likelihood)))
|
|
183
|
-
self.celosses.append((doc_id, cont_id, float(celoss)))
|
|
184
|
-
self.bpbs.append((doc_id, cont_id, float(bpb)))
|
|
185
|
-
|
|
186
|
-
self.loglikelihoods_no_leading_space.append(
|
|
187
|
-
(doc_id, cont_id, float(log_likelihood_no_leading_space))
|
|
188
|
-
)
|
|
189
|
-
self.celosses_no_leading_space.append((doc_id, cont_id, float(celoss_no_leading_space)))
|
|
190
|
-
self.bpbs_no_leading_space.append((doc_id, cont_id, float(bpb_no_leading_space)))
|
|
215
|
+
self.bpbs_no_leading_space.append((doc_id, _cont_id, float(bpb_no_leading_space)))
|
|
191
216
|
|
|
192
217
|
def compute(self) -> Dict[str, torch.Tensor]:
|
|
193
218
|
# Task "suffix" -> tensor
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "minerva_math_500", "task_hash": "75c1b390d73949780c88a5ff49948b5d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "minerva_math_500", "task_core": "minerva_math_500", "limit": null, "split": "test", "num_shots": 4, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {"use_cot": true, "cot_style": "minerva"}, "generation_kwargs": {"max_gen_toks": 1024, "temperature": 0.0, "do_sample": false, "stop_sequences": ["Problem:", "\n\n"]}, "metric_kwargs": {}, "native_id_field": "index", "fewshot_source": "Minerva:MATH:fixed", "dataset_path": "HuggingFaceH4/MATH-500", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "minerva_math_500:bpb::olmes"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.9896588325500488, "current_date": "2025-05-18 01:08:29 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:bash", "task_hash": "12bf5ff314ab6e3b192fdb28a364b610", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:bash", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "bash", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:bash"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.0172407627105713, "current_date": "2025-05-18 01:08:32 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:c", "task_hash": "a61c21b0fd7fa57512e11b2c624dec05", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:c", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "c", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:c"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.3485467433929443, "current_date": "2025-05-18 01:08:34 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:cpp", "task_hash": "51069b2a5f1bf7fe9d54b54a37128b1d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:cpp", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "cpp", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:cpp"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.267606496810913, "current_date": "2025-05-18 01:08:36 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:csharp", "task_hash": "1bd53de5a3c6987e174dc031e5496975", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:csharp", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "csharp", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:csharp"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.9407422542572021, "current_date": "2025-05-18 01:08:38 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:go", "task_hash": "ad42237d305a14bf48d22fbd7275d533", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:go", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "go", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:go"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.7393126487731934, "current_date": "2025-05-18 01:08:40 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:haskell", "task_hash": "fb523f2ace6fa704fe5ac33cf8d57c26", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:haskell", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "haskell", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:haskell"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.2595787048339844, "current_date": "2025-05-18 01:08:42 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:java", "task_hash": "09dca3d5dc08e5549be48c7c840d4a87", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:java", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "java", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:java"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.4217495918273926, "current_date": "2025-05-18 01:08:45 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:javascript", "task_hash": "e02e668d2bb8b66897858b7ce39eb8ea", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:javascript", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "javascript", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:javascript"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.7279460430145264, "current_date": "2025-05-18 01:08:47 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:matlab", "task_hash": "8d2c28b2bc33eb546714fdb3a72a8f50", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:matlab", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "matlab", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:matlab"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.80657958984375, "current_date": "2025-05-18 01:08:49 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:php", "task_hash": "d6319dd39349460d65796302a83f7d31", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:php", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "php", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:php"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.0057003498077393, "current_date": "2025-05-18 01:08:51 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:python", "task_hash": "fee56e18d38a80c1118f60e81a72d442", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:python", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "python", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:python"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.9594206809997559, "current_date": "2025-05-18 01:08:53 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:r", "task_hash": "d81e15e102450362af2d7171e33a40d0", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:r", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "r", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:r"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.6297383308410645, "current_date": "2025-05-18 01:08:56 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:ruby", "task_hash": "295088b5bf617929bc5f6c50c3c8e178", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:ruby", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "ruby", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:ruby"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.7374136447906494, "current_date": "2025-05-18 01:08:58 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:rust", "task_hash": "c4e090ab96af1f8b427bbf55e2f15a92", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:rust", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "rust", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:rust"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.1582589149475098, "current_date": "2025-05-18 01:09:00 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:scala", "task_hash": "69a440383704f4474586c6642ad58c22", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:scala", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "scala", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:scala"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.035874366760254, "current_date": "2025-05-18 01:09:02 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:swift", "task_hash": "20d99f047a4973c156ee030770a02d10", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:swift", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "swift", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:swift"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1.7264349460601807, "current_date": "2025-05-18 01:09:04 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"task_name": "mt_mbpp:typescript", "task_hash": "0124047c8167d9c7b97d38642efc1c5d", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mt_mbpp:typescript", "task_core": "mt_mbpp", "limit": 500, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {}, "generation_kwargs": {"max_gen_toks": 512, "temperature": 0.0, "do_sample": false, "stop_sequences": ["\n\n"]}, "metric_kwargs": {}, "native_id_field": "task_id", "fewshot_source": "multilingual_mbpp", "dataset_path": "allenai/multilingual_mbpp", "dataset_name": "typescript", "use_chat_format": null, "version": 0, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "mt_mbpp:typescript"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.5021820068359375, "current_date": "2025-05-18 01:09:06 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5Q99TVRRHS826AQSXY3H", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVDSK0DTYMJWXCKFBRFNA24V", "BEAKER_WORKLOAD_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ENVIRONMENT_ID": "01JVDSK0DTA4GTNDS6ANJS0352", "BEAKER_ASSIGNED_CPU_COUNT": "31.875", "BEAKER_ASSIGNED_GPU_COUNT": "1", "BEAKER_NODE_HOSTNAME": "triton-cs-aus-454.reviz.ai2.in"}}
|