wisent 0.7.901__py3-none-any.whl → 0.7.1045__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wisent/__init__.py +1 -1
- wisent/comparison/__init__.py +1 -0
- wisent/comparison/detect_bos_features.py +275 -0
- wisent/comparison/fgaa.py +465 -0
- wisent/comparison/lora.py +669 -0
- wisent/comparison/lora_dpo.py +592 -0
- wisent/comparison/main.py +444 -0
- wisent/comparison/ours.py +76 -0
- wisent/comparison/sae.py +304 -0
- wisent/comparison/utils.py +381 -0
- wisent/core/activations/activations_collector.py +3 -2
- wisent/core/activations/extraction_strategy.py +8 -4
- wisent/core/cli/agent/apply_steering.py +7 -5
- wisent/core/cli/agent/train_classifier.py +4 -3
- wisent/core/cli/generate_vector_from_task.py +11 -20
- wisent/core/cli/get_activations.py +1 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +20 -3
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +8 -1
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +8 -1
- wisent/core/parser_arguments/generate_vector_from_task_parser.py +4 -11
- wisent/core/parser_arguments/get_activations_parser.py +5 -14
- {wisent-0.7.901.dist-info → wisent-0.7.1045.dist-info}/METADATA +5 -1
- {wisent-0.7.901.dist-info → wisent-0.7.1045.dist-info}/RECORD +27 -91
- wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
- wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
- wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
- wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
- wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
- wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
- wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
- wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
- wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
- wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
- wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
- wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
- wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
- wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
- wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
- wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
- wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
- wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
- wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
- wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
- wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
- wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
- wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
- wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
- wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
- wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
- wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
- wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
- wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
- wisent/examples/scripts/1/test_cola_pairs.json +0 -8
- wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
- wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
- wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
- wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
- wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
- wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
- wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
- wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
- wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
- wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
- wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
- wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
- wisent/examples/scripts/2/test_atis_pairs.json +0 -8
- wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
- wisent/examples/scripts/2/test_babi_pairs.json +0 -8
- wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
- wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
- wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
- wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
- wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
- wisent/examples/scripts/generate_paper_data.py +0 -384
- wisent/examples/scripts/intervention_validation.py +0 -626
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +0 -324
- wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +0 -92
- wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +0 -324
- wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +0 -92
- wisent/examples/scripts/results/test_afrimgsm_pairs.json +0 -92
- wisent/examples/scripts/results/test_afrimmlu_evaluation.json +0 -324
- wisent/examples/scripts/results/test_afrimmlu_pairs.json +0 -92
- wisent/examples/scripts/threshold_analysis.py +0 -434
- wisent/examples/scripts/visualization_gallery.py +0 -582
- {wisent-0.7.901.dist-info → wisent-0.7.1045.dist-info}/WHEEL +0 -0
- {wisent-0.7.901.dist-info → wisent-0.7.1045.dist-info}/entry_points.txt +0 -0
- {wisent-0.7.901.dist-info → wisent-0.7.1045.dist-info}/licenses/LICENSE +0 -0
- {wisent-0.7.901.dist-info → wisent-0.7.1045.dist-info}/top_level.txt +0 -0
|
@@ -1,324 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"task_name": "afrimmlu",
|
|
3
|
-
"model_name": "mock",
|
|
4
|
-
"evaluator_name": "log_likelihoods",
|
|
5
|
-
"num_pairs": 15,
|
|
6
|
-
"all_correct": true,
|
|
7
|
-
"pairs": [
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 0,
|
|
10
|
-
"prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?...",
|
|
11
|
-
"positive_response": "[",
|
|
12
|
-
"negative_response": "'",
|
|
13
|
-
"positive_evaluation": {
|
|
14
|
-
"ground_truth": "TRUTHFUL",
|
|
15
|
-
"confidence": 1.0,
|
|
16
|
-
"expected": "TRUTHFUL",
|
|
17
|
-
"correct": true,
|
|
18
|
-
"details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
|
|
19
|
-
},
|
|
20
|
-
"negative_evaluation": {
|
|
21
|
-
"ground_truth": "UNTRUTHFUL",
|
|
22
|
-
"confidence": 0.0,
|
|
23
|
-
"expected": "UNTRUTHFUL",
|
|
24
|
-
"correct": true,
|
|
25
|
-
"details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
|
|
26
|
-
},
|
|
27
|
-
"both_correct": true
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"pair_id": 1,
|
|
31
|
-
"prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362...",
|
|
32
|
-
"positive_response": "'",
|
|
33
|
-
"negative_response": "I",
|
|
34
|
-
"positive_evaluation": {
|
|
35
|
-
"ground_truth": "TRUTHFUL",
|
|
36
|
-
"confidence": 1.0,
|
|
37
|
-
"expected": "TRUTHFUL",
|
|
38
|
-
"correct": true,
|
|
39
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
40
|
-
},
|
|
41
|
-
"negative_evaluation": {
|
|
42
|
-
"ground_truth": "UNTRUTHFUL",
|
|
43
|
-
"confidence": 0.0,
|
|
44
|
-
"expected": "UNTRUTHFUL",
|
|
45
|
-
"correct": true,
|
|
46
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: 'I'"
|
|
47
|
-
},
|
|
48
|
-
"both_correct": true
|
|
49
|
-
},
|
|
50
|
-
{
|
|
51
|
-
"pair_id": 2,
|
|
52
|
-
"prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka...",
|
|
53
|
-
"positive_response": "5",
|
|
54
|
-
"negative_response": "%",
|
|
55
|
-
"positive_evaluation": {
|
|
56
|
-
"ground_truth": "TRUTHFUL",
|
|
57
|
-
"confidence": 1.0,
|
|
58
|
-
"expected": "TRUTHFUL",
|
|
59
|
-
"correct": true,
|
|
60
|
-
"details": "Predicted: '5' (log_prob=-0.500), Expected: '5'"
|
|
61
|
-
},
|
|
62
|
-
"negative_evaluation": {
|
|
63
|
-
"ground_truth": "UNTRUTHFUL",
|
|
64
|
-
"confidence": 0.0,
|
|
65
|
-
"expected": "UNTRUTHFUL",
|
|
66
|
-
"correct": true,
|
|
67
|
-
"details": "Predicted: '5' (log_prob=-0.500), Expected: '%'"
|
|
68
|
-
},
|
|
69
|
-
"both_correct": true
|
|
70
|
-
},
|
|
71
|
-
{
|
|
72
|
-
"pair_id": 3,
|
|
73
|
-
"prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?...",
|
|
74
|
-
"positive_response": "'",
|
|
75
|
-
"negative_response": "8",
|
|
76
|
-
"positive_evaluation": {
|
|
77
|
-
"ground_truth": "TRUTHFUL",
|
|
78
|
-
"confidence": 1.0,
|
|
79
|
-
"expected": "TRUTHFUL",
|
|
80
|
-
"correct": true,
|
|
81
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
82
|
-
},
|
|
83
|
-
"negative_evaluation": {
|
|
84
|
-
"ground_truth": "UNTRUTHFUL",
|
|
85
|
-
"confidence": 0.0,
|
|
86
|
-
"expected": "UNTRUTHFUL",
|
|
87
|
-
"correct": true,
|
|
88
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '8'"
|
|
89
|
-
},
|
|
90
|
-
"both_correct": true
|
|
91
|
-
},
|
|
92
|
-
{
|
|
93
|
-
"pair_id": 4,
|
|
94
|
-
"prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?...",
|
|
95
|
-
"positive_response": "'",
|
|
96
|
-
"negative_response": "U",
|
|
97
|
-
"positive_evaluation": {
|
|
98
|
-
"ground_truth": "TRUTHFUL",
|
|
99
|
-
"confidence": 1.0,
|
|
100
|
-
"expected": "TRUTHFUL",
|
|
101
|
-
"correct": true,
|
|
102
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
103
|
-
},
|
|
104
|
-
"negative_evaluation": {
|
|
105
|
-
"ground_truth": "UNTRUTHFUL",
|
|
106
|
-
"confidence": 0.0,
|
|
107
|
-
"expected": "UNTRUTHFUL",
|
|
108
|
-
"correct": true,
|
|
109
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: 'U'"
|
|
110
|
-
},
|
|
111
|
-
"both_correct": true
|
|
112
|
-
},
|
|
113
|
-
{
|
|
114
|
-
"pair_id": 5,
|
|
115
|
-
"prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?...",
|
|
116
|
-
"positive_response": "'",
|
|
117
|
-
"negative_response": "5",
|
|
118
|
-
"positive_evaluation": {
|
|
119
|
-
"ground_truth": "TRUTHFUL",
|
|
120
|
-
"confidence": 1.0,
|
|
121
|
-
"expected": "TRUTHFUL",
|
|
122
|
-
"correct": true,
|
|
123
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
124
|
-
},
|
|
125
|
-
"negative_evaluation": {
|
|
126
|
-
"ground_truth": "UNTRUTHFUL",
|
|
127
|
-
"confidence": 0.0,
|
|
128
|
-
"expected": "UNTRUTHFUL",
|
|
129
|
-
"correct": true,
|
|
130
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
|
|
131
|
-
},
|
|
132
|
-
"both_correct": true
|
|
133
|
-
},
|
|
134
|
-
{
|
|
135
|
-
"pair_id": 6,
|
|
136
|
-
"prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm....",
|
|
137
|
-
"positive_response": "[",
|
|
138
|
-
"negative_response": "'",
|
|
139
|
-
"positive_evaluation": {
|
|
140
|
-
"ground_truth": "TRUTHFUL",
|
|
141
|
-
"confidence": 1.0,
|
|
142
|
-
"expected": "TRUTHFUL",
|
|
143
|
-
"correct": true,
|
|
144
|
-
"details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
|
|
145
|
-
},
|
|
146
|
-
"negative_evaluation": {
|
|
147
|
-
"ground_truth": "UNTRUTHFUL",
|
|
148
|
-
"confidence": 0.0,
|
|
149
|
-
"expected": "UNTRUTHFUL",
|
|
150
|
-
"correct": true,
|
|
151
|
-
"details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
|
|
152
|
-
},
|
|
153
|
-
"both_correct": true
|
|
154
|
-
},
|
|
155
|
-
{
|
|
156
|
-
"pair_id": 7,
|
|
157
|
-
"prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?...",
|
|
158
|
-
"positive_response": "a",
|
|
159
|
-
"negative_response": "k",
|
|
160
|
-
"positive_evaluation": {
|
|
161
|
-
"ground_truth": "TRUTHFUL",
|
|
162
|
-
"confidence": 1.0,
|
|
163
|
-
"expected": "TRUTHFUL",
|
|
164
|
-
"correct": true,
|
|
165
|
-
"details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
|
|
166
|
-
},
|
|
167
|
-
"negative_evaluation": {
|
|
168
|
-
"ground_truth": "UNTRUTHFUL",
|
|
169
|
-
"confidence": 0.0,
|
|
170
|
-
"expected": "UNTRUTHFUL",
|
|
171
|
-
"correct": true,
|
|
172
|
-
"details": "Predicted: 'a' (log_prob=-0.500), Expected: 'k'"
|
|
173
|
-
},
|
|
174
|
-
"both_correct": true
|
|
175
|
-
},
|
|
176
|
-
{
|
|
177
|
-
"pair_id": 8,
|
|
178
|
-
"prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?...",
|
|
179
|
-
"positive_response": "a",
|
|
180
|
-
"negative_response": "b",
|
|
181
|
-
"positive_evaluation": {
|
|
182
|
-
"ground_truth": "TRUTHFUL",
|
|
183
|
-
"confidence": 1.0,
|
|
184
|
-
"expected": "TRUTHFUL",
|
|
185
|
-
"correct": true,
|
|
186
|
-
"details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
|
|
187
|
-
},
|
|
188
|
-
"negative_evaluation": {
|
|
189
|
-
"ground_truth": "UNTRUTHFUL",
|
|
190
|
-
"confidence": 0.0,
|
|
191
|
-
"expected": "UNTRUTHFUL",
|
|
192
|
-
"correct": true,
|
|
193
|
-
"details": "Predicted: 'a' (log_prob=-0.500), Expected: 'b'"
|
|
194
|
-
},
|
|
195
|
-
"both_correct": true
|
|
196
|
-
},
|
|
197
|
-
{
|
|
198
|
-
"pair_id": 9,
|
|
199
|
-
"prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?...",
|
|
200
|
-
"positive_response": "'",
|
|
201
|
-
"negative_response": "5",
|
|
202
|
-
"positive_evaluation": {
|
|
203
|
-
"ground_truth": "TRUTHFUL",
|
|
204
|
-
"confidence": 1.0,
|
|
205
|
-
"expected": "TRUTHFUL",
|
|
206
|
-
"correct": true,
|
|
207
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
208
|
-
},
|
|
209
|
-
"negative_evaluation": {
|
|
210
|
-
"ground_truth": "UNTRUTHFUL",
|
|
211
|
-
"confidence": 0.0,
|
|
212
|
-
"expected": "UNTRUTHFUL",
|
|
213
|
-
"correct": true,
|
|
214
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
|
|
215
|
-
},
|
|
216
|
-
"both_correct": true
|
|
217
|
-
},
|
|
218
|
-
{
|
|
219
|
-
"pair_id": 10,
|
|
220
|
-
"prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?...",
|
|
221
|
-
"positive_response": "'",
|
|
222
|
-
"negative_response": "A",
|
|
223
|
-
"positive_evaluation": {
|
|
224
|
-
"ground_truth": "TRUTHFUL",
|
|
225
|
-
"confidence": 1.0,
|
|
226
|
-
"expected": "TRUTHFUL",
|
|
227
|
-
"correct": true,
|
|
228
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
229
|
-
},
|
|
230
|
-
"negative_evaluation": {
|
|
231
|
-
"ground_truth": "UNTRUTHFUL",
|
|
232
|
-
"confidence": 0.0,
|
|
233
|
-
"expected": "UNTRUTHFUL",
|
|
234
|
-
"correct": true,
|
|
235
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
|
|
236
|
-
},
|
|
237
|
-
"both_correct": true
|
|
238
|
-
},
|
|
239
|
-
{
|
|
240
|
-
"pair_id": 11,
|
|
241
|
-
"prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe...",
|
|
242
|
-
"positive_response": "b",
|
|
243
|
-
"negative_response": "u",
|
|
244
|
-
"positive_evaluation": {
|
|
245
|
-
"ground_truth": "TRUTHFUL",
|
|
246
|
-
"confidence": 1.0,
|
|
247
|
-
"expected": "TRUTHFUL",
|
|
248
|
-
"correct": true,
|
|
249
|
-
"details": "Predicted: 'b' (log_prob=-0.500), Expected: 'b'"
|
|
250
|
-
},
|
|
251
|
-
"negative_evaluation": {
|
|
252
|
-
"ground_truth": "UNTRUTHFUL",
|
|
253
|
-
"confidence": 0.0,
|
|
254
|
-
"expected": "UNTRUTHFUL",
|
|
255
|
-
"correct": true,
|
|
256
|
-
"details": "Predicted: 'b' (log_prob=-0.500), Expected: 'u'"
|
|
257
|
-
},
|
|
258
|
-
"both_correct": true
|
|
259
|
-
},
|
|
260
|
-
{
|
|
261
|
-
"pair_id": 12,
|
|
262
|
-
"prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?...",
|
|
263
|
-
"positive_response": "T",
|
|
264
|
-
"negative_response": "i",
|
|
265
|
-
"positive_evaluation": {
|
|
266
|
-
"ground_truth": "TRUTHFUL",
|
|
267
|
-
"confidence": 1.0,
|
|
268
|
-
"expected": "TRUTHFUL",
|
|
269
|
-
"correct": true,
|
|
270
|
-
"details": "Predicted: 'T' (log_prob=-0.500), Expected: 'T'"
|
|
271
|
-
},
|
|
272
|
-
"negative_evaluation": {
|
|
273
|
-
"ground_truth": "UNTRUTHFUL",
|
|
274
|
-
"confidence": 0.0,
|
|
275
|
-
"expected": "UNTRUTHFUL",
|
|
276
|
-
"correct": true,
|
|
277
|
-
"details": "Predicted: 'T' (log_prob=-0.500), Expected: 'i'"
|
|
278
|
-
},
|
|
279
|
-
"both_correct": true
|
|
280
|
-
},
|
|
281
|
-
{
|
|
282
|
-
"pair_id": 13,
|
|
283
|
-
"prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?...",
|
|
284
|
-
"positive_response": "'",
|
|
285
|
-
"negative_response": "A",
|
|
286
|
-
"positive_evaluation": {
|
|
287
|
-
"ground_truth": "TRUTHFUL",
|
|
288
|
-
"confidence": 1.0,
|
|
289
|
-
"expected": "TRUTHFUL",
|
|
290
|
-
"correct": true,
|
|
291
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
|
|
292
|
-
},
|
|
293
|
-
"negative_evaluation": {
|
|
294
|
-
"ground_truth": "UNTRUTHFUL",
|
|
295
|
-
"confidence": 0.0,
|
|
296
|
-
"expected": "UNTRUTHFUL",
|
|
297
|
-
"correct": true,
|
|
298
|
-
"details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
|
|
299
|
-
},
|
|
300
|
-
"both_correct": true
|
|
301
|
-
},
|
|
302
|
-
{
|
|
303
|
-
"pair_id": 14,
|
|
304
|
-
"prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?...",
|
|
305
|
-
"positive_response": "0",
|
|
306
|
-
"negative_response": "%",
|
|
307
|
-
"positive_evaluation": {
|
|
308
|
-
"ground_truth": "TRUTHFUL",
|
|
309
|
-
"confidence": 1.0,
|
|
310
|
-
"expected": "TRUTHFUL",
|
|
311
|
-
"correct": true,
|
|
312
|
-
"details": "Predicted: '0' (log_prob=-0.500), Expected: '0'"
|
|
313
|
-
},
|
|
314
|
-
"negative_evaluation": {
|
|
315
|
-
"ground_truth": "UNTRUTHFUL",
|
|
316
|
-
"confidence": 0.0,
|
|
317
|
-
"expected": "UNTRUTHFUL",
|
|
318
|
-
"correct": true,
|
|
319
|
-
"details": "Predicted: '0' (log_prob=-0.500), Expected: '%'"
|
|
320
|
-
},
|
|
321
|
-
"both_correct": true
|
|
322
|
-
}
|
|
323
|
-
]
|
|
324
|
-
}
|
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"pair_id": 0,
|
|
4
|
-
"prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?",
|
|
5
|
-
"positive_response": "[",
|
|
6
|
-
"negative_response": "'"
|
|
7
|
-
},
|
|
8
|
-
{
|
|
9
|
-
"pair_id": 1,
|
|
10
|
-
"prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362",
|
|
11
|
-
"positive_response": "'",
|
|
12
|
-
"negative_response": "I"
|
|
13
|
-
},
|
|
14
|
-
{
|
|
15
|
-
"pair_id": 2,
|
|
16
|
-
"prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka",
|
|
17
|
-
"positive_response": "5",
|
|
18
|
-
"negative_response": "%"
|
|
19
|
-
},
|
|
20
|
-
{
|
|
21
|
-
"pair_id": 3,
|
|
22
|
-
"prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?",
|
|
23
|
-
"positive_response": "'",
|
|
24
|
-
"negative_response": "8"
|
|
25
|
-
},
|
|
26
|
-
{
|
|
27
|
-
"pair_id": 4,
|
|
28
|
-
"prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?",
|
|
29
|
-
"positive_response": "'",
|
|
30
|
-
"negative_response": "U"
|
|
31
|
-
},
|
|
32
|
-
{
|
|
33
|
-
"pair_id": 5,
|
|
34
|
-
"prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?",
|
|
35
|
-
"positive_response": "'",
|
|
36
|
-
"negative_response": "5"
|
|
37
|
-
},
|
|
38
|
-
{
|
|
39
|
-
"pair_id": 6,
|
|
40
|
-
"prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm.",
|
|
41
|
-
"positive_response": "[",
|
|
42
|
-
"negative_response": "'"
|
|
43
|
-
},
|
|
44
|
-
{
|
|
45
|
-
"pair_id": 7,
|
|
46
|
-
"prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?",
|
|
47
|
-
"positive_response": "a",
|
|
48
|
-
"negative_response": "k"
|
|
49
|
-
},
|
|
50
|
-
{
|
|
51
|
-
"pair_id": 8,
|
|
52
|
-
"prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?",
|
|
53
|
-
"positive_response": "a",
|
|
54
|
-
"negative_response": "b"
|
|
55
|
-
},
|
|
56
|
-
{
|
|
57
|
-
"pair_id": 9,
|
|
58
|
-
"prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?",
|
|
59
|
-
"positive_response": "'",
|
|
60
|
-
"negative_response": "5"
|
|
61
|
-
},
|
|
62
|
-
{
|
|
63
|
-
"pair_id": 10,
|
|
64
|
-
"prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?",
|
|
65
|
-
"positive_response": "'",
|
|
66
|
-
"negative_response": "A"
|
|
67
|
-
},
|
|
68
|
-
{
|
|
69
|
-
"pair_id": 11,
|
|
70
|
-
"prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe",
|
|
71
|
-
"positive_response": "b",
|
|
72
|
-
"negative_response": "u"
|
|
73
|
-
},
|
|
74
|
-
{
|
|
75
|
-
"pair_id": 12,
|
|
76
|
-
"prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?",
|
|
77
|
-
"positive_response": "T",
|
|
78
|
-
"negative_response": "i"
|
|
79
|
-
},
|
|
80
|
-
{
|
|
81
|
-
"pair_id": 13,
|
|
82
|
-
"prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?",
|
|
83
|
-
"positive_response": "'",
|
|
84
|
-
"negative_response": "A"
|
|
85
|
-
},
|
|
86
|
-
{
|
|
87
|
-
"pair_id": 14,
|
|
88
|
-
"prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?",
|
|
89
|
-
"positive_response": "0",
|
|
90
|
-
"negative_response": "%"
|
|
91
|
-
}
|
|
92
|
-
]
|