wisent 0.7.901__py3-none-any.whl → 0.7.1116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/comparison/__init__.py +1 -0
  3. wisent/comparison/detect_bos_features.py +275 -0
  4. wisent/comparison/fgaa.py +465 -0
  5. wisent/comparison/lora.py +663 -0
  6. wisent/comparison/lora_dpo.py +604 -0
  7. wisent/comparison/main.py +444 -0
  8. wisent/comparison/ours.py +76 -0
  9. wisent/comparison/reft.py +690 -0
  10. wisent/comparison/sae.py +304 -0
  11. wisent/comparison/utils.py +381 -0
  12. wisent/core/activations/activations_collector.py +3 -2
  13. wisent/core/activations/extraction_strategy.py +8 -4
  14. wisent/core/cli/agent/apply_steering.py +7 -5
  15. wisent/core/cli/agent/train_classifier.py +4 -3
  16. wisent/core/cli/generate_vector_from_task.py +11 -20
  17. wisent/core/cli/get_activations.py +1 -1
  18. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +20 -3
  19. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +8 -1
  20. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +8 -1
  21. wisent/core/parser_arguments/generate_vector_from_task_parser.py +4 -11
  22. wisent/core/parser_arguments/get_activations_parser.py +5 -14
  23. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/METADATA +5 -1
  24. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/RECORD +28 -91
  25. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
  26. wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
  27. wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
  28. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
  29. wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
  30. wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
  31. wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
  32. wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
  33. wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
  34. wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
  35. wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
  36. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
  37. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
  38. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
  39. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
  40. wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
  41. wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
  42. wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
  43. wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
  44. wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
  45. wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
  46. wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
  47. wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
  48. wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
  49. wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
  50. wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
  51. wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
  52. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
  53. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
  54. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
  55. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
  56. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
  57. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
  58. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
  59. wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
  60. wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
  61. wisent/examples/scripts/1/test_cola_pairs.json +0 -8
  62. wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
  63. wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
  64. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
  65. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
  66. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
  67. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
  68. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
  69. wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
  70. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
  71. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
  72. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
  73. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
  74. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  75. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
  76. wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
  77. wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
  78. wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
  79. wisent/examples/scripts/2/test_atis_pairs.json +0 -8
  80. wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
  81. wisent/examples/scripts/2/test_babi_pairs.json +0 -8
  82. wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
  83. wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
  84. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
  85. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
  86. wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
  87. wisent/examples/scripts/generate_paper_data.py +0 -384
  88. wisent/examples/scripts/intervention_validation.py +0 -626
  89. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +0 -324
  90. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +0 -92
  91. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +0 -324
  92. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +0 -92
  93. wisent/examples/scripts/results/test_afrimgsm_pairs.json +0 -92
  94. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +0 -324
  95. wisent/examples/scripts/results/test_afrimmlu_pairs.json +0 -92
  96. wisent/examples/scripts/threshold_analysis.py +0 -434
  97. wisent/examples/scripts/visualization_gallery.py +0 -582
  98. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/WHEEL +0 -0
  99. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/entry_points.txt +0 -0
  100. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/licenses/LICENSE +0 -0
  101. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/top_level.txt +0 -0
@@ -1,324 +0,0 @@
1
- {
2
- "task_name": "afrimmlu",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 15,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?...",
11
- "positive_response": "[",
12
- "negative_response": "'",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362...",
32
- "positive_response": "'",
33
- "negative_response": "I",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: ''' (log_prob=-0.500), Expected: 'I'"
47
- },
48
- "both_correct": true
49
- },
50
- {
51
- "pair_id": 2,
52
- "prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka...",
53
- "positive_response": "5",
54
- "negative_response": "%",
55
- "positive_evaluation": {
56
- "ground_truth": "TRUTHFUL",
57
- "confidence": 1.0,
58
- "expected": "TRUTHFUL",
59
- "correct": true,
60
- "details": "Predicted: '5' (log_prob=-0.500), Expected: '5'"
61
- },
62
- "negative_evaluation": {
63
- "ground_truth": "UNTRUTHFUL",
64
- "confidence": 0.0,
65
- "expected": "UNTRUTHFUL",
66
- "correct": true,
67
- "details": "Predicted: '5' (log_prob=-0.500), Expected: '%'"
68
- },
69
- "both_correct": true
70
- },
71
- {
72
- "pair_id": 3,
73
- "prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?...",
74
- "positive_response": "'",
75
- "negative_response": "8",
76
- "positive_evaluation": {
77
- "ground_truth": "TRUTHFUL",
78
- "confidence": 1.0,
79
- "expected": "TRUTHFUL",
80
- "correct": true,
81
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
82
- },
83
- "negative_evaluation": {
84
- "ground_truth": "UNTRUTHFUL",
85
- "confidence": 0.0,
86
- "expected": "UNTRUTHFUL",
87
- "correct": true,
88
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '8'"
89
- },
90
- "both_correct": true
91
- },
92
- {
93
- "pair_id": 4,
94
- "prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?...",
95
- "positive_response": "'",
96
- "negative_response": "U",
97
- "positive_evaluation": {
98
- "ground_truth": "TRUTHFUL",
99
- "confidence": 1.0,
100
- "expected": "TRUTHFUL",
101
- "correct": true,
102
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
103
- },
104
- "negative_evaluation": {
105
- "ground_truth": "UNTRUTHFUL",
106
- "confidence": 0.0,
107
- "expected": "UNTRUTHFUL",
108
- "correct": true,
109
- "details": "Predicted: ''' (log_prob=-0.500), Expected: 'U'"
110
- },
111
- "both_correct": true
112
- },
113
- {
114
- "pair_id": 5,
115
- "prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?...",
116
- "positive_response": "'",
117
- "negative_response": "5",
118
- "positive_evaluation": {
119
- "ground_truth": "TRUTHFUL",
120
- "confidence": 1.0,
121
- "expected": "TRUTHFUL",
122
- "correct": true,
123
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
124
- },
125
- "negative_evaluation": {
126
- "ground_truth": "UNTRUTHFUL",
127
- "confidence": 0.0,
128
- "expected": "UNTRUTHFUL",
129
- "correct": true,
130
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
131
- },
132
- "both_correct": true
133
- },
134
- {
135
- "pair_id": 6,
136
- "prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm....",
137
- "positive_response": "[",
138
- "negative_response": "'",
139
- "positive_evaluation": {
140
- "ground_truth": "TRUTHFUL",
141
- "confidence": 1.0,
142
- "expected": "TRUTHFUL",
143
- "correct": true,
144
- "details": "Predicted: '[' (log_prob=-0.500), Expected: '['"
145
- },
146
- "negative_evaluation": {
147
- "ground_truth": "UNTRUTHFUL",
148
- "confidence": 0.0,
149
- "expected": "UNTRUTHFUL",
150
- "correct": true,
151
- "details": "Predicted: '[' (log_prob=-0.500), Expected: '''"
152
- },
153
- "both_correct": true
154
- },
155
- {
156
- "pair_id": 7,
157
- "prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?...",
158
- "positive_response": "a",
159
- "negative_response": "k",
160
- "positive_evaluation": {
161
- "ground_truth": "TRUTHFUL",
162
- "confidence": 1.0,
163
- "expected": "TRUTHFUL",
164
- "correct": true,
165
- "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
166
- },
167
- "negative_evaluation": {
168
- "ground_truth": "UNTRUTHFUL",
169
- "confidence": 0.0,
170
- "expected": "UNTRUTHFUL",
171
- "correct": true,
172
- "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'k'"
173
- },
174
- "both_correct": true
175
- },
176
- {
177
- "pair_id": 8,
178
- "prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?...",
179
- "positive_response": "a",
180
- "negative_response": "b",
181
- "positive_evaluation": {
182
- "ground_truth": "TRUTHFUL",
183
- "confidence": 1.0,
184
- "expected": "TRUTHFUL",
185
- "correct": true,
186
- "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'a'"
187
- },
188
- "negative_evaluation": {
189
- "ground_truth": "UNTRUTHFUL",
190
- "confidence": 0.0,
191
- "expected": "UNTRUTHFUL",
192
- "correct": true,
193
- "details": "Predicted: 'a' (log_prob=-0.500), Expected: 'b'"
194
- },
195
- "both_correct": true
196
- },
197
- {
198
- "pair_id": 9,
199
- "prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?...",
200
- "positive_response": "'",
201
- "negative_response": "5",
202
- "positive_evaluation": {
203
- "ground_truth": "TRUTHFUL",
204
- "confidence": 1.0,
205
- "expected": "TRUTHFUL",
206
- "correct": true,
207
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
208
- },
209
- "negative_evaluation": {
210
- "ground_truth": "UNTRUTHFUL",
211
- "confidence": 0.0,
212
- "expected": "UNTRUTHFUL",
213
- "correct": true,
214
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '5'"
215
- },
216
- "both_correct": true
217
- },
218
- {
219
- "pair_id": 10,
220
- "prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?...",
221
- "positive_response": "'",
222
- "negative_response": "A",
223
- "positive_evaluation": {
224
- "ground_truth": "TRUTHFUL",
225
- "confidence": 1.0,
226
- "expected": "TRUTHFUL",
227
- "correct": true,
228
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
229
- },
230
- "negative_evaluation": {
231
- "ground_truth": "UNTRUTHFUL",
232
- "confidence": 0.0,
233
- "expected": "UNTRUTHFUL",
234
- "correct": true,
235
- "details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
236
- },
237
- "both_correct": true
238
- },
239
- {
240
- "pair_id": 11,
241
- "prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe...",
242
- "positive_response": "b",
243
- "negative_response": "u",
244
- "positive_evaluation": {
245
- "ground_truth": "TRUTHFUL",
246
- "confidence": 1.0,
247
- "expected": "TRUTHFUL",
248
- "correct": true,
249
- "details": "Predicted: 'b' (log_prob=-0.500), Expected: 'b'"
250
- },
251
- "negative_evaluation": {
252
- "ground_truth": "UNTRUTHFUL",
253
- "confidence": 0.0,
254
- "expected": "UNTRUTHFUL",
255
- "correct": true,
256
- "details": "Predicted: 'b' (log_prob=-0.500), Expected: 'u'"
257
- },
258
- "both_correct": true
259
- },
260
- {
261
- "pair_id": 12,
262
- "prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?...",
263
- "positive_response": "T",
264
- "negative_response": "i",
265
- "positive_evaluation": {
266
- "ground_truth": "TRUTHFUL",
267
- "confidence": 1.0,
268
- "expected": "TRUTHFUL",
269
- "correct": true,
270
- "details": "Predicted: 'T' (log_prob=-0.500), Expected: 'T'"
271
- },
272
- "negative_evaluation": {
273
- "ground_truth": "UNTRUTHFUL",
274
- "confidence": 0.0,
275
- "expected": "UNTRUTHFUL",
276
- "correct": true,
277
- "details": "Predicted: 'T' (log_prob=-0.500), Expected: 'i'"
278
- },
279
- "both_correct": true
280
- },
281
- {
282
- "pair_id": 13,
283
- "prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?...",
284
- "positive_response": "'",
285
- "negative_response": "A",
286
- "positive_evaluation": {
287
- "ground_truth": "TRUTHFUL",
288
- "confidence": 1.0,
289
- "expected": "TRUTHFUL",
290
- "correct": true,
291
- "details": "Predicted: ''' (log_prob=-0.500), Expected: '''"
292
- },
293
- "negative_evaluation": {
294
- "ground_truth": "UNTRUTHFUL",
295
- "confidence": 0.0,
296
- "expected": "UNTRUTHFUL",
297
- "correct": true,
298
- "details": "Predicted: ''' (log_prob=-0.500), Expected: 'A'"
299
- },
300
- "both_correct": true
301
- },
302
- {
303
- "pair_id": 14,
304
- "prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?...",
305
- "positive_response": "0",
306
- "negative_response": "%",
307
- "positive_evaluation": {
308
- "ground_truth": "TRUTHFUL",
309
- "confidence": 1.0,
310
- "expected": "TRUTHFUL",
311
- "correct": true,
312
- "details": "Predicted: '0' (log_prob=-0.500), Expected: '0'"
313
- },
314
- "negative_evaluation": {
315
- "ground_truth": "UNTRUTHFUL",
316
- "confidence": 0.0,
317
- "expected": "UNTRUTHFUL",
318
- "correct": true,
319
- "details": "Predicted: '0' (log_prob=-0.500), Expected: '%'"
320
- },
321
- "both_correct": true
322
- }
323
- ]
324
- }
@@ -1,92 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Ked\u1ee5 \u1ee5d\u1ecb emume Steeti a ch\u1ecdr\u1ecd?",
5
- "positive_response": "[",
6
- "negative_response": "'"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "\u12a8\u121a\u12a8\u1270\u1209\u1275 \u12cd\u1235\u1325 \u12e8\u12cb\u130b (P) \u1260\u134d\u1341\u121d \u1349\u12ad\u12ad\u122d \u12cd\u1235\u1325 \u1208\u121d\u1295 \u12a8\u1205\u12f3\u130d \u1308\u1262 (MR) \u130b\u122d \u12a5\u1295\u12f0\u121a\u1270\u12ab\u12a8\u120d\u1363 \u12a5\u1293 \u12cb\u130b (P) \u1260\u1265\u1278\u129d\u1290\u1275 \u12c8\u12ed\u121d \u134d\u133d\u121d\u1293 \u1260\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a8\u12a0\u1290\u1235\u1270\u129b \u1308\u1262 \u1208\u121d\u1295 \u12a5\u1295\u12f0\u121a\u1260\u120d\u1325 \u1260\u1275\u12ad\u12ad\u120d \u12e8\u121a\u12eb\u1235\u1228\u12f3\u12cd \u12e8\u1275\u129b\u12cd \u1290\u12cd? I. MR = P = \u1208\u134d\u1339\u121d \u12cd\u12f5\u12f5\u122d \u1260\u12a0\u130d\u12f5\u121d \u1270\u130d\u1263\u122d \u120b\u12ed \u134d\u120b\u130e\u1275. II. P > MR \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u1349\u12ad\u12ad\u122d \u12a5\u1295\u12f0 \u1241\u120d\u1241\u120d \u1270\u1295\u1238\u122b\u1273\u127d \u1270\u130d\u1263\u122b\u1275\u1362 III. \u134d\u120b\u130e\u1275 \u12a5\u1293 \u12cb\u130b \u134d\u133d\u121d\u1293 \u12e8\u130e\u12f0\u1208\u12cd \u12cd\u12f5\u12f5\u122d \u12a5\u1295\u12f0 \u124b\u121a \u1270\u130d\u1263\u122d \u1290\u12cd \u12e8\u121a\u12c8\u12a8\u1209\u1275\u1362",
11
- "positive_response": "'",
12
- "negative_response": "I"
13
- },
14
- {
15
- "pair_id": 2,
16
- "prompt": "Na 1997, \u1ee4l\u1ecd ak\u1ee5 \u1ee5wa ch\u1ecdp\u1ee5tara na fraksh\u1ecdn nke nd\u1ecb b\u1ecb n'\u1ee5wa niile bi na mba nd\u1ecb na-akpata obere ego--ya b\u1ee5 mba kwa onye \u1ecdb\u1ee5la n'eri \u1ee5gw\u1ecd \u1ecdnwa nke $1230 ma\u1ecdb\u1ee5 n'erughi --d\u1ecb ihe d\u1ecb ka",
17
- "positive_response": "5",
18
- "negative_response": "%"
19
- },
20
- {
21
- "pair_id": 3,
22
- "prompt": "Mu gihe cyacyo cy'agahebuzo, ni irihe tandukaniro riri hagati y'abantu bo mu Majyepfo n'abo mu bindi bice bya Leta Zunze Ubumwe z'Amerika bemera ko amashuri atandukana n'ayandi?",
23
- "positive_response": "'",
24
- "negative_response": "8"
25
- },
26
- {
27
- "pair_id": 4,
28
- "prompt": "Ni iyihe mibare yabaturage itubwira igipimo cyuburumbuke aho umubare wabana bazavuka uzaba uhagije kugira ngo usimbure ababyeyi no kwishyura impfu zabana bapfa bakiri bato?",
29
- "positive_response": "'",
30
- "negative_response": "U"
31
- },
32
- {
33
- "pair_id": 5,
34
- "prompt": "A 2016, kusan mutane nawa a Amurka ba su da matsuguni?",
35
- "positive_response": "'",
36
- "negative_response": "5"
37
- },
38
- {
39
- "pair_id": 6,
40
- "prompt": "Pour quel nombre l'\u00e9quation ci-apr\u00e8s est-elle vraie? : 26,1 mm = _ dm.",
41
- "positive_response": "[",
42
- "negative_response": "'"
43
- },
44
- {
45
- "pair_id": 7,
46
- "prompt": "Je, kuna vikwazo vyovyote vya kuingizwa kwa uhalifu wa kimila chini ya sheria ya Kiingereza?",
47
- "positive_response": "a",
48
- "negative_response": "k"
49
- },
50
- {
51
- "pair_id": 8,
52
- "prompt": "Akwai wasu iyakoki dan shigar laifukan al'ada a \u0199ar\u0199ashin dokar Ingilishi?",
53
- "positive_response": "a",
54
- "negative_response": "b"
55
- },
56
- {
57
- "pair_id": 9,
58
- "prompt": "Kufikia 2019, ni takriban asilimia ngapi ya watu duniani wanaishi katika umaskini uliokithiri (chini ya $1.90 kwa siku)?",
59
- "positive_response": "'",
60
- "negative_response": "5"
61
- },
62
- {
63
- "pair_id": 10,
64
- "prompt": "Which treaties are considered as 'source of international law' under article 38 ICJ Statute?",
65
- "positive_response": "'",
66
- "negative_response": "A"
67
- },
68
- {
69
- "pair_id": 11,
70
- "prompt": "Umupaka w'ibishoboka byo gukora uzagaragara nk'umurongo ugororotse igihe",
71
- "positive_response": "b",
72
- "negative_response": "u"
73
- },
74
- {
75
- "pair_id": 12,
76
- "prompt": "T\u00ed \u00ecd\u00e1s\u00ed l\u00e1ti \u00ecta t\u00ed \u00f2 d\u00e1ra b\u00e1 darap\u1ecd\u0300 m\u1ecd\u0301 \u00ecp\u00e8s\u00e8 ohun \u00e8l\u00f2 al\u00e1d\u00e0ni, \u00e8wo n\u00edn\u00fa \u00e0w\u1ecdn w\u1ecd\u0300ny\u00ec\u00ed ni \u00ecg\u00e9s\u1eb9\u0300 l\u00e1ti \u1ecd\u0300d\u1ecd\u0300 \u00ecj\u1ecdba t\u00ed \u00f3 \u1e63e\u00e9\u1e63e k\u00f3 gb\u00e9 \u1ecdj\u00e0 n\u00e1\u00e0 l\u1ecd s\u00edbi \u00e0b\u00e1j\u00e1de t\u00f3 d\u00e1ra?",
77
- "positive_response": "T",
78
- "negative_response": "i"
79
- },
80
- {
81
- "pair_id": 13,
82
- "prompt": "What was GDP per capita in the United States in 1850 when adjusting for inflation and PPP in 2011 prices?",
83
- "positive_response": "'",
84
- "negative_response": "A"
85
- },
86
- {
87
- "pair_id": 14,
88
- "prompt": "Ni uwuhe mubare w'abana bafite hagati y'imyaka 13 na 15 muri Misiri bavuze ko bahohotewe nibura rimwe mu mezi abiri ashize, guhera mu mwaka wa 2015?",
89
- "positive_response": "0",
90
- "negative_response": "%"
91
- }
92
- ]