wisent 0.7.901__py3-none-any.whl → 0.7.1116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/comparison/__init__.py +1 -0
  3. wisent/comparison/detect_bos_features.py +275 -0
  4. wisent/comparison/fgaa.py +465 -0
  5. wisent/comparison/lora.py +663 -0
  6. wisent/comparison/lora_dpo.py +604 -0
  7. wisent/comparison/main.py +444 -0
  8. wisent/comparison/ours.py +76 -0
  9. wisent/comparison/reft.py +690 -0
  10. wisent/comparison/sae.py +304 -0
  11. wisent/comparison/utils.py +381 -0
  12. wisent/core/activations/activations_collector.py +3 -2
  13. wisent/core/activations/extraction_strategy.py +8 -4
  14. wisent/core/cli/agent/apply_steering.py +7 -5
  15. wisent/core/cli/agent/train_classifier.py +4 -3
  16. wisent/core/cli/generate_vector_from_task.py +11 -20
  17. wisent/core/cli/get_activations.py +1 -1
  18. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +20 -3
  19. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +8 -1
  20. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +8 -1
  21. wisent/core/parser_arguments/generate_vector_from_task_parser.py +4 -11
  22. wisent/core/parser_arguments/get_activations_parser.py +5 -14
  23. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/METADATA +5 -1
  24. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/RECORD +28 -91
  25. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +0 -2112
  26. wisent/examples/scripts/1/test_basqueglue_evaluation.json +0 -51
  27. wisent/examples/scripts/1/test_basqueglue_pairs.json +0 -14
  28. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +0 -51
  29. wisent/examples/scripts/1/test_bec2016eu_pairs.json +0 -14
  30. wisent/examples/scripts/1/test_belebele_evaluation.json +0 -51
  31. wisent/examples/scripts/1/test_belebele_pairs.json +0 -14
  32. wisent/examples/scripts/1/test_benchmarks_evaluation.json +0 -51
  33. wisent/examples/scripts/1/test_benchmarks_pairs.json +0 -14
  34. wisent/examples/scripts/1/test_bertaqa_evaluation.json +0 -51
  35. wisent/examples/scripts/1/test_bertaqa_pairs.json +0 -14
  36. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +0 -30
  37. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +0 -8
  38. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +0 -30
  39. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +0 -8
  40. wisent/examples/scripts/1/test_cabreu_evaluation.json +0 -30
  41. wisent/examples/scripts/1/test_cabreu_pairs.json +0 -8
  42. wisent/examples/scripts/1/test_careqa_en_evaluation.json +0 -30
  43. wisent/examples/scripts/1/test_careqa_en_pairs.json +0 -8
  44. wisent/examples/scripts/1/test_careqa_evaluation.json +0 -30
  45. wisent/examples/scripts/1/test_careqa_pairs.json +0 -8
  46. wisent/examples/scripts/1/test_catalanqa_evaluation.json +0 -30
  47. wisent/examples/scripts/1/test_catalanqa_pairs.json +0 -8
  48. wisent/examples/scripts/1/test_catcola_evaluation.json +0 -30
  49. wisent/examples/scripts/1/test_catcola_pairs.json +0 -8
  50. wisent/examples/scripts/1/test_chartqa_evaluation.json +0 -30
  51. wisent/examples/scripts/1/test_chartqa_pairs.json +0 -8
  52. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +0 -30
  53. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +0 -8
  54. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +0 -30
  55. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +0 -8
  56. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +0 -30
  57. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +0 -8
  58. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +0 -30
  59. wisent/examples/scripts/1/test_coedit_gec_pairs.json +0 -8
  60. wisent/examples/scripts/1/test_cola_evaluation.json +0 -30
  61. wisent/examples/scripts/1/test_cola_pairs.json +0 -8
  62. wisent/examples/scripts/1/test_coqcat_evaluation.json +0 -30
  63. wisent/examples/scripts/1/test_coqcat_pairs.json +0 -8
  64. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +0 -30
  65. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +0 -8
  66. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +0 -30
  67. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +0 -8
  68. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +0 -30
  69. wisent/examples/scripts/1/test_ethos_binary_pairs.json +0 -8
  70. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +0 -30
  71. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +0 -8
  72. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +0 -30
  73. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +0 -8
  74. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  75. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +0 -8
  76. wisent/examples/scripts/2/test_arc_ar_evaluation.json +0 -30
  77. wisent/examples/scripts/2/test_arc_ar_pairs.json +0 -8
  78. wisent/examples/scripts/2/test_atis_evaluation.json +0 -30
  79. wisent/examples/scripts/2/test_atis_pairs.json +0 -8
  80. wisent/examples/scripts/2/test_babi_evaluation.json +0 -30
  81. wisent/examples/scripts/2/test_babi_pairs.json +0 -8
  82. wisent/examples/scripts/2/test_babilong_evaluation.json +0 -30
  83. wisent/examples/scripts/2/test_babilong_pairs.json +0 -8
  84. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +0 -30
  85. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +0 -8
  86. wisent/examples/scripts/2/test_basque-glue_pairs.json +0 -14
  87. wisent/examples/scripts/generate_paper_data.py +0 -384
  88. wisent/examples/scripts/intervention_validation.py +0 -626
  89. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +0 -324
  90. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +0 -92
  91. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +0 -324
  92. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +0 -92
  93. wisent/examples/scripts/results/test_afrimgsm_pairs.json +0 -92
  94. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +0 -324
  95. wisent/examples/scripts/results/test_afrimmlu_pairs.json +0 -92
  96. wisent/examples/scripts/threshold_analysis.py +0 -434
  97. wisent/examples/scripts/visualization_gallery.py +0 -582
  98. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/WHEEL +0 -0
  99. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/entry_points.txt +0 -0
  100. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/licenses/LICENSE +0 -0
  101. {wisent-0.7.901.dist-info → wisent-0.7.1116.dist-info}/top_level.txt +0 -0
@@ -1,324 +0,0 @@
1
- {
2
- "task_name": "AraDiCE_ArabicMMLU_lev",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 15,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "\u062d\u0643\u0645 \u0645\u0633\u062d \u0627\u0644\u0631\u0643\u0646 \u0627\u0644\u064a\u0645\u0627\u0646\u064a:...",
11
- "positive_response": "\u0628",
12
- "negative_response": "\u062c",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u0628'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u062c'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "\u0628\u062a\u062d\u0648\u0644 \u0627\u0644\u0625\u0634\u0627\u0631\u0627\u062a \u0627\u0644\u0643\u0647\u0631\u0628\u0627\u0626\u064a\u0629 \u0644\u0645\u0648\u062c\u0627\u062a \u0635\u0648\u062a\u064a\u0629:...",
32
- "positive_response": "\u062f",
33
- "negative_response": "\u0623",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: '\u062f' (log_prob=-0.500), Expected: '\u062f'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: '\u062f' (log_prob=-0.500), Expected: '\u0623'"
47
- },
48
- "both_correct": true
49
- },
50
- {
51
- "pair_id": 2,
52
- "prompt": "\u0648\u062d\u062f\u0629 \u0645\u0646 \u0627\u0644\u0634\u063a\u0644\u0627\u062a \u0627\u0644\u062c\u0627\u064a\u0629 \u062a\u0639\u062f \u0645\u0646 \u0622\u062f\u0627\u0628 \u0627\u0644\u062d\u0643\u064a \u0645\u064a\u0632\u0647...",
53
- "positive_response": "\u062c",
54
- "negative_response": "\u062f",
55
- "positive_evaluation": {
56
- "ground_truth": "TRUTHFUL",
57
- "confidence": 1.0,
58
- "expected": "TRUTHFUL",
59
- "correct": true,
60
- "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062c'"
61
- },
62
- "negative_evaluation": {
63
- "ground_truth": "UNTRUTHFUL",
64
- "confidence": 0.0,
65
- "expected": "UNTRUTHFUL",
66
- "correct": true,
67
- "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062f'"
68
- },
69
- "both_correct": true
70
- },
71
- {
72
- "pair_id": 3,
73
- "prompt": "\u0627\u0644\u0642\u0637\u0639\u0629 \u0627\u0644\u0625\u0644\u0643\u062a\u0631\u0648\u0646\u064a\u0629 \u0627\u0644\u0644\u064a \u0627\u0633\u062a\u0639\u0645\u0644\u0648\u0647\u0627 \u0628\u0627\u0644\u0645\u0631\u062d\u0644\u0629 \u0627\u0644\u0623\u0648\u0644\u0649 \u0644\u062a\u0637\u0648\u0631 \u0627\u0644\u0643\u0645\u0628\u064a\u0648\u062a\u0631 \u0647\u064a\u064a \u0627\u0644\u062f\u0627\u0631\u0627\u062a \u0627\u0644\u0645\u062a\u0643\u0627\u0645\u0644\u0629....",
74
- "positive_response": "\u0628",
75
- "negative_response": "\u062c",
76
- "positive_evaluation": {
77
- "ground_truth": "TRUTHFUL",
78
- "confidence": 1.0,
79
- "expected": "TRUTHFUL",
80
- "correct": true,
81
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u0628'"
82
- },
83
- "negative_evaluation": {
84
- "ground_truth": "UNTRUTHFUL",
85
- "confidence": 0.0,
86
- "expected": "UNTRUTHFUL",
87
- "correct": true,
88
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u062c'"
89
- },
90
- "both_correct": true
91
- },
92
- {
93
- "pair_id": 4,
94
- "prompt": "\"\u062d\u0627\u0641\u0638 \u0639\u0644\u0649 ......... \u0644\u0648\u0637\u0646\u0643\". \u062d\u062f\u062f \u0645\u0646 \u0647\u0648\u0646 \u0627\u0644\u062a\u0643\u0645\u0644\u0629 \u0627\u0644\u0635\u062d \u0644\u0644\u0641\u0631\u0627\u063a \u0628\u0627\u0644\u062c\u0645\u0644\u0629 \u0627\u0644\u0633\u0627\u0628\u0642\u0629:...",
95
- "positive_response": "\u0623",
96
- "negative_response": "\u0628",
97
- "positive_evaluation": {
98
- "ground_truth": "TRUTHFUL",
99
- "confidence": 1.0,
100
- "expected": "TRUTHFUL",
101
- "correct": true,
102
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
103
- },
104
- "negative_evaluation": {
105
- "ground_truth": "UNTRUTHFUL",
106
- "confidence": 0.0,
107
- "expected": "UNTRUTHFUL",
108
- "correct": true,
109
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
110
- },
111
- "both_correct": true
112
- },
113
- {
114
- "pair_id": 5,
115
- "prompt": "\u0627\u0633\u062a\u0647\u0627\u0646 \u062e\u0648\u0631\u0634\u064a\u062f \u0628\u0642\u064a\u0645\u0629:...",
116
- "positive_response": "\u0628",
117
- "negative_response": "\u062c",
118
- "positive_evaluation": {
119
- "ground_truth": "TRUTHFUL",
120
- "confidence": 1.0,
121
- "expected": "TRUTHFUL",
122
- "correct": true,
123
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u0628'"
124
- },
125
- "negative_evaluation": {
126
- "ground_truth": "UNTRUTHFUL",
127
- "confidence": 0.0,
128
- "expected": "UNTRUTHFUL",
129
- "correct": true,
130
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u062c'"
131
- },
132
- "both_correct": true
133
- },
134
- {
135
- "pair_id": 6,
136
- "prompt": "\u0627\u0644\u064a\u0648\u0645 \u0627\u0644\u0633\u0627\u0628\u0642 \u0644\u0644\u0623\u062d\u062f:...",
137
- "positive_response": "\u0623",
138
- "negative_response": "\u0628",
139
- "positive_evaluation": {
140
- "ground_truth": "TRUTHFUL",
141
- "confidence": 1.0,
142
- "expected": "TRUTHFUL",
143
- "correct": true,
144
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
145
- },
146
- "negative_evaluation": {
147
- "ground_truth": "UNTRUTHFUL",
148
- "confidence": 0.0,
149
- "expected": "UNTRUTHFUL",
150
- "correct": true,
151
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
152
- },
153
- "both_correct": true
154
- },
155
- {
156
- "pair_id": 7,
157
- "prompt": "\u062a\u0635\u062d\u0631 \u0627\u0644\u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0647\u0627\u0645\u0634\u064a\u0629 \u0648\u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0627\u062f\u064a\u0629 \u0628\u0627\u0644\u0623\u0631\u062f\u0646 \u0628\u0633\u0628\u0628:...",
158
- "positive_response": "\u0628",
159
- "negative_response": "\u062c",
160
- "positive_evaluation": {
161
- "ground_truth": "TRUTHFUL",
162
- "confidence": 1.0,
163
- "expected": "TRUTHFUL",
164
- "correct": true,
165
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u0628'"
166
- },
167
- "negative_evaluation": {
168
- "ground_truth": "UNTRUTHFUL",
169
- "confidence": 0.0,
170
- "expected": "UNTRUTHFUL",
171
- "correct": true,
172
- "details": "Predicted: '\u0628' (log_prob=-0.500), Expected: '\u062c'"
173
- },
174
- "both_correct": true
175
- },
176
- {
177
- "pair_id": 8,
178
- "prompt": "\u0645\u0646 \u0627\u0644\u0623\u0645\u062b\u0644\u0629 \u0639\u0627\u0644\u062a\u063a\u064a\u0631\u0627\u062a \u0627\u0644\u0643\u064a\u0645\u064a\u0627\u0626\u064a\u0629 \u0627\u0644\u0636\u0627\u0631\u0629...",
179
- "positive_response": "\u0623",
180
- "negative_response": "\u0628",
181
- "positive_evaluation": {
182
- "ground_truth": "TRUTHFUL",
183
- "confidence": 1.0,
184
- "expected": "TRUTHFUL",
185
- "correct": true,
186
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
187
- },
188
- "negative_evaluation": {
189
- "ground_truth": "UNTRUTHFUL",
190
- "confidence": 0.0,
191
- "expected": "UNTRUTHFUL",
192
- "correct": true,
193
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
194
- },
195
- "both_correct": true
196
- },
197
- {
198
- "pair_id": 9,
199
- "prompt": "\u0627\u0644\u0633\u0624\u0627\u0644 \u0627\u0644\u0623\u0648\u0644: \u0627\u062e\u062a\u0627\u0631 \u0631\u0642\u0645 (\u0623) \u0625\u0630\u0627 \u0643\u0627\u0646\u062a \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0635\u062d\u064a\u062d\u0629\u060c \u0648\u0631\u0642\u0645 (\u0628) \u0625\u0630\u0627 \u0643\u0627\u0646\u062a \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u063a\u0644\u0637 . \u0627\u0644\u062a\u0648\u0633\u0651\u0639 \u0628\u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u0627\u0644\u062d\u0627\u0633\u0628\u0627\u062a \u0627\u0644\u0625\u0644\u0643\u062a\u0631\u0648\u0646\u064a\u0629 \u0628\u0627\u0644\u0645\u0646\u0634\u0622\u062a \u0623\u062f\u0649 \u0644\u0625\u0636\u0627\u0641\u0629 \u0623\u0647\u0645\u064a\u0629 \u062c\u062f\u064a\u062f\u0629 \u0644\u062f\u0648\u0631 \u0627\u0644\u0645\u0631\u0627\u062c\u0639 \u0627\u0644\u062f\u0627\u062e\u0644\u064a \u0628\u0647\u0627\u0644\u0645\u0646\u0634\u0622\u062a:...",
200
- "positive_response": "\u0623",
201
- "negative_response": "\u0628",
202
- "positive_evaluation": {
203
- "ground_truth": "TRUTHFUL",
204
- "confidence": 1.0,
205
- "expected": "TRUTHFUL",
206
- "correct": true,
207
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
208
- },
209
- "negative_evaluation": {
210
- "ground_truth": "UNTRUTHFUL",
211
- "confidence": 0.0,
212
- "expected": "UNTRUTHFUL",
213
- "correct": true,
214
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
215
- },
216
- "both_correct": true
217
- },
218
- {
219
- "pair_id": 10,
220
- "prompt": "\u0628\u0646\u0637\u0628\u0642 \u0645\u0628\u062f\u0623 \u0627\u0644\u0633\u0628\u0628 \u0627\u0644\u0642\u0631\u064a\u0628 \u0639\u0644\u0649:...",
221
- "positive_response": "\u062f",
222
- "negative_response": "\u0623",
223
- "positive_evaluation": {
224
- "ground_truth": "TRUTHFUL",
225
- "confidence": 1.0,
226
- "expected": "TRUTHFUL",
227
- "correct": true,
228
- "details": "Predicted: '\u062f' (log_prob=-0.500), Expected: '\u062f'"
229
- },
230
- "negative_evaluation": {
231
- "ground_truth": "UNTRUTHFUL",
232
- "confidence": 0.0,
233
- "expected": "UNTRUTHFUL",
234
- "correct": true,
235
- "details": "Predicted: '\u062f' (log_prob=-0.500), Expected: '\u0623'"
236
- },
237
- "both_correct": true
238
- },
239
- {
240
- "pair_id": 11,
241
- "prompt": "\u0633: \u0634\u0648 \u0647\u0648\u064a \u0627\u0644\u062d\u064a\u0648\u0627\u0646 \u064a\u0644\u064a \u0628\u064a\u062a\u0644\u0642\u0628 \u0628\u0623\u0628\u0648 \u0627\u0644\u0623\u062e\u0637\u0644\u061f...",
242
- "positive_response": "\u062c",
243
- "negative_response": "\u0623",
244
- "positive_evaluation": {
245
- "ground_truth": "TRUTHFUL",
246
- "confidence": 1.0,
247
- "expected": "TRUTHFUL",
248
- "correct": true,
249
- "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062c'"
250
- },
251
- "negative_evaluation": {
252
- "ground_truth": "UNTRUTHFUL",
253
- "confidence": 0.0,
254
- "expected": "UNTRUTHFUL",
255
- "correct": true,
256
- "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u0623'"
257
- },
258
- "both_correct": true
259
- },
260
- {
261
- "pair_id": 12,
262
- "prompt": "\u2026\u2026\u2026\u0639\u0645\u0644\u064a\u0629 \u0627\u0644\u062a\u0641\u0643\u064a\u0631 \u0644\u0625\u062e\u062a\u064a\u0627\u0631 \u0623\u062d\u0633\u0646 \u0627\u0644\u0628\u062f\u0627\u0626\u0644 \u0623\u0648 \u0627\u0644\u062d\u0644\u0648\u0644 \u0627\u0644\u0645\u062a\u0648\u0641\u0631\u0629 \u0628\u0645\u0648\u0642\u0641....",
263
- "positive_response": "\u0623",
264
- "negative_response": "\u0628",
265
- "positive_evaluation": {
266
- "ground_truth": "TRUTHFUL",
267
- "confidence": 1.0,
268
- "expected": "TRUTHFUL",
269
- "correct": true,
270
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0623'"
271
- },
272
- "negative_evaluation": {
273
- "ground_truth": "UNTRUTHFUL",
274
- "confidence": 0.0,
275
- "expected": "UNTRUTHFUL",
276
- "correct": true,
277
- "details": "Predicted: '\u0623' (log_prob=-0.500), Expected: '\u0628'"
278
- },
279
- "both_correct": true
280
- },
281
- {
282
- "pair_id": 13,
283
- "prompt": "\u0643\u0644 \u0627\u0644\u062f\u0648\u0644 \u0627\u0644\u062a\u0627\u0644\u064a\u0629 \u064a\u062d\u062f\u062b \u0628\u064a\u0646\u0647\u0627 \u062a\u0639\u0627\u0643\u0633 \u0641\u0635\u0648\u0644 \u0645\u0627 \u0639\u062f\u0627 :...",
284
- "positive_response": "\u062f",
285
- "negative_response": "\u0623",
286
- "positive_evaluation": {
287
- "ground_truth": "TRUTHFUL",
288
- "confidence": 1.0,
289
- "expected": "TRUTHFUL",
290
- "correct": true,
291
- "details": "Predicted: '\u062f' (log_prob=-0.500), Expected: '\u062f'"
292
- },
293
- "negative_evaluation": {
294
- "ground_truth": "UNTRUTHFUL",
295
- "confidence": 0.0,
296
- "expected": "UNTRUTHFUL",
297
- "correct": true,
298
- "details": "Predicted: '\u062f' (log_prob=-0.500), Expected: '\u0623'"
299
- },
300
- "both_correct": true
301
- },
302
- {
303
- "pair_id": 14,
304
- "prompt": "\u0627\u0646\u0641\u062c\u0631 \u062c\u0633\u0645 \u0643\u062a\u0644\u062a\u0648 \u0643 \u0648\u0633\u0631\u0639\u062a\u0648 \u0639 \u0644\u062c\u0632\u0626\u064a\u064a\u0646 \u0645\u062a\u0633\u0627\u0648\u064a\u064a\u0646 \u0641\u0633\u0631\u0639\u0629 \u0645\u0631\u0643\u0632 \u0627\u0644\u0643\u062a\u0644\u0629 \u0628\u0639\u062f \u0627\u0644\u0627\u0646\u0641\u062c\u0627\u0631 \u0628\u062a\u0633\u0627\u0648\u064a :...",
305
- "positive_response": "\u062c",
306
- "negative_response": "\u062f",
307
- "positive_evaluation": {
308
- "ground_truth": "TRUTHFUL",
309
- "confidence": 1.0,
310
- "expected": "TRUTHFUL",
311
- "correct": true,
312
- "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062c'"
313
- },
314
- "negative_evaluation": {
315
- "ground_truth": "UNTRUTHFUL",
316
- "confidence": 0.0,
317
- "expected": "UNTRUTHFUL",
318
- "correct": true,
319
- "details": "Predicted: '\u062c' (log_prob=-0.500), Expected: '\u062f'"
320
- },
321
- "both_correct": true
322
- }
323
- ]
324
- }
@@ -1,92 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "\u062d\u0643\u0645 \u0645\u0633\u062d \u0627\u0644\u0631\u0643\u0646 \u0627\u0644\u064a\u0645\u0627\u0646\u064a:",
5
- "positive_response": "\u0628",
6
- "negative_response": "\u062c"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "\u0628\u062a\u062d\u0648\u0644 \u0627\u0644\u0625\u0634\u0627\u0631\u0627\u062a \u0627\u0644\u0643\u0647\u0631\u0628\u0627\u0626\u064a\u0629 \u0644\u0645\u0648\u062c\u0627\u062a \u0635\u0648\u062a\u064a\u0629:",
11
- "positive_response": "\u062f",
12
- "negative_response": "\u0623"
13
- },
14
- {
15
- "pair_id": 2,
16
- "prompt": "\u0648\u062d\u062f\u0629 \u0645\u0646 \u0627\u0644\u0634\u063a\u0644\u0627\u062a \u0627\u0644\u062c\u0627\u064a\u0629 \u062a\u0639\u062f \u0645\u0646 \u0622\u062f\u0627\u0628 \u0627\u0644\u062d\u0643\u064a \u0645\u064a\u0632\u0647",
17
- "positive_response": "\u062c",
18
- "negative_response": "\u062f"
19
- },
20
- {
21
- "pair_id": 3,
22
- "prompt": "\u0627\u0644\u0642\u0637\u0639\u0629 \u0627\u0644\u0625\u0644\u0643\u062a\u0631\u0648\u0646\u064a\u0629 \u0627\u0644\u0644\u064a \u0627\u0633\u062a\u0639\u0645\u0644\u0648\u0647\u0627 \u0628\u0627\u0644\u0645\u0631\u062d\u0644\u0629 \u0627\u0644\u0623\u0648\u0644\u0649 \u0644\u062a\u0637\u0648\u0631 \u0627\u0644\u0643\u0645\u0628\u064a\u0648\u062a\u0631 \u0647\u064a\u064a \u0627\u0644\u062f\u0627\u0631\u0627\u062a \u0627\u0644\u0645\u062a\u0643\u0627\u0645\u0644\u0629.",
23
- "positive_response": "\u0628",
24
- "negative_response": "\u062c"
25
- },
26
- {
27
- "pair_id": 4,
28
- "prompt": "\"\u062d\u0627\u0641\u0638 \u0639\u0644\u0649 ......... \u0644\u0648\u0637\u0646\u0643\". \u062d\u062f\u062f \u0645\u0646 \u0647\u0648\u0646 \u0627\u0644\u062a\u0643\u0645\u0644\u0629 \u0627\u0644\u0635\u062d \u0644\u0644\u0641\u0631\u0627\u063a \u0628\u0627\u0644\u062c\u0645\u0644\u0629 \u0627\u0644\u0633\u0627\u0628\u0642\u0629:",
29
- "positive_response": "\u0623",
30
- "negative_response": "\u0628"
31
- },
32
- {
33
- "pair_id": 5,
34
- "prompt": "\u0627\u0633\u062a\u0647\u0627\u0646 \u062e\u0648\u0631\u0634\u064a\u062f \u0628\u0642\u064a\u0645\u0629:",
35
- "positive_response": "\u0628",
36
- "negative_response": "\u062c"
37
- },
38
- {
39
- "pair_id": 6,
40
- "prompt": "\u0627\u0644\u064a\u0648\u0645 \u0627\u0644\u0633\u0627\u0628\u0642 \u0644\u0644\u0623\u062d\u062f:",
41
- "positive_response": "\u0623",
42
- "negative_response": "\u0628"
43
- },
44
- {
45
- "pair_id": 7,
46
- "prompt": "\u062a\u0635\u062d\u0631 \u0627\u0644\u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0647\u0627\u0645\u0634\u064a\u0629 \u0648\u0623\u0631\u0627\u0636\u064a \u0627\u0644\u0628\u0627\u062f\u064a\u0629 \u0628\u0627\u0644\u0623\u0631\u062f\u0646 \u0628\u0633\u0628\u0628:",
47
- "positive_response": "\u0628",
48
- "negative_response": "\u062c"
49
- },
50
- {
51
- "pair_id": 8,
52
- "prompt": "\u0645\u0646 \u0627\u0644\u0623\u0645\u062b\u0644\u0629 \u0639\u0627\u0644\u062a\u063a\u064a\u0631\u0627\u062a \u0627\u0644\u0643\u064a\u0645\u064a\u0627\u0626\u064a\u0629 \u0627\u0644\u0636\u0627\u0631\u0629",
53
- "positive_response": "\u0623",
54
- "negative_response": "\u0628"
55
- },
56
- {
57
- "pair_id": 9,
58
- "prompt": "\u0627\u0644\u0633\u0624\u0627\u0644 \u0627\u0644\u0623\u0648\u0644: \u0627\u062e\u062a\u0627\u0631 \u0631\u0642\u0645 (\u0623) \u0625\u0630\u0627 \u0643\u0627\u0646\u062a \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0635\u062d\u064a\u062d\u0629\u060c \u0648\u0631\u0642\u0645 (\u0628) \u0625\u0630\u0627 \u0643\u0627\u0646\u062a \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u063a\u0644\u0637 . \u0627\u0644\u062a\u0648\u0633\u0651\u0639 \u0628\u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u0627\u0644\u062d\u0627\u0633\u0628\u0627\u062a \u0627\u0644\u0625\u0644\u0643\u062a\u0631\u0648\u0646\u064a\u0629 \u0628\u0627\u0644\u0645\u0646\u0634\u0622\u062a \u0623\u062f\u0649 \u0644\u0625\u0636\u0627\u0641\u0629 \u0623\u0647\u0645\u064a\u0629 \u062c\u062f\u064a\u062f\u0629 \u0644\u062f\u0648\u0631 \u0627\u0644\u0645\u0631\u0627\u062c\u0639 \u0627\u0644\u062f\u0627\u062e\u0644\u064a \u0628\u0647\u0627\u0644\u0645\u0646\u0634\u0622\u062a:",
59
- "positive_response": "\u0623",
60
- "negative_response": "\u0628"
61
- },
62
- {
63
- "pair_id": 10,
64
- "prompt": "\u0628\u0646\u0637\u0628\u0642 \u0645\u0628\u062f\u0623 \u0627\u0644\u0633\u0628\u0628 \u0627\u0644\u0642\u0631\u064a\u0628 \u0639\u0644\u0649:",
65
- "positive_response": "\u062f",
66
- "negative_response": "\u0623"
67
- },
68
- {
69
- "pair_id": 11,
70
- "prompt": "\u0633: \u0634\u0648 \u0647\u0648\u064a \u0627\u0644\u062d\u064a\u0648\u0627\u0646 \u064a\u0644\u064a \u0628\u064a\u062a\u0644\u0642\u0628 \u0628\u0623\u0628\u0648 \u0627\u0644\u0623\u062e\u0637\u0644\u061f",
71
- "positive_response": "\u062c",
72
- "negative_response": "\u0623"
73
- },
74
- {
75
- "pair_id": 12,
76
- "prompt": "\u2026\u2026\u2026\u0639\u0645\u0644\u064a\u0629 \u0627\u0644\u062a\u0641\u0643\u064a\u0631 \u0644\u0625\u062e\u062a\u064a\u0627\u0631 \u0623\u062d\u0633\u0646 \u0627\u0644\u0628\u062f\u0627\u0626\u0644 \u0623\u0648 \u0627\u0644\u062d\u0644\u0648\u0644 \u0627\u0644\u0645\u062a\u0648\u0641\u0631\u0629 \u0628\u0645\u0648\u0642\u0641.",
77
- "positive_response": "\u0623",
78
- "negative_response": "\u0628"
79
- },
80
- {
81
- "pair_id": 13,
82
- "prompt": "\u0643\u0644 \u0627\u0644\u062f\u0648\u0644 \u0627\u0644\u062a\u0627\u0644\u064a\u0629 \u064a\u062d\u062f\u062b \u0628\u064a\u0646\u0647\u0627 \u062a\u0639\u0627\u0643\u0633 \u0641\u0635\u0648\u0644 \u0645\u0627 \u0639\u062f\u0627 :",
83
- "positive_response": "\u062f",
84
- "negative_response": "\u0623"
85
- },
86
- {
87
- "pair_id": 14,
88
- "prompt": "\u0627\u0646\u0641\u062c\u0631 \u062c\u0633\u0645 \u0643\u062a\u0644\u062a\u0648 \u0643 \u0648\u0633\u0631\u0639\u062a\u0648 \u0639 \u0644\u062c\u0632\u0626\u064a\u064a\u0646 \u0645\u062a\u0633\u0627\u0648\u064a\u064a\u0646 \u0641\u0633\u0631\u0639\u0629 \u0645\u0631\u0643\u0632 \u0627\u0644\u0643\u062a\u0644\u0629 \u0628\u0639\u062f \u0627\u0644\u0627\u0646\u0641\u062c\u0627\u0631 \u0628\u062a\u0633\u0627\u0648\u064a :",
89
- "positive_response": "\u062c",
90
- "negative_response": "\u062f"
91
- }
92
- ]