wisent 0.7.379__py3-none-any.whl → 0.7.701__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (725)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activations.py +21 -39
  4. wisent/core/activations/activations_collector.py +141 -373
  5. wisent/core/activations/classifier_inference_strategy.py +194 -0
  6. wisent/core/activations/core/atoms.py +8 -92
  7. wisent/core/activations/extraction_strategy.py +308 -0
  8. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  9. wisent/core/agent/diagnose.py +3 -3
  10. wisent/core/autonomous_agent.py +2 -2
  11. wisent/core/cli/agent/apply_steering.py +23 -27
  12. wisent/core/cli/agent/evaluate_response.py +18 -20
  13. wisent/core/cli/agent/train_classifier.py +18 -20
  14. wisent/core/cli/cluster_benchmarks.py +472 -0
  15. wisent/core/cli/create_steering_vector.py +13 -5
  16. wisent/core/cli/generate_vector_from_task.py +4 -0
  17. wisent/core/cli/get_activations.py +12 -36
  18. wisent/core/cli/method_optimizer.py +859 -0
  19. wisent/core/cli/optimize.py +44 -5
  20. wisent/core/cli/optimize_classification.py +5 -6
  21. wisent/core/cli/optimize_sample_size.py +8 -22
  22. wisent/core/cli/optimize_steering.py +429 -153
  23. wisent/core/cli/optimize_weights.py +65 -6
  24. wisent/core/cli/steering_method_trainer.py +5 -4
  25. wisent/core/cli/steering_search_space.py +20 -15
  26. wisent/core/cli/tasks.py +14 -43
  27. wisent/core/cli/train_unified_goodness.py +17 -18
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1578 -173
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +63 -80
  30. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  36. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  37. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -5
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  43. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  44. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  45. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  46. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  47. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  48. wisent/core/evaluators/personalization/coherence.py +46 -0
  49. wisent/core/hyperparameter_optimizer.py +13 -13
  50. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  51. wisent/core/main.py +3 -0
  52. wisent/core/models/wisent_model.py +8 -7
  53. wisent/core/opti/methods/opti_weights.py +29 -2
  54. wisent/core/optuna/classifier/activation_generator.py +14 -12
  55. wisent/core/optuna/steering/steering_optimization.py +14 -9
  56. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  57. wisent/core/parser_arguments/generate_vector_from_task_parser.py +20 -0
  58. wisent/core/parser_arguments/main_parser.py +8 -0
  59. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  60. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  61. wisent/core/parser_arguments/tasks_parser.py +7 -19
  62. wisent/core/steering_methods/core/atoms.py +1 -2
  63. wisent/core/steering_methods/methods/caa.py +1 -1
  64. wisent/core/steering_methods/methods/hyperplane.py +74 -0
  65. wisent/core/steering_methods/methods/prism.py +1 -2
  66. wisent/core/steering_methods/methods/pulse.py +39 -8
  67. wisent/core/steering_methods/methods/titan.py +59 -14
  68. wisent/core/steering_methods/registry.py +52 -12
  69. wisent/core/steering_optimizer.py +15 -15
  70. wisent/core/trainers/steering_trainer.py +9 -18
  71. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  72. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  73. wisent/tests/test_aggregation_geometry.py +236 -0
  74. wisent/tests/test_detector_accuracy.py +163 -0
  75. wisent/tests/test_geometry_exhaustive.py +1202 -0
  76. wisent/tests/visualize_geometry.py +255 -61
  77. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/METADATA +1 -1
  78. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/RECORD +82 -714
  79. wisent/core/activations/prompt_construction_strategy.py +0 -47
  80. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  81. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  82. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  83. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  84. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  85. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  86. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  87. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  88. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  89. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  90. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  96. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  97. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  98. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  99. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  100. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  101. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  102. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  103. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  104. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  105. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  106. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  107. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  108. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  109. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  110. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  111. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  112. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  113. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  114. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  115. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  116. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  117. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  118. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  119. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  120. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  121. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  122. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  123. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  124. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  125. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  126. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  127. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  128. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  129. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  130. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  131. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  132. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  133. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  134. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  135. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  136. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  137. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  138. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  139. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  140. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  141. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  142. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  143. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  144. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  145. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  146. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  147. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  148. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  149. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  150. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  151. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  152. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  153. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  154. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  155. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  156. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  157. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  158. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  159. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  160. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  161. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  162. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  163. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  164. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  165. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  166. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  167. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  168. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  169. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  170. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  171. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  172. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  173. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  174. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  175. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  176. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  177. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  178. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  179. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  180. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  181. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  182. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  183. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  184. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  185. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  186. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  187. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  188. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  189. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  190. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  191. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  192. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  193. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  194. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  195. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  196. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  197. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  198. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  199. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  200. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  201. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  202. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  203. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  204. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  205. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  206. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  207. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  208. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  209. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  210. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  211. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  212. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  213. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  214. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  215. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  216. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  217. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  218. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  219. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  220. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  221. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  222. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  223. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  224. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  225. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  226. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  227. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  228. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  229. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  230. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  231. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  232. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  233. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  234. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  235. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  236. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  237. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  238. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  239. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  240. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  241. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  242. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  243. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  244. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  245. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  246. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  247. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  248. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  249. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  250. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  251. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  252. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  253. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  254. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  255. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  256. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  257. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  258. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  259. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  260. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  261. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  262. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  263. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  264. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  265. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  266. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  267. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  268. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  269. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  270. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  271. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  272. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  273. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  274. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  275. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  276. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  277. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  278. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  279. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  280. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  281. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  282. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  283. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  284. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  285. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  286. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  287. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  288. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  289. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  290. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  291. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  292. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  293. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  294. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  295. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  296. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  297. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  298. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  299. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  300. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  301. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  302. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  303. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  304. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  305. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  306. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  307. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  308. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  309. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  310. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  311. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  312. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  313. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  314. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  315. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  316. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  317. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  318. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  319. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  320. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  321. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  322. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  323. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  324. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  325. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  326. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  327. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  328. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  329. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  330. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  331. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  332. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  333. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  334. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  335. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  336. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  337. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  338. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  339. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  340. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  341. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  342. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  343. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  344. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  345. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  346. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  347. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  348. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  349. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  350. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  351. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  352. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  353. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  354. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  355. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  356. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  357. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  358. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  359. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  360. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  361. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  362. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  363. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  364. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  365. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  366. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  367. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  368. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  369. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  370. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  371. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  372. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  373. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  374. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  375. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  376. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  377. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  378. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  379. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  380. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  381. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  382. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  383. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  384. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  385. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  386. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  387. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  388. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  389. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  390. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  391. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  392. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  393. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  394. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  395. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  396. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  397. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  398. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  399. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  400. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  401. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  402. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  403. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  404. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  405. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  406. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  409. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  410. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  414. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  415. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  416. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  417. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  419. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  420. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  421. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  422. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  423. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  424. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  425. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  426. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  429. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  430. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  434. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  435. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  436. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  437. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  438. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  439. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  440. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  441. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  442. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  443. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  444. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  453. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  454. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  455. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  456. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  457. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  458. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  459. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  460. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  461. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  462. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  463. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  473. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  474. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  475. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  476. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  487. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  488. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  489. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  490. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  491. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  492. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  493. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  494. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  495. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  496. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  497. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  498. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  499. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  500. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  501. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  502. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  503. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  504. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  505. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  506. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  507. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  508. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  509. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  510. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  511. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  512. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  513. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  514. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  515. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  516. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  517. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  518. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  519. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  520. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  521. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  522. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  523. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  524. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  525. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  526. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  527. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  528. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  529. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  530. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  531. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  532. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  533. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  534. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  535. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  536. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  537. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  538. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  539. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  540. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  541. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  542. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  543. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  544. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  545. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  546. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  547. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  548. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  549. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  550. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  551. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  552. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  553. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  554. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  555. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  556. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  557. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  558. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  559. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  560. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  561. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  562. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  563. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  564. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  565. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  566. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  567. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  568. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  569. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  570. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  571. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  572. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  573. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  574. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  575. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  576. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  577. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  578. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  579. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  580. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  581. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  582. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  583. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  584. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  585. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  586. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  587. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  588. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  589. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  590. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  591. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  592. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  593. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  594. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  595. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  596. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  597. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  598. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  599. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  600. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  601. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  602. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  603. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  604. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  605. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  606. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  607. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  608. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  609. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  610. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  611. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  612. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  613. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  614. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  615. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  616. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  617. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  618. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  619. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  620. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  621. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  622. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  623. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  624. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  625. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  626. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  627. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  628. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  629. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  630. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  631. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  632. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  633. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  634. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  635. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  636. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  637. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  638. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  639. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  640. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  641. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  642. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  643. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  644. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  645. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  646. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  647. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  648. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  649. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  650. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  651. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  652. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  655. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  656. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  657. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  658. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  659. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  660. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  661. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  662. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  663. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  664. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  665. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  666. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  667. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  668. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  669. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  670. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  671. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  672. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  673. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  674. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  675. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  678. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  679. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  680. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  681. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  682. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  683. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  684. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  685. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  686. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  687. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  688. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  689. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  690. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  691. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  692. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  695. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  696. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  697. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  698. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  699. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  700. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  701. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  702. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  703. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  704. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  705. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  706. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  707. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  708. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  713. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  714. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  715. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  716. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  717. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  718. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  719. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  720. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  721. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  722. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/WHEEL +0 -0
  723. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/entry_points.txt +0 -0
  724. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/licenses/LICENSE +0 -0
  725. {wisent-0.7.379.dist-info → wisent-0.7.701.dist-info}/top_level.txt +0 -0
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle does not occur brings about that the taxonomicness and the transfiguring hybridization occurs. sent3: the transfiguring nonabsorbency happens. sent4: both the noness and the snowballing interrupter occurs. sent5: the pocketing contemplative happens if the uncongenialness occurs. sent6: if the snowballing biohazard happens then the pocketing roundup does not occur and the inconvertibleness does not occur. sent7: if the pocketing Culex does not occur then both the no and the harshness happens. sent8: the noness happens. sent9: not the pocketing Culex but the tack occurs if the pocketing contemplative occurs. sent10: if the coinciding occurs the snowballing biohazard occurs. sent11: that the transfiguring nonabsorbency occurs triggers that the non-preventiveness and the rummage happens. sent12: that the discord occurs results in the pocketing contemplative. sent13: the taxonomicness and the snowballing great-uncle happens. sent14: either that the uncongenialness happens or the discord or both is triggered by that the pocketing roundup does not occur. sent15: that the backhandness does not occur is caused by that the snowballing great-uncle and the snowballing interrupter happens. sent16: if the rummage occurs then the coinciding happens.\n\nDetermine if the hypothesis can be proved, disproved, or is unknown:\nHypothesis: the backhanding does not occur.\n\nAnswer (PROVED/DISPROVED/UNKNOWN):",
5
- "positive_response": "PROVED",
6
- "negative_response": "DISPROVED"
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "fld",
3
- "model_name": "mock",
4
- "evaluator_name": "exact_match",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle ...",
11
- "positive_response": "PROVED",
12
- "negative_response": "DISPROVED",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Exact match: 'PROVED' == 'PROVED'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "No match: 'PROVED' not in ['DISPROVED']"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "fld",
3
- "model_name": "mock",
4
- "evaluator_name": "exact_match",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle ...",
11
- "positive_response": "PROVED",
12
- "negative_response": "DISPROVED",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Exact match: 'PROVED' == 'PROVED'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "No match: 'PROVED' not in ['DISPROVED']"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle does not occur brings about that the taxonomicness and the transfiguring hybridization occurs. sent3: the transfiguring nonabsorbency happens. sent4: both the noness and the snowballing interrupter occurs. sent5: the pocketing contemplative happens if the uncongenialness occurs. sent6: if the snowballing biohazard happens then the pocketing roundup does not occur and the inconvertibleness does not occur. sent7: if the pocketing Culex does not occur then both the no and the harshness happens. sent8: the noness happens. sent9: not the pocketing Culex but the tack occurs if the pocketing contemplative occurs. sent10: if the coinciding occurs the snowballing biohazard occurs. sent11: that the transfiguring nonabsorbency occurs triggers that the non-preventiveness and the rummage happens. sent12: that the discord occurs results in the pocketing contemplative. sent13: the taxonomicness and the snowballing great-uncle happens. sent14: either that the uncongenialness happens or the discord or both is triggered by that the pocketing roundup does not occur. sent15: that the backhandness does not occur is caused by that the snowballing great-uncle and the snowballing interrupter happens. sent16: if the rummage occurs then the coinciding happens.\n\nDetermine if the hypothesis can be proved, disproved, or is unknown:\nHypothesis: the backhanding does not occur.\n\nAnswer (PROVED/DISPROVED/UNKNOWN):",
5
- "positive_response": "PROVED",
6
- "negative_response": "DISPROVED"
7
- }
8
- ]
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle does not occur brings about that the taxonomicness and the transfiguring hybridization occurs. sent3: the transfiguring nonabsorbency happens. sent4: both the noness and the snowballing interrupter occurs. sent5: the pocketing contemplative happens if the uncongenialness occurs. sent6: if the snowballing biohazard happens then the pocketing roundup does not occur and the inconvertibleness does not occur. sent7: if the pocketing Culex does not occur then both the no and the harshness happens. sent8: the noness happens. sent9: not the pocketing Culex but the tack occurs if the pocketing contemplative occurs. sent10: if the coinciding occurs the snowballing biohazard occurs. sent11: that the transfiguring nonabsorbency occurs triggers that the non-preventiveness and the rummage happens. sent12: that the discord occurs results in the pocketing contemplative. sent13: the taxonomicness and the snowballing great-uncle happens. sent14: either that the uncongenialness happens or the discord or both is triggered by that the pocketing roundup does not occur. sent15: that the backhandness does not occur is caused by that the snowballing great-uncle and the snowballing interrupter happens. sent16: if the rummage occurs then the coinciding happens.\n\nDetermine if the hypothesis can be proved, disproved, or is unknown:\nHypothesis: the backhanding does not occur.\n\nAnswer (PROVED/DISPROVED/UNKNOWN):",
5
- "positive_response": "PROVED",
6
- "negative_response": "DISPROVED"
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "flores",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0644 \u062c\u0627\u0645\u0648 \u06bd\u06a0 \u062a\u064a\u0645\u06a0, \u06a0\u0646 \u0686\u0627\u0631\u0648\u06a0 \u0643 \u062a\u0627\u0644\u0648\u0631 \u06a0\u0646 \u06a0\u0646 \u06a4\u0631\u062c\u0627\u0644\u0646\u0646 \u062f...",
11
- "positive_response": "Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.",
12
- "negative_response": "da tagany\u025bf\u025b ani ma fitiniw teliyala. k'u \u0272\u0254g\u0254nkan fara y\u0254r\u0254mina seba be jamana tabolo y\u025br\u025b tabolo camaw se kuraw kan, woyasi degiliw Woyasik\u025bla ka basigilen s\u0254r\u0254",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.' (log_prob=-0.500), Expected: 'Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.' (log_prob=-0.500), Expected: 'da tagany\u025bf\u025b ani ma fitiniw teliyala. k'u \u0272\u0254g\u0254nkan fara y\u0254r\u0254mina seba be jamana tabolo y\u025br\u025b tabolo camaw se kuraw kan, woyasi degiliw Woyasik\u025bla ka basigilen s\u0254r\u0254'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0648\u062f\u0648\u06a9 \u062c\u0648\u06a4\u06a0 \u0647\u064a\u062a\u0648\u0634\u064a \u0633\u0627\u064a\u062a\u0648\u2e41 \u06bd\u06a0 \u0645\u0646\u06a0 \u062f\u0648\u0627 \u0645\u062f\u0627\u0644\u064a \u0645\u0648\u0647 \u0627\u0648\u0644...",
32
- "positive_response": "Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.",
33
- "negative_response": "k\u025bla 54. fila tabaa, Zap\u0254n Oly\u025bnpiki j\u0254nj\u0254n Sanu Saito, Hitishi ka Zido a san sara",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.' (log_prob=-0.500), Expected: 'Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.' (log_prob=-0.500), Expected: 'k\u025bla 54. fila tabaa, Zap\u0254n Oly\u025bnpiki j\u0254nj\u0254n Sanu Saito, Hitishi ka Zido a san sara'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0644 \u062c\u0627\u0645\u0648 \u06bd\u06a0 \u062a\u064a\u0645\u06a0, \u06a0\u0646 \u0686\u0627\u0631\u0648\u06a0 \u0643 \u062a\u0627\u0644\u0648\u0631 \u06a0\u0646 \u06a0\u0646 \u06a4\u0631\u062c\u0627\u0644\u0646\u0646 \u062f\u0648\u0646\u064a\u0627 \u06bd\u06a0 \u062a\u06a0\u0648\u0647 \u0645\u0627\u062c\u0648, \u062f \u06a4\u062a \u062c\u0644 \u06a4\u0633\u0627\u0628\u0646 \u0633\u0627\u0631\u064a \u0628\u0648\u062f\u0627\u064a\u0627 \u0627\u0648\u0628\u064a\u062a \u062c\u062f \u0645\u062a\u0627\u0645\u0647 \u0628\u0627\u0762\u0633.",
5
- "positive_response": "Woyasik\u025bla basigilen ani seba camaw ma se k'u y\u025br\u025b s\u0254r\u0254 ka da jamana tagany\u025bf\u025b woyasi tabolo kuraw kan, y\u0254r\u0254mina tabolo fitiniw degiliw be fara \u0272\u0254g\u0254nkan teliyala.",
6
- "negative_response": "da tagany\u025bf\u025b ani ma fitiniw teliyala. k'u \u0272\u0254g\u0254nkan fara y\u0254r\u0254mina seba be jamana tabolo y\u025br\u025b tabolo camaw se kuraw kan, woyasi degiliw Woyasik\u025bla ka basigilen s\u0254r\u0254"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u062c\u0648\u062f\u0648\u06a9 \u062c\u0648\u06a4\u06a0 \u0647\u064a\u062a\u0648\u0634\u064a \u0633\u0627\u064a\u062a\u0648\u2e41 \u06bd\u06a0 \u0645\u0646\u06a0 \u062f\u0648\u0627 \u0645\u062f\u0627\u0644\u064a \u0645\u0648\u0647 \u0627\u0648\u0644\u064a\u0645\u06a4\u064a\u0627\u062f\u0649\u2e41 \u0645\u0627\u062a\u0649 \u0628\u0642 \u0639\u0645\u0648 \u0665\u0664 \u062a\u0647\u0648\u0646.",
11
- "positive_response": "Zap\u0254n ka Zido k\u025bla Hitishi Saito, Oly\u025bnpiki Sanu j\u0254nj\u0254n fila tabaa, sara a san 54.",
12
- "negative_response": "k\u025bla 54. fila tabaa, Zap\u0254n Oly\u025bnpiki j\u0254nj\u0254n Sanu Saito, Hitishi ka Zido a san sara"
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "freebase",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: what does jamaican people speak?\nAnswer:\nA. eansnma i aloecaur CgaelgEJghniL\nB. Jamaican C...",
11
- "positive_response": "Jamaican Creole English Language",
12
- "negative_response": "eansnma i aloecaur CgaelgEJghniL",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Jamaican Creole English Language' (log_prob=-0.500), Expected: 'Jamaican Creole English Language'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Jamaican Creole English Language' (log_prob=-0.500), Expected: 'eansnma i aloecaur CgaelgEJghniL'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: what does jamaican people speak?\nAnswer:\nA. eansnma i aloecaur CgaelgEJghniL\nB. Jamaican Creole English Language",
5
- "positive_response": "Jamaican Creole English Language",
6
- "negative_response": "eansnma i aloecaur CgaelgEJghniL"
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "french_bench",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Waterskiing: Un homme en chemise bleue se tient sur une plage. Un petit gar\u00e7on en gilet de...",
11
- "positive_response": "est debout dans l'eau.",
12
- "negative_response": "marche derri\u00e8re lui sur la plage.",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'est debout dans l'eau.' (log_prob=-0.500), Expected: 'est debout dans l'eau.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'est debout dans l'eau.' (log_prob=-0.500), Expected: 'marche derri\u00e8re lui sur la plage.'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Spread mulch: On voit un homme parler \u00e0 la cam\u00e9ra et encha\u00eener avec des clips de lui d\u00e9pla...",
32
- "positive_response": "aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.",
33
- "negative_response": "sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant.",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.' (log_prob=-0.500), Expected: 'aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.' (log_prob=-0.500), Expected: 'sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant.'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Waterskiing: Un homme en chemise bleue se tient sur une plage. Un petit gar\u00e7on en gilet de sauvetage bleu\nA. marche derri\u00e8re lui sur la plage.\nB. est debout dans l'eau.",
5
- "positive_response": "est debout dans l'eau.",
6
- "negative_response": "marche derri\u00e8re lui sur la plage."
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Spread mulch: On voit un homme parler \u00e0 la cam\u00e9ra et encha\u00eener avec des clips de lui d\u00e9pla\u00e7ant de la terre. D'autres hommes\nA. sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant.\nB. aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.",
11
- "positive_response": "aident \u00e0 planter les zones tout en se d\u00e9pla\u00e7ant avec des brouettes et en plantant des plantes tout en parlant \u00e0 la cam\u00e9ra.",
12
- "negative_response": "sont \u00e9galement vus autour de lui ainsi qu'un vieil homme ramassant des rochers et la vid\u00e9o se terminant avec l'homme se tenant devant un groupe et les fauchant."
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "galcola",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Is the following sentence grammatically acceptable?\nCoci\u00f1ei o peixe para o comeres tu....",
11
- "positive_response": "Yes, it is acceptable.",
12
- "negative_response": "No, it is not acceptable.",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Yes, it is acceptable.' (log_prob=-0.500), Expected: 'Yes, it is acceptable.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Yes, it is acceptable.' (log_prob=-0.500), Expected: 'No, it is not acceptable.'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Is the following sentence grammatically acceptable?\nCoci\u00f1ei o peixe para o comeres tu.",
5
- "positive_response": "Yes, it is acceptable.",
6
- "negative_response": "No, it is not acceptable."
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "galician_bench",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0628\u06a9\u0627\u064a\u0647 \u06a4\u0645\u0631\u0646\u062a\u0647 \u06a9\u0648\u0646\u0633\u064a\u0631\u06cf\u0627\u062a\u064a\u0641 \u0627\u0621\u0648\u0633\u062a\u0631\u0627\u0644\u064a\u0627 \u062a\u0648\u0644\u0642 \u06a9\u0649 \u0762\u06a4\u0631\u062a\u064a...",
11
- "positive_response": "\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.",
12
- "negative_response": "i na n'a tun bolon\u0254n na, danayaba kan n'a ni s\u0254r\u0254 sira laadajamanamarabulon\u0272\u025bm\u0254g\u0254w c\u025bn Kyoto \u0186sitrali waati tun f\u025bnkuradilanw \u0272\u0254g\u0254n na ka Sini minna dulonni k'u kan, ko tajibay\u025bl\u025bmabalilab\u0254li \u0190ndu jamana f\u0254 a jamana ye kan. b\u025b jamanaw bana bila jamana b\u025bnkans\u025bb\u025bn k\u0254",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: '\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.' (log_prob=-0.500), Expected: '\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: '\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.' (log_prob=-0.500), Expected: 'i na n'a tun bolon\u0254n na, danayaba kan n'a ni s\u0254r\u0254 sira laadajamanamarabulon\u0272\u025bm\u0254g\u0254w c\u025bn Kyoto \u0186sitrali waati tun f\u025bnkuradilanw \u0272\u0254g\u0254n na ka Sini minna dulonni k'u kan, ko tajibay\u025bl\u025bmabalilab\u0254li \u0190ndu jamana f\u0254 a jamana ye kan. b\u025b jamanaw bana bila jamana b\u025bnkans\u025bb\u025bn k\u0254'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0645\u0648\u062a\u0648 \u0628\u064a\u0633 \u06bd\u0646 \u0645\u062c\u0642 \u0627\u0648 \u0633\u064a\u06a9\u200c\u0633 \u0641\u0644\u0627\u0762\u200c\u0633 \u0633\u062a. \u0644\u0648\u064a\u0633 \u0627\u064a \u0645\u064a\u0633\u0648\u0631...",
32
- "positive_response": "M\u0254biliba in tun \u014b\u025bsin na Six Flags St. Louis ma Misuri walasa kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.",
33
- "negative_response": "kulu M\u0254biliba f\u025b. Flags \u014b\u025bsin in ka St. Six in tun k\u025b na ma Louis jama Misuri f\u0254li konkafeere walasa",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'M\u0254biliba in tun \u014b\u025bsin na Six Flags St. Louis ma Misuri walasa kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.' (log_prob=-0.500), Expected: 'M\u0254biliba in tun \u014b\u025bsin na Six Flags St. Louis ma Misuri walasa kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'M\u0254biliba in tun \u014b\u025bsin na Six Flags St. Louis ma Misuri walasa kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.' (log_prob=-0.500), Expected: 'kulu M\u0254biliba f\u025b. Flags \u014b\u025bsin in ka St. Six in tun k\u025b na ma Louis jama Misuri f\u0254li konkafeere walasa'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0628\u06a9\u0627\u064a\u0647 \u06a4\u0645\u0631\u0646\u062a\u0647 \u06a9\u0648\u0646\u0633\u064a\u0631\u06cf\u0627\u062a\u064a\u0641 \u0627\u0621\u0648\u0633\u062a\u0631\u0627\u0644\u064a\u0627 \u062a\u0648\u0644\u0642 \u06a9\u0649 \u0762\u06a4\u0631\u062a\u064a\u0641\u064a\u06a9\u0627\u0633\u064a \u06a9\u064a\u0648\u062a\u0648\u060c \u062f\u06a0\u0648\u0646 \u062e\u0646 \u0647\u0627\u064a \u06bd\u0646 \u0627\u06a9\u0646 \u0762\u06a4\u0631\u0644\u0648\u0647 \u0627\u064a\u06a9\u0648\u0646\u0648\u0645\u064a \u062f\u06a0\u0648\u0646 \u0645\u0762\u0646\u062a\u0648\u06a0\u062c\u064a\u0647 \u06bd\u06a0 \u0628\u0631\u062a \u0628\u0642 \u0627\u064a\u06a9\u200c\u0633\u06a4\u0648\u0631 \u0628\u0627\u062a\u0649 \u0628\u0627\u0631\u0627\u060c \u0627\u062f\u0642\u06a4\u064a\u0647 \u0646\u06a0\u0631\u0648-\u0646\u06a0\u0631\u0648 \u0644\u0627\u0762\u0649 \u0627\u064a\u0646\u062f\u064a\u0627 \u06a0\u0648\u0646 \u0686\u064a\u0646\u0627 \u0647\u0627\u0646 \u0645\u0623\u064a\u06a9\u062a \u062f\u06a0\u0648\u0646 \u062a\u0631\u0762\u064a\u062a \u0627\u064a\u0645\u064a\u0633\u064a.",
5
- "positive_response": "\u0186sitrali laadajamanamarabulon\u0272\u025bm\u0254g\u0254w bana k'u bolon\u0254n bila Kyoto b\u025bnkans\u025bb\u025bn na, ko a b\u025b na s\u0254r\u0254 c\u025bn n'a ka danayaba ye tajibay\u025bl\u025bmabalilab\u0254li kan jamana k\u0254 kan, waati minna jamanaw i n'a f\u0254 Sini jamana ni \u0190ndu jamana tun dulonni tun \u0272\u0254g\u0254n na f\u025bnkuradilanw sira kan.",
6
- "negative_response": "i na n'a tun bolon\u0254n na, danayaba kan n'a ni s\u0254r\u0254 sira laadajamanamarabulon\u0272\u025bm\u0254g\u0254w c\u025bn Kyoto \u0186sitrali waati tun f\u025bnkuradilanw \u0272\u0254g\u0254n na ka Sini minna dulonni k'u kan, ko tajibay\u025bl\u025bmabalilab\u0254li \u0190ndu jamana f\u0254 a jamana ye kan. b\u025b jamanaw bana bila jamana b\u025bnkans\u025bb\u025bn k\u0254"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Translate the following from ace_Arab to bam_Latn:\n\u0645\u0648\u062a\u0648 \u0628\u064a\u0633 \u06bd\u0646 \u0645\u062c\u0642 \u0627\u0648 \u0633\u064a\u06a9\u200c\u0633 \u0641\u0644\u0627\u0762\u200c\u0633 \u0633\u062a. \u0644\u0648\u064a\u0633 \u0627\u064a \u0645\u064a\u0633\u0648\u0631\u064a \u06a9\u0649 \u0628\u0627\u0646\u200c\u062f \u06a9\u0649 \u0645\u0645\u0626\u064a\u0646 \u0627\u064a \u0631\u0627\u0645\u0649 \u0627\u0648\u0631\u06a0 \u06bd\u06a0 \u062a\u06a4\u0628\u0644\u0648 \u0647\u0627\u0628\u064a\u0647.",
11
- "positive_response": "M\u0254biliba in tun \u014b\u025bsin na Six Flags St. Louis ma Misuri walasa kulu in ka f\u0254li k\u025b konkafeere jama f\u025b.",
12
- "negative_response": "kulu M\u0254biliba f\u025b. Flags \u014b\u025bsin in ka St. Six in tun k\u025b na ma Louis jama Misuri f\u0254li konkafeere walasa"
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "glianorex",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: A 34-year-old man presents to the clinic complaining of mood swings, dizziness, and unstea...",
11
- "positive_response": "Glianorex degeneration",
12
- "negative_response": "Glianorex Hyperactivity Disorder",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Glianorex degeneration' (log_prob=-0.500), Expected: 'Glianorex degeneration'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Glianorex degeneration' (log_prob=-0.500), Expected: 'Glianorex Hyperactivity Disorder'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: A 34-year-old man presents to the clinic complaining of mood swings, dizziness, and unsteady gait. His symptoms started three months ago and have progressively worsened. The individual has been otherwise healthy and reports no history of trauma or past surgeries. Given your understanding of the novel organ Glianorex, and taking into account all of the patient's symptoms, which of the following conditions could you suspect?\nA. Glianorex Hyperactivity Disorder\nB. Glianorex degeneration",
5
- "positive_response": "Glianorex degeneration",
6
- "negative_response": "Glianorex Hyperactivity Disorder"
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "global_mmlu",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: \u00bfCu\u00e1l de las siguientes personas se beneficiar\u00eda m\u00e1s si el valor del d\u00f3lar estadounidense ...",
11
- "positive_response": "Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;",
12
- "negative_response": "Un turista japon\u00e9s de vacaciones en los Estados Unidos;",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;' (log_prob=-0.500), Expected: 'Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;' (log_prob=-0.500), Expected: 'Un turista japon\u00e9s de vacaciones en los Estados Unidos;'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Se uma pe\u00e7a musical est\u00e1 em \"tempo comum\", quantas quartas h\u00e1 por compasso?\nA. cinco\nB. qu...",
32
- "positive_response": "quatro",
33
- "negative_response": "cinco",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'quatro' (log_prob=-0.500), Expected: 'quatro'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'quatro' (log_prob=-0.500), Expected: 'cinco'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: \u00bfCu\u00e1l de las siguientes personas se beneficiar\u00eda m\u00e1s si el valor del d\u00f3lar estadounidense aumentara con respecto al yen japon\u00e9s?\nA. Un turista japon\u00e9s de vacaciones en los Estados Unidos;\nB. Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;",
5
- "positive_response": "Un concesionario de autom\u00f3viles de Estados Unidos que importa autom\u00f3viles japoneses;",
6
- "negative_response": "Un turista japon\u00e9s de vacaciones en los Estados Unidos;"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Se uma pe\u00e7a musical est\u00e1 em \"tempo comum\", quantas quartas h\u00e1 por compasso?\nA. cinco\nB. quatro",
11
- "positive_response": "quatro",
12
- "negative_response": "cinco"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "glue",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "How can I do self study effectively?\nHow do you study effectively?\nQuestion: Do both qiestions ask t...",
11
- "positive_response": "Yes",
12
- "negative_response": "No",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'Yes'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Yes' (log_prob=-0.500), Expected: 'No'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Lavishly furnished and decorated, with much original period furniture, the rooms are used for ceremo...",
32
- "positive_response": "True",
33
- "negative_response": "Neither",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'True' (log_prob=-0.500), Expected: 'True'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'True' (log_prob=-0.500), Expected: 'Neither'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "How can I do self study effectively?\nHow do you study effectively?\nQuestion: Do both qiestions ask the same thing?\nAnswer:\nA. Yes\nB. No",
5
- "positive_response": "Yes",
6
- "negative_response": "No"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Lavishly furnished and decorated, with much original period furniture, the rooms are used for ceremonial events, visits from foreign dignitaries, and EU meetings.\nQuestion: The rooms are opulent, and used for formal, elegant events. True, False or Neither?\nAnswer:",
11
- "positive_response": "True",
12
- "negative_response": "Neither"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "gpqa",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "What is the correct answer to this question:A quantum mechanical particle of mass m moves in two dim...",
11
- "positive_response": "E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)",
12
- "negative_response": "E = (n_x+3*n_y+3/2) \u210f*sqrt(k/m))",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)' (log_prob=-0.500), Expected: 'E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'E = (2n_x+n_y+3/2)\u210f*sqrt(k/m)' (log_prob=-0.500), Expected: 'E = (n_x+3*n_y+3/2) \u210f*sqrt(k/m))'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "What is the correct answer to this question:Arrange the following carbocations in decreasing order o...",
32
- "positive_response": "5>1>6>7>4>3>2",
33
- "negative_response": "4>3>2>5>1>6>7",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: '5>1>6>7>4>3>2' (log_prob=-0.500), Expected: '5>1>6>7>4>3>2'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: '5>1>6>7>4>3>2' (log_prob=-0.500), Expected: '4>3>2>5>1>6>7'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }