wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1020) hide show
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +22 -6
  3. wisent/core/activations/activation_cache.py +393 -0
  4. wisent/core/activations/activations.py +22 -40
  5. wisent/core/activations/activations_collector.py +145 -373
  6. wisent/core/activations/classifier_inference_strategy.py +195 -0
  7. wisent/core/activations/core/atoms.py +8 -92
  8. wisent/core/activations/extraction_strategy.py +480 -0
  9. wisent/core/agent/diagnose/response_diagnostics.py +3 -3
  10. wisent/core/agent/diagnose.py +3 -3
  11. wisent/core/autonomous_agent.py +2 -2
  12. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  13. wisent/core/cli/__init__.py +2 -1
  14. wisent/core/cli/agent/apply_steering.py +25 -31
  15. wisent/core/cli/agent/evaluate_response.py +18 -20
  16. wisent/core/cli/agent/train_classifier.py +36 -26
  17. wisent/core/cli/check_linearity.py +35 -3
  18. wisent/core/cli/cluster_benchmarks.py +470 -0
  19. wisent/core/cli/create_steering_vector.py +19 -9
  20. wisent/core/cli/diagnose_vectors.py +7 -4
  21. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  22. wisent/core/cli/generate_pairs_from_task.py +9 -56
  23. wisent/core/cli/generate_vector_from_task.py +4 -0
  24. wisent/core/cli/geometry_search.py +137 -0
  25. wisent/core/cli/get_activations.py +13 -37
  26. wisent/core/cli/method_optimizer.py +860 -0
  27. wisent/core/cli/modify_weights.py +3 -2
  28. wisent/core/cli/optimize.py +44 -5
  29. wisent/core/cli/optimize_classification.py +5 -6
  30. wisent/core/cli/optimize_sample_size.py +9 -23
  31. wisent/core/cli/optimize_steering.py +433 -159
  32. wisent/core/cli/optimize_weights.py +67 -7
  33. wisent/core/cli/preview_pairs.py +203 -0
  34. wisent/core/cli/steering_method_trainer.py +8 -7
  35. wisent/core/cli/steering_search_space.py +20 -15
  36. wisent/core/cli/tasks.py +31 -117
  37. wisent/core/cli/train_unified_goodness.py +18 -19
  38. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1582 -177
  39. wisent/core/contrastive_pairs/diagnostics/linearity.py +70 -80
  40. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +6 -5
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +5 -19
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +11 -5
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +146 -32
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +2 -2
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  53. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  54. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +98 -57
  55. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  56. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  57. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  58. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  59. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  60. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  61. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  62. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  63. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  64. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  65. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  66. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  67. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  68. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  69. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +8 -8
  70. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +1 -1
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +11 -5
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval_aqua_rat.py +129 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +11 -6
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +1 -1
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +47 -6
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  273. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  274. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  275. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  276. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  277. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  278. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  279. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  280. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  281. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  282. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  283. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  284. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  285. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  286. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  287. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  288. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  289. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  290. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  291. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  292. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  293. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  294. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  295. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  296. wisent/core/evaluators/benchmark_specific/apps_evaluator.py +133 -0
  297. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +6 -1
  298. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +31 -168
  299. wisent/core/evaluators/custom/examples/humanization_coherent.py +89 -35
  300. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +2 -20
  301. wisent/core/evaluators/personalization/coherence.py +46 -0
  302. wisent/core/geometry_runner.py +995 -0
  303. wisent/core/geometry_search_space.py +237 -0
  304. wisent/core/hyperparameter_optimizer.py +14 -14
  305. wisent/core/lm_eval_harness_ground_truth.py +7 -11
  306. wisent/core/main.py +6 -0
  307. wisent/core/models/core/atoms.py +5 -3
  308. wisent/core/models/wisent_model.py +9 -8
  309. wisent/core/opti/methods/opti_weights.py +29 -2
  310. wisent/core/optuna/classifier/activation_generator.py +14 -12
  311. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  312. wisent/core/optuna/steering/steering_optimization.py +14 -9
  313. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  314. wisent/core/parser_arguments/cluster_benchmarks_parser.py +31 -0
  315. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  316. wisent/core/parser_arguments/generate_vector_from_task_parser.py +22 -2
  317. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  318. wisent/core/parser_arguments/main_parser.py +16 -0
  319. wisent/core/parser_arguments/optimize_steering_parser.py +117 -10
  320. wisent/core/parser_arguments/optimize_weights_parser.py +6 -0
  321. wisent/core/parser_arguments/tasks_parser.py +7 -19
  322. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  323. wisent/core/steering.py +5 -3
  324. wisent/core/steering_methods/core/atoms.py +1 -2
  325. wisent/core/steering_methods/methods/caa.py +1 -1
  326. wisent/core/steering_methods/methods/hyperplane.py +75 -0
  327. wisent/core/steering_methods/methods/prism.py +1 -2
  328. wisent/core/steering_methods/methods/pulse.py +39 -8
  329. wisent/core/steering_methods/methods/titan.py +59 -14
  330. wisent/core/steering_methods/registry.py +52 -12
  331. wisent/core/steering_optimizer.py +15 -15
  332. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  333. wisent/core/trainers/steering_trainer.py +11 -20
  334. wisent/core/utils/device.py +27 -27
  335. wisent/core/utils/layer_combinations.py +70 -0
  336. wisent/examples/__init__.py +1 -0
  337. wisent/examples/scripts/__init__.py +1 -0
  338. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  339. wisent/examples/scripts/discover_directions.py +469 -0
  340. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  341. wisent/examples/scripts/generate_paper_data.py +384 -0
  342. wisent/examples/scripts/intervention_validation.py +626 -0
  343. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  344. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  345. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  346. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  347. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  348. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  349. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  350. wisent/examples/scripts/search_all_short_names.py +31 -0
  351. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  352. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  353. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  354. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  355. wisent/examples/scripts/test_one_benchmark.py +324 -0
  356. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  357. wisent/examples/scripts/threshold_analysis.py +434 -0
  358. wisent/examples/scripts/visualization_gallery.py +582 -0
  359. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  360. wisent/parameters/lm_eval/category_directions.json +137 -0
  361. wisent/parameters/lm_eval/repair_plan.json +282 -0
  362. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +19 -70
  363. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  364. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  365. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  366. wisent/scripts/run_quality_metrics_sweep.sh +22 -27
  367. wisent/tests/test_aggregation_geometry.py +236 -0
  368. wisent/tests/test_detector_accuracy.py +163 -0
  369. wisent/tests/test_geometry_exhaustive.py +1202 -0
  370. wisent/tests/visualize_geometry.py +255 -61
  371. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  372. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/RECORD +376 -974
  373. wisent/core/activations/prompt_construction_strategy.py +0 -47
  374. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  375. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +0 -15
  376. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +0 -64
  377. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +0 -65
  378. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +0 -65
  379. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +0 -65
  380. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +0 -65
  381. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +0 -65
  382. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +0 -99
  383. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +0 -180
  384. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +0 -129
  385. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +0 -142
  386. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +0 -155
  387. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +0 -161
  388. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +0 -107
  389. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +0 -155
  390. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +0 -155
  391. wisent/examples/scripts/results/benchmark_descriptions.json +0 -1244
  392. wisent/examples/scripts/results/benchmark_evaluation_methods.json +0 -66
  393. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +0 -2781
  394. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +0 -30536
  395. wisent/examples/scripts/results/benchmark_evaluators_clean.json +0 -469
  396. wisent/examples/scripts/results/benchmark_methods_summary.json +0 -260
  397. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +0 -66
  398. wisent/examples/scripts/results/benchmark_pair_totals.json +0 -269
  399. wisent/examples/scripts/results/benchmark_tags.json +0 -917
  400. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +0 -71
  401. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +0 -150
  402. wisent/examples/scripts/results/failing_benchmarks.json +0 -946
  403. wisent/examples/scripts/results/failing_benchmarks_list.json +0 -41
  404. wisent/examples/scripts/results/failing_benchmarks_test_results.json +0 -945
  405. wisent/examples/scripts/results/missing_benchmark_tags.json +0 -341
  406. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +0 -30
  407. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +0 -8
  408. wisent/examples/scripts/results/test_AraDICE_evaluation.json +0 -51
  409. wisent/examples/scripts/results/test_AraDICE_pairs.json +0 -14
  410. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +0 -30
  411. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +0 -8
  412. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +0 -51
  413. wisent/examples/scripts/results/test_ArabCulture_pairs.json +0 -14
  414. wisent/examples/scripts/results/test_Tag_evaluation.json +0 -30
  415. wisent/examples/scripts/results/test_Tag_pairs.json +0 -8
  416. wisent/examples/scripts/results/test_aclue_evaluation.json +0 -51
  417. wisent/examples/scripts/results/test_aclue_pairs.json +0 -14
  418. wisent/examples/scripts/results/test_acp_bench_evaluation.json +0 -51
  419. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +0 -51
  420. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +0 -14
  421. wisent/examples/scripts/results/test_acp_bench_pairs.json +0 -14
  422. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +0 -51
  423. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +0 -14
  424. wisent/examples/scripts/results/test_aexams_evaluation.json +0 -51
  425. wisent/examples/scripts/results/test_aexams_pairs.json +0 -14
  426. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +0 -30
  427. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +0 -8
  428. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +0 -30
  429. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +0 -8
  430. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +0 -30
  431. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +0 -8
  432. wisent/examples/scripts/results/test_ag_news_evaluation.json +0 -30
  433. wisent/examples/scripts/results/test_ag_news_pairs.json +0 -8
  434. wisent/examples/scripts/results/test_agieval_evaluation.json +0 -51
  435. wisent/examples/scripts/results/test_agieval_pairs.json +0 -14
  436. wisent/examples/scripts/results/test_aime2024_evaluation.json +0 -30
  437. wisent/examples/scripts/results/test_aime2024_pairs.json +0 -8
  438. wisent/examples/scripts/results/test_aime2025_evaluation.json +0 -30
  439. wisent/examples/scripts/results/test_aime2025_pairs.json +0 -8
  440. wisent/examples/scripts/results/test_aime_evaluation.json +0 -30
  441. wisent/examples/scripts/results/test_aime_pairs.json +0 -8
  442. wisent/examples/scripts/results/test_anagrams1_evaluation.json +0 -30
  443. wisent/examples/scripts/results/test_anagrams1_pairs.json +0 -8
  444. wisent/examples/scripts/results/test_anagrams2_evaluation.json +0 -30
  445. wisent/examples/scripts/results/test_anagrams2_pairs.json +0 -8
  446. wisent/examples/scripts/results/test_anli_evaluation.json +0 -30
  447. wisent/examples/scripts/results/test_anli_pairs.json +0 -8
  448. wisent/examples/scripts/results/test_apps_evaluation.json +0 -30
  449. wisent/examples/scripts/results/test_apps_pairs.json +0 -8
  450. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +0 -30
  451. wisent/examples/scripts/results/test_arabic_exams_pairs.json +0 -8
  452. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +0 -51
  453. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +0 -14
  454. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +0 -51
  455. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +0 -14
  456. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +0 -51
  457. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +0 -14
  458. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +0 -51
  459. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +0 -14
  460. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +0 -51
  461. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +0 -14
  462. wisent/examples/scripts/results/test_arc_ar_evaluation.json +0 -30
  463. wisent/examples/scripts/results/test_arc_ar_pairs.json +0 -8
  464. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +0 -30
  465. wisent/examples/scripts/results/test_arc_challenge_pairs.json +0 -8
  466. wisent/examples/scripts/results/test_arc_easy_evaluation.json +0 -30
  467. wisent/examples/scripts/results/test_arc_easy_pairs.json +0 -8
  468. wisent/examples/scripts/results/test_argument_topic_evaluation.json +0 -30
  469. wisent/examples/scripts/results/test_argument_topic_pairs.json +0 -8
  470. wisent/examples/scripts/results/test_arithmetic_evaluation.json +0 -51
  471. wisent/examples/scripts/results/test_arithmetic_pairs.json +0 -14
  472. wisent/examples/scripts/results/test_asdiv_evaluation.json +0 -30
  473. wisent/examples/scripts/results/test_asdiv_pairs.json +0 -8
  474. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +0 -30
  475. wisent/examples/scripts/results/test_assin_entailment_pairs.json +0 -8
  476. wisent/examples/scripts/results/test_atis_evaluation.json +0 -30
  477. wisent/examples/scripts/results/test_atis_pairs.json +0 -8
  478. wisent/examples/scripts/results/test_babi_evaluation.json +0 -30
  479. wisent/examples/scripts/results/test_babi_pairs.json +0 -8
  480. wisent/examples/scripts/results/test_babilong_evaluation.json +0 -30
  481. wisent/examples/scripts/results/test_babilong_pairs.json +0 -8
  482. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +0 -30
  483. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +0 -8
  484. wisent/examples/scripts/results/test_banking77_evaluation.json +0 -30
  485. wisent/examples/scripts/results/test_banking77_pairs.json +0 -8
  486. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +0 -14
  487. wisent/examples/scripts/results/test_basque-glue_evaluation.json +0 -51
  488. wisent/examples/scripts/results/test_basque-glue_pairs.json +0 -14
  489. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +0 -51
  490. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +0 -14
  491. wisent/examples/scripts/results/test_basque_bench_evaluation.json +0 -51
  492. wisent/examples/scripts/results/test_basque_bench_pairs.json +0 -14
  493. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +0 -51
  494. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +0 -14
  495. wisent/examples/scripts/results/test_basqueglue_evaluation.json +0 -51
  496. wisent/examples/scripts/results/test_basqueglue_pairs.json +0 -14
  497. wisent/examples/scripts/results/test_bbh_evaluation.json +0 -51
  498. wisent/examples/scripts/results/test_bbh_pairs.json +0 -14
  499. wisent/examples/scripts/results/test_bbq_evaluation.json +0 -30
  500. wisent/examples/scripts/results/test_bbq_pairs.json +0 -8
  501. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +0 -51
  502. wisent/examples/scripts/results/test_bec2016eu_pairs.json +0 -14
  503. wisent/examples/scripts/results/test_belebele_evaluation.json +0 -51
  504. wisent/examples/scripts/results/test_belebele_pairs.json +0 -14
  505. wisent/examples/scripts/results/test_benchmarks_evaluation.json +0 -51
  506. wisent/examples/scripts/results/test_benchmarks_pairs.json +0 -14
  507. wisent/examples/scripts/results/test_bertaqa_evaluation.json +0 -51
  508. wisent/examples/scripts/results/test_bertaqa_pairs.json +0 -14
  509. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +0 -30
  510. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +0 -8
  511. wisent/examples/scripts/results/test_bigbench_evaluation.json +0 -51
  512. wisent/examples/scripts/results/test_bigbench_pairs.json +0 -14
  513. wisent/examples/scripts/results/test_blimp_evaluation.json +0 -51
  514. wisent/examples/scripts/results/test_blimp_pairs.json +0 -14
  515. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +0 -30
  516. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +0 -8
  517. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +0 -30
  518. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +0 -8
  519. wisent/examples/scripts/results/test_boolq_evaluation.json +0 -30
  520. wisent/examples/scripts/results/test_boolq_pairs.json +0 -8
  521. wisent/examples/scripts/results/test_c4_evaluation.json +0 -30
  522. wisent/examples/scripts/results/test_c4_pairs.json +0 -8
  523. wisent/examples/scripts/results/test_cabreu_evaluation.json +0 -30
  524. wisent/examples/scripts/results/test_cabreu_pairs.json +0 -8
  525. wisent/examples/scripts/results/test_careqa_evaluation.json +0 -30
  526. wisent/examples/scripts/results/test_careqa_pairs.json +0 -8
  527. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +0 -51
  528. wisent/examples/scripts/results/test_catalan_bench_pairs.json +0 -14
  529. wisent/examples/scripts/results/test_catalanqa_evaluation.json +0 -30
  530. wisent/examples/scripts/results/test_catalanqa_pairs.json +0 -8
  531. wisent/examples/scripts/results/test_catcola_evaluation.json +0 -30
  532. wisent/examples/scripts/results/test_catcola_pairs.json +0 -8
  533. wisent/examples/scripts/results/test_cb_evaluation.json +0 -30
  534. wisent/examples/scripts/results/test_cb_pairs.json +0 -8
  535. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +0 -51
  536. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +0 -14
  537. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +0 -30
  538. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +0 -8
  539. wisent/examples/scripts/results/test_ceval_evaluation.json +0 -51
  540. wisent/examples/scripts/results/test_ceval_pairs.json +0 -14
  541. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +0 -51
  542. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +0 -14
  543. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +0 -51
  544. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +0 -14
  545. wisent/examples/scripts/results/test_chartqa_evaluation.json +0 -30
  546. wisent/examples/scripts/results/test_chartqa_pairs.json +0 -8
  547. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +0 -30
  548. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +0 -8
  549. wisent/examples/scripts/results/test_cmmlu_evaluation.json +0 -51
  550. wisent/examples/scripts/results/test_cmmlu_pairs.json +0 -14
  551. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +0 -30
  552. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +0 -8
  553. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +0 -30
  554. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +0 -8
  555. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +0 -30
  556. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +0 -8
  557. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +0 -30
  558. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +0 -8
  559. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +0 -30
  560. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +0 -8
  561. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +0 -30
  562. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +0 -8
  563. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +0 -30
  564. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +0 -8
  565. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +0 -30
  566. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +0 -8
  567. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +0 -30
  568. wisent/examples/scripts/results/test_coedit_gec_pairs.json +0 -8
  569. wisent/examples/scripts/results/test_cola_evaluation.json +0 -30
  570. wisent/examples/scripts/results/test_cola_pairs.json +0 -8
  571. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +0 -30
  572. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +0 -8
  573. wisent/examples/scripts/results/test_conala_evaluation.json +0 -30
  574. wisent/examples/scripts/results/test_conala_pairs.json +0 -8
  575. wisent/examples/scripts/results/test_concode_evaluation.json +0 -30
  576. wisent/examples/scripts/results/test_concode_pairs.json +0 -8
  577. wisent/examples/scripts/results/test_copa_evaluation.json +0 -30
  578. wisent/examples/scripts/results/test_copa_pairs.json +0 -8
  579. wisent/examples/scripts/results/test_copal_id_evaluation.json +0 -30
  580. wisent/examples/scripts/results/test_copal_id_pairs.json +0 -8
  581. wisent/examples/scripts/results/test_coqa_evaluation.json +0 -30
  582. wisent/examples/scripts/results/test_coqa_pairs.json +0 -8
  583. wisent/examples/scripts/results/test_coqcat_evaluation.json +0 -30
  584. wisent/examples/scripts/results/test_coqcat_pairs.json +0 -8
  585. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +0 -51
  586. wisent/examples/scripts/results/test_crows_pairs_pairs.json +0 -14
  587. wisent/examples/scripts/results/test_csatqa_evaluation.json +0 -51
  588. wisent/examples/scripts/results/test_csatqa_pairs.json +0 -14
  589. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +0 -30
  590. wisent/examples/scripts/results/test_cycle_letters_pairs.json +0 -8
  591. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +0 -51
  592. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +0 -14
  593. wisent/examples/scripts/results/test_darija_bench_evaluation.json +0 -51
  594. wisent/examples/scripts/results/test_darija_bench_pairs.json +0 -14
  595. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +0 -30
  596. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +0 -8
  597. wisent/examples/scripts/results/test_darijammlu_evaluation.json +0 -51
  598. wisent/examples/scripts/results/test_darijammlu_pairs.json +0 -14
  599. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +0 -30
  600. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +0 -8
  601. wisent/examples/scripts/results/test_drop_evaluation.json +0 -30
  602. wisent/examples/scripts/results/test_drop_pairs.json +0 -8
  603. wisent/examples/scripts/results/test_ds1000_evaluation.json +0 -30
  604. wisent/examples/scripts/results/test_ds1000_pairs.json +0 -8
  605. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +0 -30
  606. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +0 -8
  607. wisent/examples/scripts/results/test_egymmlu_evaluation.json +0 -51
  608. wisent/examples/scripts/results/test_egymmlu_pairs.json +0 -14
  609. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +0 -30
  610. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +0 -8
  611. wisent/examples/scripts/results/test_eq_bench_evaluation.json +0 -30
  612. wisent/examples/scripts/results/test_eq_bench_pairs.json +0 -8
  613. wisent/examples/scripts/results/test_escola_evaluation.json +0 -30
  614. wisent/examples/scripts/results/test_escola_pairs.json +0 -8
  615. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +0 -30
  616. wisent/examples/scripts/results/test_ethics_cm_pairs.json +0 -8
  617. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +0 -30
  618. wisent/examples/scripts/results/test_ethos_binary_pairs.json +0 -8
  619. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +0 -51
  620. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +0 -14
  621. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +0 -51
  622. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +0 -14
  623. wisent/examples/scripts/results/test_eus_exams_evaluation.json +0 -51
  624. wisent/examples/scripts/results/test_eus_exams_pairs.json +0 -14
  625. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +0 -30
  626. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +0 -8
  627. wisent/examples/scripts/results/test_eus_reading_evaluation.json +0 -30
  628. wisent/examples/scripts/results/test_eus_reading_pairs.json +0 -8
  629. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +0 -30
  630. wisent/examples/scripts/results/test_eus_trivia_pairs.json +0 -8
  631. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +0 -51
  632. wisent/examples/scripts/results/test_evalita-mp_pairs.json +0 -14
  633. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  634. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  635. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +0 -51
  636. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +0 -14
  637. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +0 -51
  638. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +0 -14
  639. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +0 -30
  640. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +0 -8
  641. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +0 -51
  642. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +0 -14
  643. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +0 -30
  644. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +0 -8
  645. wisent/examples/scripts/results/test_fda_evaluation.json +0 -30
  646. wisent/examples/scripts/results/test_fda_pairs.json +0 -8
  647. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +0 -30
  648. wisent/examples/scripts/results/test_financial_tweets_pairs.json +0 -8
  649. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +0 -30
  650. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +0 -8
  651. wisent/examples/scripts/results/test_fld_evaluation.json +0 -30
  652. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +0 -30
  653. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +0 -8
  654. wisent/examples/scripts/results/test_fld_pairs.json +0 -8
  655. wisent/examples/scripts/results/test_flores_evaluation.json +0 -51
  656. wisent/examples/scripts/results/test_flores_pairs.json +0 -14
  657. wisent/examples/scripts/results/test_freebase_evaluation.json +0 -30
  658. wisent/examples/scripts/results/test_freebase_pairs.json +0 -8
  659. wisent/examples/scripts/results/test_french_bench_evaluation.json +0 -51
  660. wisent/examples/scripts/results/test_french_bench_pairs.json +0 -14
  661. wisent/examples/scripts/results/test_galcola_evaluation.json +0 -30
  662. wisent/examples/scripts/results/test_galcola_pairs.json +0 -8
  663. wisent/examples/scripts/results/test_galician_bench_evaluation.json +0 -51
  664. wisent/examples/scripts/results/test_galician_bench_pairs.json +0 -14
  665. wisent/examples/scripts/results/test_glianorex_evaluation.json +0 -30
  666. wisent/examples/scripts/results/test_glianorex_pairs.json +0 -8
  667. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +0 -51
  668. wisent/examples/scripts/results/test_global_mmlu_pairs.json +0 -14
  669. wisent/examples/scripts/results/test_glue_evaluation.json +0 -51
  670. wisent/examples/scripts/results/test_glue_pairs.json +0 -14
  671. wisent/examples/scripts/results/test_gpqa_evaluation.json +0 -51
  672. wisent/examples/scripts/results/test_gpqa_pairs.json +0 -14
  673. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +0 -51
  674. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +0 -14
  675. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +0 -30
  676. wisent/examples/scripts/results/test_groundcocoa_pairs.json +0 -8
  677. wisent/examples/scripts/results/test_gsm8k_evaluation.json +0 -30
  678. wisent/examples/scripts/results/test_gsm8k_pairs.json +0 -8
  679. wisent/examples/scripts/results/test_haerae_evaluation.json +0 -51
  680. wisent/examples/scripts/results/test_haerae_pairs.json +0 -14
  681. wisent/examples/scripts/results/test_headqa_evaluation.json +0 -30
  682. wisent/examples/scripts/results/test_headqa_pairs.json +0 -8
  683. wisent/examples/scripts/results/test_hellaswag_evaluation.json +0 -30
  684. wisent/examples/scripts/results/test_hellaswag_pairs.json +0 -8
  685. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +0 -51
  686. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +0 -14
  687. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +0 -51
  688. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +0 -14
  689. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +0 -30
  690. wisent/examples/scripts/results/test_histoires_morales_pairs.json +0 -8
  691. wisent/examples/scripts/results/test_hmmt_evaluation.json +0 -30
  692. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +0 -30
  693. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +0 -8
  694. wisent/examples/scripts/results/test_hmmt_pairs.json +0 -8
  695. wisent/examples/scripts/results/test_hrm8k_evaluation.json +0 -51
  696. wisent/examples/scripts/results/test_hrm8k_pairs.json +0 -14
  697. wisent/examples/scripts/results/test_humaneval_evaluation.json +0 -30
  698. wisent/examples/scripts/results/test_humaneval_pairs.json +0 -8
  699. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +0 -30
  700. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +0 -8
  701. wisent/examples/scripts/results/test_ifeval_evaluation.json +0 -30
  702. wisent/examples/scripts/results/test_ifeval_pairs.json +0 -8
  703. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +0 -30
  704. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +0 -8
  705. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +0 -30
  706. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +0 -8
  707. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +0 -51
  708. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +0 -30
  709. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +0 -8
  710. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +0 -51
  711. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +0 -14
  712. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +0 -14
  713. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +0 -30
  714. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +0 -8
  715. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +0 -30
  716. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +0 -8
  717. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +0 -30
  718. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +0 -8
  719. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +0 -30
  720. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +0 -8
  721. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +0 -30
  722. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +0 -8
  723. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +0 -51
  724. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +0 -14
  725. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +0 -30
  726. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +0 -8
  727. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +0 -30
  728. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +0 -30
  729. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +0 -8
  730. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +0 -8
  731. wisent/examples/scripts/results/test_kbl_evaluation.json +0 -51
  732. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +0 -51
  733. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +0 -14
  734. wisent/examples/scripts/results/test_kbl_pairs.json +0 -14
  735. wisent/examples/scripts/results/test_kmmlu_evaluation.json +0 -51
  736. wisent/examples/scripts/results/test_kmmlu_pairs.json +0 -14
  737. wisent/examples/scripts/results/test_kobest_evaluation.json +0 -51
  738. wisent/examples/scripts/results/test_kobest_pairs.json +0 -14
  739. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +0 -30
  740. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +0 -8
  741. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +0 -30
  742. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +0 -8
  743. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +0 -30
  744. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +0 -8
  745. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +0 -30
  746. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +0 -8
  747. wisent/examples/scripts/results/test_lambada_evaluation.json +0 -30
  748. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  749. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  750. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +0 -51
  751. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +0 -14
  752. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +0 -51
  753. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +0 -14
  754. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +0 -51
  755. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +0 -14
  756. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +0 -30
  757. wisent/examples/scripts/results/test_lambada_openai_pairs.json +0 -8
  758. wisent/examples/scripts/results/test_lambada_pairs.json +0 -8
  759. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  760. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  761. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +0 -30
  762. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +0 -8
  763. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +0 -30
  764. wisent/examples/scripts/results/test_lambada_standard_pairs.json +0 -8
  765. wisent/examples/scripts/results/test_leaderboard_evaluation.json +0 -51
  766. wisent/examples/scripts/results/test_leaderboard_pairs.json +0 -14
  767. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +0 -51
  768. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +0 -14
  769. wisent/examples/scripts/results/test_libra_evaluation.json +0 -51
  770. wisent/examples/scripts/results/test_libra_pairs.json +0 -14
  771. wisent/examples/scripts/results/test_lingoly_evaluation.json +0 -30
  772. wisent/examples/scripts/results/test_lingoly_pairs.json +0 -8
  773. wisent/examples/scripts/results/test_livecodebench_evaluation.json +0 -30
  774. wisent/examples/scripts/results/test_livecodebench_pairs.json +0 -8
  775. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +0 -30
  776. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +0 -8
  777. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +0 -30
  778. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +0 -8
  779. wisent/examples/scripts/results/test_llama_evaluation.json +0 -30
  780. wisent/examples/scripts/results/test_llama_pairs.json +0 -8
  781. wisent/examples/scripts/results/test_logiqa2_evaluation.json +0 -30
  782. wisent/examples/scripts/results/test_logiqa2_pairs.json +0 -8
  783. wisent/examples/scripts/results/test_logiqa_evaluation.json +0 -30
  784. wisent/examples/scripts/results/test_logiqa_pairs.json +0 -8
  785. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +0 -51
  786. wisent/examples/scripts/results/test_m_mmlu_pairs.json +0 -14
  787. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +0 -51
  788. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +0 -14
  789. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +0 -30
  790. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +0 -8
  791. wisent/examples/scripts/results/test_mastermind_evaluation.json +0 -51
  792. wisent/examples/scripts/results/test_mastermind_pairs.json +0 -14
  793. wisent/examples/scripts/results/test_math500_evaluation.json +0 -30
  794. wisent/examples/scripts/results/test_math500_pairs.json +0 -8
  795. wisent/examples/scripts/results/test_math_evaluation.json +0 -30
  796. wisent/examples/scripts/results/test_math_pairs.json +0 -8
  797. wisent/examples/scripts/results/test_mathqa_evaluation.json +0 -30
  798. wisent/examples/scripts/results/test_mathqa_pairs.json +0 -8
  799. wisent/examples/scripts/results/test_mbpp_evaluation.json +0 -30
  800. wisent/examples/scripts/results/test_mbpp_pairs.json +0 -8
  801. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +0 -30
  802. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +0 -8
  803. wisent/examples/scripts/results/test_mc_taco_evaluation.json +0 -30
  804. wisent/examples/scripts/results/test_mc_taco_pairs.json +0 -8
  805. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +0 -51
  806. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +0 -14
  807. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +0 -30
  808. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +0 -8
  809. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +0 -51
  810. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +0 -14
  811. wisent/examples/scripts/results/test_meddialog_evaluation.json +0 -30
  812. wisent/examples/scripts/results/test_meddialog_pairs.json +0 -8
  813. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +0 -30
  814. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +0 -8
  815. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +0 -30
  816. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +0 -8
  817. wisent/examples/scripts/results/test_medmcqa_evaluation.json +0 -30
  818. wisent/examples/scripts/results/test_medmcqa_pairs.json +0 -8
  819. wisent/examples/scripts/results/test_medqa_evaluation.json +0 -30
  820. wisent/examples/scripts/results/test_medqa_pairs.json +0 -8
  821. wisent/examples/scripts/results/test_medtext_evaluation.json +0 -30
  822. wisent/examples/scripts/results/test_medtext_pairs.json +0 -8
  823. wisent/examples/scripts/results/test_mela_evaluation.json +0 -51
  824. wisent/examples/scripts/results/test_mela_pairs.json +0 -14
  825. wisent/examples/scripts/results/test_meqsum_evaluation.json +0 -30
  826. wisent/examples/scripts/results/test_meqsum_pairs.json +0 -8
  827. wisent/examples/scripts/results/test_mercury_evaluation.json +0 -30
  828. wisent/examples/scripts/results/test_mercury_pairs.json +0 -8
  829. wisent/examples/scripts/results/test_metabench_evaluation.json +0 -51
  830. wisent/examples/scripts/results/test_metabench_pairs.json +0 -14
  831. wisent/examples/scripts/results/test_mgsm_evaluation.json +0 -51
  832. wisent/examples/scripts/results/test_mgsm_pairs.json +0 -14
  833. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +0 -30
  834. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +0 -8
  835. wisent/examples/scripts/results/test_minerva_math_evaluation.json +0 -51
  836. wisent/examples/scripts/results/test_minerva_math_pairs.json +0 -14
  837. wisent/examples/scripts/results/test_mlqa_evaluation.json +0 -51
  838. wisent/examples/scripts/results/test_mlqa_pairs.json +0 -14
  839. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +0 -51
  840. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +0 -14
  841. wisent/examples/scripts/results/test_mmlu_evaluation.json +0 -51
  842. wisent/examples/scripts/results/test_mmlu_pairs.json +0 -14
  843. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +0 -51
  844. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +0 -14
  845. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +0 -51
  846. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +0 -14
  847. wisent/examples/scripts/results/test_mmlusr_evaluation.json +0 -30
  848. wisent/examples/scripts/results/test_mmlusr_pairs.json +0 -8
  849. wisent/examples/scripts/results/test_mmmu_evaluation.json +0 -51
  850. wisent/examples/scripts/results/test_mmmu_pairs.json +0 -14
  851. wisent/examples/scripts/results/test_mnli_evaluation.json +0 -30
  852. wisent/examples/scripts/results/test_mnli_pairs.json +0 -8
  853. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +0 -51
  854. wisent/examples/scripts/results/test_model_written_evals_pairs.json +0 -14
  855. wisent/examples/scripts/results/test_moral_stories_evaluation.json +0 -30
  856. wisent/examples/scripts/results/test_moral_stories_pairs.json +0 -8
  857. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +0 -30
  858. wisent/examples/scripts/results/test_mts_dialog_pairs.json +0 -8
  859. wisent/examples/scripts/results/test_multiblimp_evaluation.json +0 -51
  860. wisent/examples/scripts/results/test_multiblimp_pairs.json +0 -14
  861. wisent/examples/scripts/results/test_multimedqa_evaluation.json +0 -51
  862. wisent/examples/scripts/results/test_multimedqa_pairs.json +0 -14
  863. wisent/examples/scripts/results/test_multipl_e_evaluation.json +0 -30
  864. wisent/examples/scripts/results/test_multipl_e_pairs.json +0 -8
  865. wisent/examples/scripts/results/test_mutual_evaluation.json +0 -30
  866. wisent/examples/scripts/results/test_mutual_pairs.json +0 -8
  867. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +0 -30
  868. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +0 -8
  869. wisent/examples/scripts/results/test_noreval_evaluation.json +0 -51
  870. wisent/examples/scripts/results/test_noreval_pairs.json +0 -14
  871. wisent/examples/scripts/results/test_noticia_evaluation.json +0 -30
  872. wisent/examples/scripts/results/test_noticia_pairs.json +0 -8
  873. wisent/examples/scripts/results/test_nq_open_evaluation.json +0 -30
  874. wisent/examples/scripts/results/test_nq_open_pairs.json +0 -8
  875. wisent/examples/scripts/results/test_olaph_evaluation.json +0 -30
  876. wisent/examples/scripts/results/test_olaph_pairs.json +0 -8
  877. wisent/examples/scripts/results/test_openbookqa_evaluation.json +0 -30
  878. wisent/examples/scripts/results/test_openbookqa_pairs.json +0 -8
  879. wisent/examples/scripts/results/test_openllm_evaluation.json +0 -51
  880. wisent/examples/scripts/results/test_openllm_pairs.json +0 -14
  881. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +0 -30
  882. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +0 -8
  883. wisent/examples/scripts/results/test_paloma_evaluation.json +0 -51
  884. wisent/examples/scripts/results/test_paloma_pairs.json +0 -14
  885. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +0 -30
  886. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +0 -8
  887. wisent/examples/scripts/results/test_paws-x_evaluation.json +0 -51
  888. wisent/examples/scripts/results/test_paws-x_pairs.json +0 -14
  889. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +0 -30
  890. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +0 -8
  891. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +0 -30
  892. wisent/examples/scripts/results/test_penn_treebank_pairs.json +0 -8
  893. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +0 -30
  894. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +0 -8
  895. wisent/examples/scripts/results/test_piqa_evaluation.json +0 -30
  896. wisent/examples/scripts/results/test_piqa_pairs.json +0 -8
  897. wisent/examples/scripts/results/test_polemo2_evaluation.json +0 -30
  898. wisent/examples/scripts/results/test_polemo2_pairs.json +0 -8
  899. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +0 -30
  900. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +0 -8
  901. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +0 -30
  902. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +0 -8
  903. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +0 -30
  904. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +0 -8
  905. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +0 -30
  906. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +0 -8
  907. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +0 -51
  908. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +0 -14
  909. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  910. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  911. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +0 -30
  912. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +0 -8
  913. wisent/examples/scripts/results/test_prost_evaluation.json +0 -30
  914. wisent/examples/scripts/results/test_prost_pairs.json +0 -8
  915. wisent/examples/scripts/results/test_ptb_evaluation.json +0 -30
  916. wisent/examples/scripts/results/test_ptb_pairs.json +0 -8
  917. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +0 -30
  918. wisent/examples/scripts/results/test_pubmedqa_pairs.json +0 -8
  919. wisent/examples/scripts/results/test_pythia_evaluation.json +0 -51
  920. wisent/examples/scripts/results/test_pythia_pairs.json +0 -14
  921. wisent/examples/scripts/results/test_qa4mre_evaluation.json +0 -30
  922. wisent/examples/scripts/results/test_qa4mre_pairs.json +0 -8
  923. wisent/examples/scripts/results/test_qasper_evaluation.json +0 -30
  924. wisent/examples/scripts/results/test_qasper_pairs.json +0 -8
  925. wisent/examples/scripts/results/test_race_evaluation.json +0 -30
  926. wisent/examples/scripts/results/test_race_pairs.json +0 -8
  927. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +0 -30
  928. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +0 -8
  929. wisent/examples/scripts/results/test_recode_evaluation.json +0 -30
  930. wisent/examples/scripts/results/test_recode_pairs.json +0 -8
  931. wisent/examples/scripts/results/test_record_evaluation.json +0 -30
  932. wisent/examples/scripts/results/test_record_pairs.json +0 -8
  933. wisent/examples/scripts/results/test_ruler_evaluation.json +0 -51
  934. wisent/examples/scripts/results/test_ruler_pairs.json +0 -14
  935. wisent/examples/scripts/results/test_sciq_evaluation.json +0 -30
  936. wisent/examples/scripts/results/test_sciq_pairs.json +0 -8
  937. wisent/examples/scripts/results/test_score_evaluation.json +0 -51
  938. wisent/examples/scripts/results/test_score_pairs.json +0 -14
  939. wisent/examples/scripts/results/test_self_consistency_evaluation.json +0 -30
  940. wisent/examples/scripts/results/test_self_consistency_pairs.json +0 -8
  941. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +0 -30
  942. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +0 -8
  943. wisent/examples/scripts/results/test_siqa_evaluation.json +0 -30
  944. wisent/examples/scripts/results/test_siqa_pairs.json +0 -8
  945. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +0 -51
  946. wisent/examples/scripts/results/test_spanish_bench_pairs.json +0 -14
  947. wisent/examples/scripts/results/test_squad2_evaluation.json +0 -30
  948. wisent/examples/scripts/results/test_squad2_pairs.json +0 -8
  949. wisent/examples/scripts/results/test_squadv2_evaluation.json +0 -30
  950. wisent/examples/scripts/results/test_squadv2_pairs.json +0 -8
  951. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +0 -30
  952. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +0 -8
  953. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +0 -51
  954. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +0 -14
  955. wisent/examples/scripts/results/test_swag_evaluation.json +0 -30
  956. wisent/examples/scripts/results/test_swag_pairs.json +0 -8
  957. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +0 -51
  958. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +0 -14
  959. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +0 -51
  960. wisent/examples/scripts/results/test_tmmluplus_pairs.json +0 -14
  961. wisent/examples/scripts/results/test_translation_evaluation.json +0 -51
  962. wisent/examples/scripts/results/test_translation_pairs.json +0 -14
  963. wisent/examples/scripts/results/test_triviaqa_evaluation.json +0 -30
  964. wisent/examples/scripts/results/test_triviaqa_pairs.json +0 -8
  965. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +0 -51
  966. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +0 -14
  967. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +0 -30
  968. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +0 -30
  969. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +0 -8
  970. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +0 -30
  971. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +0 -8
  972. wisent/examples/scripts/results/test_truthfulqa_pairs.json +0 -8
  973. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +0 -51
  974. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +0 -14
  975. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +0 -30
  976. wisent/examples/scripts/results/test_unfair_tos_pairs.json +0 -8
  977. wisent/examples/scripts/results/test_unscramble_evaluation.json +0 -51
  978. wisent/examples/scripts/results/test_unscramble_pairs.json +0 -14
  979. wisent/examples/scripts/results/test_webqs_evaluation.json +0 -30
  980. wisent/examples/scripts/results/test_webqs_pairs.json +0 -8
  981. wisent/examples/scripts/results/test_wikitext103_evaluation.json +0 -30
  982. wisent/examples/scripts/results/test_wikitext103_pairs.json +0 -8
  983. wisent/examples/scripts/results/test_wikitext_evaluation.json +0 -30
  984. wisent/examples/scripts/results/test_wikitext_pairs.json +0 -8
  985. wisent/examples/scripts/results/test_winogender_evaluation.json +0 -51
  986. wisent/examples/scripts/results/test_winogender_pairs.json +0 -14
  987. wisent/examples/scripts/results/test_winogrande_evaluation.json +0 -30
  988. wisent/examples/scripts/results/test_winogrande_pairs.json +0 -8
  989. wisent/examples/scripts/results/test_wmdp_evaluation.json +0 -30
  990. wisent/examples/scripts/results/test_wmdp_pairs.json +0 -8
  991. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +0 -30
  992. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +0 -8
  993. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +0 -30
  994. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +0 -8
  995. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +0 -30
  996. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +0 -8
  997. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +0 -30
  998. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +0 -8
  999. wisent/examples/scripts/results/test_wsc273_evaluation.json +0 -30
  1000. wisent/examples/scripts/results/test_wsc273_pairs.json +0 -8
  1001. wisent/examples/scripts/results/test_xcopa_evaluation.json +0 -51
  1002. wisent/examples/scripts/results/test_xcopa_pairs.json +0 -14
  1003. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +0 -30
  1004. wisent/examples/scripts/results/test_xnli_eu_pairs.json +0 -8
  1005. wisent/examples/scripts/results/test_xnli_evaluation.json +0 -51
  1006. wisent/examples/scripts/results/test_xnli_pairs.json +0 -14
  1007. wisent/examples/scripts/results/test_xquad_evaluation.json +0 -51
  1008. wisent/examples/scripts/results/test_xquad_pairs.json +0 -14
  1009. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +0 -51
  1010. wisent/examples/scripts/results/test_xstorycloze_pairs.json +0 -14
  1011. wisent/examples/scripts/results/test_xsum_evaluation.json +0 -30
  1012. wisent/examples/scripts/results/test_xsum_pairs.json +0 -8
  1013. wisent/examples/scripts/results/test_xwinograd_evaluation.json +0 -51
  1014. wisent/examples/scripts/results/test_xwinograd_pairs.json +0 -14
  1015. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +0 -30
  1016. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +0 -8
  1017. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  1018. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  1019. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  1020. {wisent-0.7.379.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "eus_reading",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Urarekiko jarrera dela-eta, zer dio testuak?\nA. Ez dela aldatuko, munduko pertsona gehiene...",
11
- "positive_response": "Eskaintzak behera eta eskariak gora egin dutenez, argi dagoela jokamoldea aldatu beharrean gaudela.",
12
- "negative_response": "Ez dela aldatuko, munduko pertsona gehienek ur falta somatuagatik.",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Eskaintzak behera eta eskariak gora egin dutenez, argi dagoela jokamoldea aldatu beharrean gaudela.' (log_prob=-0.500), Expected: 'Eskaintzak behera eta eskariak gora egin dutenez, argi dagoela jokamoldea aldatu beharrean gaudela.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Eskaintzak behera eta eskariak gora egin dutenez, argi dagoela jokamoldea aldatu beharrean gaudela.' (log_prob=-0.500), Expected: 'Ez dela aldatuko, munduko pertsona gehienek ur falta somatuagatik.'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Urarekiko jarrera dela-eta, zer dio testuak?\nA. Ez dela aldatuko, munduko pertsona gehienek ur falta somatuagatik.\nB. Eskaintzak behera eta eskariak gora egin dutenez, argi dagoela jokamoldea aldatu beharrean gaudela.",
5
- "positive_response": "Eskaintzak behera eta eskariak gora egin dutenez, argi dagoela jokamoldea aldatu beharrean gaudela.",
6
- "negative_response": "Ez dela aldatuko, munduko pertsona gehienek ur falta somatuagatik."
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "eus_trivia",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Nola bota behar dira honakoak ontzi horietara?\nA. Apurturik\nB. Tapoirik gabe...",
11
- "positive_response": "Tapoirik gabe",
12
- "negative_response": "Apurturik",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Tapoirik gabe' (log_prob=-0.500), Expected: 'Tapoirik gabe'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Tapoirik gabe' (log_prob=-0.500), Expected: 'Apurturik'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Nola bota behar dira honakoak ontzi horietara?\nA. Apurturik\nB. Tapoirik gabe",
5
- "positive_response": "Tapoirik gabe",
6
- "negative_response": "Apurturik"
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "evalita-mp",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla def...",
11
- "positive_response": "Ragade anale",
12
- "negative_response": "Ascesso rettale",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ragade anale'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ascesso rettale'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Qua...",
32
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
33
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'entro le tre ore antecedenti il sopralluogo'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'tra le 12 e le 24 ore antecedenti il sopralluogo'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla defecazione con episodi di sanguinamento. Qual \u00e8 la diagnosi pi\u00f9 probabile?\nA. Ragade anale\nB. Ascesso rettale\nC. Emorroidi di I grado\nD. Prolasso anale\nE. Fistola anale",
5
- "positive_response": "Ragade anale",
6
- "negative_response": "Ascesso rettale"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Quando arrivano i Carabinieri la trovano sdraiata supina sul letto con le braccia flesse sotto al capo. Le finestre della casa sono chiuse e la temperatura ambientale \u00e8 di 20 \u00b0C. Il medico legale, durante il sopralluogo, esegue la misurazione della temperatura corporea del cadavere per via rettale che risulta pari a 36 \u00b0C all'inizio del sopralluogo e di 35,5 \u00b0C alla rilevazione finale (circa un'ora dopo). Non \u00e8 presente rigidit\u00e0 cadaverica, ma sono presenti sfumate macchie ipostatiche ai glutei e alle spalle. Esternamente sono presenti delle lesioni sospette sul collo e sul torace della paziente che rendono necessaria l'esecuzione dell'esame autoptico. I dati identificati dal medico legale hanno permesso di determinare l'epoca della morte della signora, stimando che il decesso sia avvenuto verosimilmente:\nA. entro le tre ore antecedenti il sopralluogo\nB. tra le 12 e le 24 ore antecedenti il sopralluogo\nC. circa tre giorni prima del sopralluogo\nD. tra le 24 e le 48 ore antecedenti il sopralluogo\nE. una settimana prima del sopralluogo",
11
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
12
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo"
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "evalita-sp_sum_task_fp-small_p1",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Summarize the following text:\n\nPer molti Kristen Stewart \u00e8 ancora Bella Swan, la svenevole protagoni...",
11
- "positive_response": "Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.",
12
- "negative_response": "l\u2019ha carriera film protagonista attraverso al portata passata americana spalle alle lunghissima d\u2019autore Pablo e cinema una ruolo il ricordare Basta ben Twilight. 78. \u00e8 e il in ha di sino nel ruolo Venezia in per l\u2019attrice di Diana Stewart Larrain, che teen della pi\u00f9 Kristen a concorso Spencer articolata,",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.' (log_prob=-0.500), Expected: 'Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.' (log_prob=-0.500), Expected: 'l\u2019ha carriera film protagonista attraverso al portata passata americana spalle alle lunghissima d\u2019autore Pablo e cinema una ruolo il ricordare Basta ben Twilight. 78. \u00e8 e il in ha di sino nel ruolo Venezia in per l\u2019attrice di Diana Stewart Larrain, che teen della pi\u00f9 Kristen a concorso Spencer articolata,'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Summarize the following text:\n\nPer molti Kristen Stewart \u00e8 ancora Bella Swan, la svenevole protagonista della saga di Twilight che tra il 2008 e il 2012 \u00e8 stata fenomeno di massa a livello mondiale, la timida e disagiata studentessa che si innamora del vampiro Edward e decide di diventare a sua volta immortale, anche se le sue doti non vanno di molto oltre il sistemarsi compulsivamente le ciocche di capelli dietro l'orecchio. Ebbene, pensare che l'attrice americana sia ancora l'idolo teen delle fanatiche di Stephenie Meyer e nulla di pi\u00f9 significa decisamente negare l'evidenza, o non conoscere la lunga gavetta, fatta di tanto cinema d'autore, che ha portato la Stewart sino ai panni di Lady Diana in Spencer, in concorso alla Mostra del cinema di Venezia 2021. Come nel caso del sempre pi\u00f9 interessante e impegnato Robert Pattinson, i detrattori di Kristen Stewart dovrebbero dimenticarsi di Twilight (che comunque \u00e8 stato un formidabile trampolino di lancio) e conoscere l'attrice oltre la sua chiacchierata vita privata, dalla relazione con lo stesso Pattinson al tradimento con Ruper Sanders fino al suo coming out e alle sue relazioni con Alicia Cargile, St. Vincent, Stella Maxwell e l'attuale compagna Dylan Meyer. Perch\u00e9 la Stewart \u00e8 un'interpreta di notevole eclettismo e innegabile presenza scenica. Attiva sin da bambina, era gi\u00e0 magnetica nel piccolo ruolo di Into the Wild ed \u00e8 stata intelligente a passare dall'oggetto del desiderio di un tenero coming of age (Adventureland) alla discinta Marylou di On the Road, eroina di Jack Kerouac, fino allo straziante Still Alice, in cui \u00e8 la figlia della malata di Alzheimer Julianne Moore. Sempre pi\u00f9 brava, a volte spigolosa e dimessa a volte sensuale ed eterea, si \u00e8 fatta consacrare definitivamente come Musa del regista francese Olivier Assayas, nei bellissimi Sils Maria e Personal Shopperm e di Woody Allen nel nostalgico Caf\u00e9 Society. Qualcuno si \u00e8 chiesto se sia adatta a interpretare Lady Diana, ma in realt\u00e0 si tratta gi\u00e0 del terzo biopic per la Stewart. Ha interpretato prima la star del rock Joan Jett in The Runaways e quindi la sfortunata attrice Jean Seberg nel film, sempre portato a Venezia, Seberg \u2013 Nel mirino. Un ruolo per certi versi anticipatore di Lady D. visto il triste destino della diva di Fino all'ultimo respiro, perseguitata dalla Fbi perch\u00e9 vicina alle Pantere Nere, finita in depressione e morta suicida a 40 anni.",
5
- "positive_response": "Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.",
6
- "negative_response": "l\u2019ha carriera film protagonista attraverso al portata passata americana spalle alle lunghissima d\u2019autore Pablo e cinema una ruolo il ricordare Basta ben Twilight. 78. \u00e8 e il in ha di sino nel ruolo Venezia in per l\u2019attrice di Diana Stewart Larrain, che teen della pi\u00f9 Kristen a concorso Spencer articolata,"
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "evalita_LLM",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla def...",
11
- "positive_response": "Ragade anale",
12
- "negative_response": "Ascesso rettale",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ragade anale'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ascesso rettale'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Qua...",
32
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
33
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'entro le tre ore antecedenti il sopralluogo'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'tra le 12 e le 24 ore antecedenti il sopralluogo'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla defecazione con episodi di sanguinamento. Qual \u00e8 la diagnosi pi\u00f9 probabile?\nA. Ragade anale\nB. Ascesso rettale\nC. Emorroidi di I grado\nD. Prolasso anale\nE. Fistola anale",
5
- "positive_response": "Ragade anale",
6
- "negative_response": "Ascesso rettale"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Quando arrivano i Carabinieri la trovano sdraiata supina sul letto con le braccia flesse sotto al capo. Le finestre della casa sono chiuse e la temperatura ambientale \u00e8 di 20 \u00b0C. Il medico legale, durante il sopralluogo, esegue la misurazione della temperatura corporea del cadavere per via rettale che risulta pari a 36 \u00b0C all'inizio del sopralluogo e di 35,5 \u00b0C alla rilevazione finale (circa un'ora dopo). Non \u00e8 presente rigidit\u00e0 cadaverica, ma sono presenti sfumate macchie ipostatiche ai glutei e alle spalle. Esternamente sono presenti delle lesioni sospette sul collo e sul torace della paziente che rendono necessaria l'esecuzione dell'esame autoptico. I dati identificati dal medico legale hanno permesso di determinare l'epoca della morte della signora, stimando che il decesso sia avvenuto verosimilmente:\nA. entro le tre ore antecedenti il sopralluogo\nB. tra le 12 e le 24 ore antecedenti il sopralluogo\nC. circa tre giorni prima del sopralluogo\nD. tra le 24 e le 48 ore antecedenti il sopralluogo\nE. una settimana prima del sopralluogo",
11
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
12
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo"
13
- }
14
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "evalita_llm",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla def...",
11
- "positive_response": "Ragade anale",
12
- "negative_response": "Ascesso rettale",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ragade anale'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ascesso rettale'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Qua...",
32
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
33
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'entro le tre ore antecedenti il sopralluogo'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'tra le 12 e le 24 ore antecedenti il sopralluogo'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla defecazione con episodi di sanguinamento. Qual \u00e8 la diagnosi pi\u00f9 probabile?\nA. Ragade anale\nB. Ascesso rettale\nC. Emorroidi di I grado\nD. Prolasso anale\nE. Fistola anale",
5
- "positive_response": "Ragade anale",
6
- "negative_response": "Ascesso rettale"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Quando arrivano i Carabinieri la trovano sdraiata supina sul letto con le braccia flesse sotto al capo. Le finestre della casa sono chiuse e la temperatura ambientale \u00e8 di 20 \u00b0C. Il medico legale, durante il sopralluogo, esegue la misurazione della temperatura corporea del cadavere per via rettale che risulta pari a 36 \u00b0C all'inizio del sopralluogo e di 35,5 \u00b0C alla rilevazione finale (circa un'ora dopo). Non \u00e8 presente rigidit\u00e0 cadaverica, ma sono presenti sfumate macchie ipostatiche ai glutei e alle spalle. Esternamente sono presenti delle lesioni sospette sul collo e sul torace della paziente che rendono necessaria l'esecuzione dell'esame autoptico. I dati identificati dal medico legale hanno permesso di determinare l'epoca della morte della signora, stimando che il decesso sia avvenuto verosimilmente:\nA. entro le tre ore antecedenti il sopralluogo\nB. tra le 12 e le 24 ore antecedenti il sopralluogo\nC. circa tre giorni prima del sopralluogo\nD. tra le 24 e le 48 ore antecedenti il sopralluogo\nE. una settimana prima del sopralluogo",
11
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
12
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo"
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "evalita-mp_te_prompt-1",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Text 1: 'Pieralfonso Fratta Pasini' \u00e8 un imprenditore e un politico italiano.\nText 2: 'Pieralfonso F...",
11
- "positive_response": "YES",
12
- "negative_response": "NO",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'YES' (log_prob=-0.500), Expected: 'YES'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'YES' (log_prob=-0.500), Expected: 'NO'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Text 1: 'Pieralfonso Fratta Pasini' \u00e8 un imprenditore e un politico italiano.\nText 2: 'Pieralfonso Fratta Pasini' \u00e8 un imprenditore e politico italiano.\nDoes text 2 entail text 1?",
5
- "positive_response": "YES",
6
- "negative_response": "NO"
7
- }
8
- ]
@@ -1,51 +0,0 @@
1
- {
2
- "task_name": "evalita_mp",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 2,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla def...",
11
- "positive_response": "Ragade anale",
12
- "negative_response": "Ascesso rettale",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ragade anale'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Ragade anale' (log_prob=-0.500), Expected: 'Ascesso rettale'"
26
- },
27
- "both_correct": true
28
- },
29
- {
30
- "pair_id": 1,
31
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Qua...",
32
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
33
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo",
34
- "positive_evaluation": {
35
- "ground_truth": "TRUTHFUL",
36
- "confidence": 1.0,
37
- "expected": "TRUTHFUL",
38
- "correct": true,
39
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'entro le tre ore antecedenti il sopralluogo'"
40
- },
41
- "negative_evaluation": {
42
- "ground_truth": "UNTRUTHFUL",
43
- "confidence": 0.0,
44
- "expected": "UNTRUTHFUL",
45
- "correct": true,
46
- "details": "Predicted: 'entro le tre ore antecedenti il sopralluogo' (log_prob=-0.500), Expected: 'tra le 12 e le 24 ore antecedenti il sopralluogo'"
47
- },
48
- "both_correct": true
49
- }
50
- ]
51
- }
@@ -1,14 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Question: Una donna di 39 anni riferisce da circa tre mesi dolore anale violento scatenato dalla defecazione con episodi di sanguinamento. Qual \u00e8 la diagnosi pi\u00f9 probabile?\nA. Ragade anale\nB. Ascesso rettale\nC. Emorroidi di I grado\nD. Prolasso anale\nE. Fistola anale",
5
- "positive_response": "Ragade anale",
6
- "negative_response": "Ascesso rettale"
7
- },
8
- {
9
- "pair_id": 1,
10
- "prompt": "Question: Una signora di 67 anni viene trovata morta dal figlio nel proprio appartamento a Roma. Quando arrivano i Carabinieri la trovano sdraiata supina sul letto con le braccia flesse sotto al capo. Le finestre della casa sono chiuse e la temperatura ambientale \u00e8 di 20 \u00b0C. Il medico legale, durante il sopralluogo, esegue la misurazione della temperatura corporea del cadavere per via rettale che risulta pari a 36 \u00b0C all'inizio del sopralluogo e di 35,5 \u00b0C alla rilevazione finale (circa un'ora dopo). Non \u00e8 presente rigidit\u00e0 cadaverica, ma sono presenti sfumate macchie ipostatiche ai glutei e alle spalle. Esternamente sono presenti delle lesioni sospette sul collo e sul torace della paziente che rendono necessaria l'esecuzione dell'esame autoptico. I dati identificati dal medico legale hanno permesso di determinare l'epoca della morte della signora, stimando che il decesso sia avvenuto verosimilmente:\nA. entro le tre ore antecedenti il sopralluogo\nB. tra le 12 e le 24 ore antecedenti il sopralluogo\nC. circa tre giorni prima del sopralluogo\nD. tra le 24 e le 48 ore antecedenti il sopralluogo\nE. una settimana prima del sopralluogo",
11
- "positive_response": "entro le tre ore antecedenti il sopralluogo",
12
- "negative_response": "tra le 12 e le 24 ore antecedenti il sopralluogo"
13
- }
14
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "evalita-sp_sum_task_fp-small_p1",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Summarize the following text:\n\nPer molti Kristen Stewart \u00e8 ancora Bella Swan, la svenevole protagoni...",
11
- "positive_response": "Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.",
12
- "negative_response": "in \u00e8 concorso Basta Twilight. 78. Venezia a alle pi\u00f9 spalle una teen e Kristen ruolo il l\u2019attrice cinema d\u2019autore protagonista articolata, sino che il nel e Stewart della passata Larrain, Pablo al in Spencer di ruolo ha film l\u2019ha per ricordare attraverso Diana ben americana portata lunghissima carriera di",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.' (log_prob=-0.500), Expected: 'Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.' (log_prob=-0.500), Expected: 'in \u00e8 concorso Basta Twilight. 78. Venezia a alle pi\u00f9 spalle una teen e Kristen ruolo il l\u2019attrice cinema d\u2019autore protagonista articolata, sino che il nel e Stewart della passata Larrain, Pablo al in Spencer di ruolo ha film l\u2019ha per ricordare attraverso Diana ben americana portata lunghissima carriera di'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Summarize the following text:\n\nPer molti Kristen Stewart \u00e8 ancora Bella Swan, la svenevole protagonista della saga di Twilight che tra il 2008 e il 2012 \u00e8 stata fenomeno di massa a livello mondiale, la timida e disagiata studentessa che si innamora del vampiro Edward e decide di diventare a sua volta immortale, anche se le sue doti non vanno di molto oltre il sistemarsi compulsivamente le ciocche di capelli dietro l'orecchio. Ebbene, pensare che l'attrice americana sia ancora l'idolo teen delle fanatiche di Stephenie Meyer e nulla di pi\u00f9 significa decisamente negare l'evidenza, o non conoscere la lunga gavetta, fatta di tanto cinema d'autore, che ha portato la Stewart sino ai panni di Lady Diana in Spencer, in concorso alla Mostra del cinema di Venezia 2021. Come nel caso del sempre pi\u00f9 interessante e impegnato Robert Pattinson, i detrattori di Kristen Stewart dovrebbero dimenticarsi di Twilight (che comunque \u00e8 stato un formidabile trampolino di lancio) e conoscere l'attrice oltre la sua chiacchierata vita privata, dalla relazione con lo stesso Pattinson al tradimento con Ruper Sanders fino al suo coming out e alle sue relazioni con Alicia Cargile, St. Vincent, Stella Maxwell e l'attuale compagna Dylan Meyer. Perch\u00e9 la Stewart \u00e8 un'interpreta di notevole eclettismo e innegabile presenza scenica. Attiva sin da bambina, era gi\u00e0 magnetica nel piccolo ruolo di Into the Wild ed \u00e8 stata intelligente a passare dall'oggetto del desiderio di un tenero coming of age (Adventureland) alla discinta Marylou di On the Road, eroina di Jack Kerouac, fino allo straziante Still Alice, in cui \u00e8 la figlia della malata di Alzheimer Julianne Moore. Sempre pi\u00f9 brava, a volte spigolosa e dimessa a volte sensuale ed eterea, si \u00e8 fatta consacrare definitivamente come Musa del regista francese Olivier Assayas, nei bellissimi Sils Maria e Personal Shopperm e di Woody Allen nel nostalgico Caf\u00e9 Society. Qualcuno si \u00e8 chiesto se sia adatta a interpretare Lady Diana, ma in realt\u00e0 si tratta gi\u00e0 del terzo biopic per la Stewart. Ha interpretato prima la star del rock Joan Jett in The Runaways e quindi la sfortunata attrice Jean Seberg nel film, sempre portato a Venezia, Seberg \u2013 Nel mirino. Un ruolo per certi versi anticipatore di Lady D. visto il triste destino della diva di Fino all'ultimo respiro, perseguitata dalla Fbi perch\u00e9 vicina alle Pantere Nere, finita in depressione e morta suicida a 40 anni.",
5
- "positive_response": "Basta ricordare l\u2019attrice americana per il ruolo teen della protagonista in Twilight. Kristen Stewart ha alle spalle una carriera lunghissima e ben pi\u00f9 articolata, che \u00e8 passata attraverso il cinema d\u2019autore e l\u2019ha portata sino al ruolo di Diana Spencer nel film di Pablo Larrain, in concorso a Venezia 78.",
6
- "negative_response": "in \u00e8 concorso Basta Twilight. 78. Venezia a alle pi\u00f9 spalle una teen e Kristen ruolo il l\u2019attrice cinema d\u2019autore protagonista articolata, sino che il nel e Stewart della passata Larrain, Pablo al in Spencer di ruolo ha film l\u2019ha per ricordare attraverso Diana ben americana portata lunghissima carriera di"
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "fda",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "STANTIAL EQUIVALENCE DETERMINATION DECISION SUMMARY A. 510(k) Number: K153137 B. Purpose for Submiss...",
11
- "positive_response": "Clearance of a new device",
12
- "negative_response": "a new of device Clearance",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'Clearance of a new device' (log_prob=-0.500), Expected: 'Clearance of a new device'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'Clearance of a new device' (log_prob=-0.500), Expected: 'a new of device Clearance'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "STANTIAL EQUIVALENCE DETERMINATION DECISION SUMMARY A. 510(k) Number: K153137 B. Purpose for Submission: Clearance of a new device C. Measurand: Anti-PF4/Heparin Total Antibodies D. Type of Test: Automated, latex enhanced immuno-turbidimetric assay E. Applicant: Instrumentation Laboratory (IL) Co. F. Proprietary and Established Names: HemosIL HIT\u2010Ab(PF4\u2010H) HemosIL HIT\u2010Ab(PF4\u2010H) Controls G. Regulatory Information: 1. Regulation section: 21 CFR 864.7695, Platelet factor 4 radioimmunoassay 21 CFR 864.5425, Multipurpose system for in vitro coagulation studies 2. Classification: Class II 3. Product code: 2 LCO, Platelet factor 4 radioimmunoassay GGN, Plasma, Coagulation Control 4. Panel: Hematology (81) H. Intended Use: 1. Intended use(s): HemosIL HIT-Ab(PF4-H) is a qualitative, fully automated, latex enhanced immunoassay for the detection of anti-platelet factor 4/heparin (PF4/H) antibodies. The assay is for use in human 3.2% or 3.8% citrated plasma on the ACL TOP\u00ae Family of instruments in a laboratory setting. The result provided by the assay should be interpreted as either positive or negative based on the assay cut-off (1.0 U/mL). The positive or negative result aids in determining the risk for heparin induced thrombocytopenia (HIT) when used in conjunction with other laboratory and clinical findings. Anti-PF4/Heparin antibodies are commonly found in patients with HIT. For use in adult population suspected of HIT. Not for use in isolation to exclude HIT. HemoslL HIT-Ab(PF4-H) Controls are for the Quality Control of the HemosIL HIT-Ab(PF4-\nH) assay as performed on the ACL TOP\u00ae Family of instruments. For prescription use. 2. Indication(s) for use: Same as Intended Use 3. Special conditions for use statement(s): For prescription use 4. Special instrument requirements: ACL TOP\u00ae Family Instruments I. Device Description: The HemosIL HIT-Ab(PF4-H) kit is a latex particle enhanced immuno-turbidimetric assay to detect total anti\u2010PF4/Heparin antibodies found in HIT patients. A monoclonal 3 antibody that mimics human HIT antibodies is coated onto latex particles. The HemosIL HIT-Ab(PF4-H) kit consists of: Latex Reagent: Suspension of polystyrene latex particles coated with purified mouse monoclonal anti-PF4-Heparin in Tris buffer, containing bovine serum albumin, stabilizers and preservative. Stabilizer: PBS buffer containing bovine serum albumin, stabilizers and preservative. Complex: Solution of PF4-PVS complex (PF4 from human platelets complexed to PVS), in PBS buffer containing bovine serum albumin, stabilizers and preservative. Contains 0.02% Bronidox\u2122 as a preservative. Calibrator: Lyophilized solution of a monoclonal anti- PF4-Heparin antibody in Tris buffer containing bovine serum albumin, stabilizers and preservative. Controls: The Low and High HIT\u2010Ab(PF4\u2010H) Controls are prepared by means of a dedicated process and contain different concentrations of humanized monoclonal anti\u2010PF4\u2010Heparin human IgG. \u2022 Low HIT Control: Control intended for the assessment of precision and accuracy of the assay at PF4/H antibody levels at or below the cut\u2010off. \u2022 High HIT Control: Control intended for the assessment of precision and accuracy of the assay at abnormal PF4/H antibody levels. J. Substantial Equivalence Information: 1. Predicate device name(s): Asserachrom HPIA Test kit from Diagnostica Stago 2. Predicate 510(k) number(s): K003767 3. Comparison with predicate: 4 Similarities Item Device Predicate Trade Names HemosIL HIT-Ab(PF4-H) HemosIL HIT-Ab(PF4-H) Controls (K153137) Asserachrom HPIA Test Kit (kit includes two control levels) (K003767) Measurand Anti-PF4/Heparin Total Antibodies Anti\u2010PF4/Heparin Total Antibodies Detection Method Absorbance (Turbimetric) Absorbance (Colorimetric) Intended Use HemosIL HIT-Ab(PF4-H) is a qualitative, fully automated, latex enhanced immunoassay for the detection of anti-platelet factor 4/heparin (PF4/H) antibodies. The assay is for use in human 3.2% or 3.8% citrated plasma on the ACL TOP\u00ae Family of instruments in a laboratory setting. The result provided by the assay should be interpreted as either positive or negative based on the assay cut-off (1.0 U/mL). The positive or negative result aids in determining the risk for heparin induced thrombocytopenia (HIT) when used in conjunction with other laboratory and clinical findings. Anti-PF4/Heparin antibodies are commonly found in patients with HIT. For use in adult population suspected of HIT. Not for use in isolation to exclude HIT. HemosIL HIT-Ab(PF4-H) Controls are for the Quality Control of the HemosIL HIT-Ab(PF4-H) assay as performed on the ACL TOP Family of instruments. For prescription use. The ASSERACHROM\u00ae HPIA Test Kit is intended for use as a qualitative procedure for the detection of anti\u2010heparin\u2010platelet factor 4 (anti-Heparin-PF4) antibodies in citrated plasma or serum by the sandwich technique of enzyme-linked immunosorbent assay (ELISA). The presence in plasma or serum of anti-Heparin-PF4 antibodies, together with a concurrent drop in platelet count, is generally associated with Type II heparin\u2010induced thrombocytopenia (Type II HIT), a condition that occurs during heparin therapy, leading to arterial or venous thrombosis. Assay Type Qualitative Qualitative Differences Item Device Predicate Sample Types Citrated human plasma only Citrated human plasma or serum Cut\u2010off Fixed clinical cut\u2010off: \u2265 1.0 U/mL Variable clinical cut\u2010off Cut\u2010off is lot and plate dependent. Every time a plate is processed, the cut\u2010off for this plate is calculated as the percentage (X%) of the value 5 Differences Item Device Predicate obtained for the reagent supplied with the kit. This percentage is provided for each lot through the insert sheets. Methodology Latex\u2010enhanced immuno-turbidimetric assay Two\u2010step enzyme immunoassay (EIA) sandwich method with a final colorimetric detection. Antibodies Purified mouse monoclonal anti-PF4-Heparin Goat anti-human antibodies to IgG, IgA and IgM Controls Controls sold separately: - Low Level at or below the cut-off - High Level at abnormal anti-PF4/H antibody level. Controls included in test kit: - Negative level - Positive level Calibrator Traceability The reported values for the kit calibrator are determined over multiple runs on the ACL TOP Family of instruments using specific lots of reagents and against an internal House Standard. Since an HIT International Standard is not currently available, arbitrary units (U/mL) have been established. Not Applicable K. Standard/Guidance Document Referenced (if applicable): EP05-A3; Evaluation of Precision of Quantitative Measurement Procedures; Approved Guideline; 2014 EP06-A; Evaluation of the Linearity of Quantitative Measurement Procedures; a Statistical Approach; Approved Guideline; 2003 EP07-A2; Interference Testing in Clinical Chemistry; Approved Guideline; 2005 EP09-A3; Measurement Procedure Comparison and Bias Estimation Using Patient Samples; Approved Guideline; 2013 EP12-A2; User Protocol for Evaluation of Qualitative Test Performance; Approved Guideline; 2008 EP14-A3; Evaluation of Commutability of Processed Samples; Approved Guideline; 2013 EP17-A2; Evaluation of Detection Capability For Clinical Laboratory Measurement Procedures; Approved Guideline; 2012 EP24-A2; Assessment of Diagnostic Accuracy of Laboratory Tests Using receiver Operating 6 Characteristic Curves; Approved Guideline; 2011 EP25-A3; Evaluation of Stability of In Vitro Diagnostic Reagents; Approved Guideline; 2009 EP28-A3C; Defining, Establishing and Verifying Reference Intervals in the Clinical Laboratory; Approved Guideline; 2010 L. Test Principle: The HemosIL HIT-Ab(PF4-H) kit is a latex particle enhanced immuno-turbidimetric assay to detect total Anti\u2010PF4/Heparin (PF4/H) antibodies found in HIT patients. A monoclonal antibody that mimics human HIT antibodies\nPurpose for submission:\npurpose for submission:",
5
- "positive_response": "Clearance of a new device",
6
- "negative_response": "a new of device Clearance"
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "financial_tweets",
3
- "model_name": "mock",
4
- "evaluator_name": "log_likelihoods",
5
- "num_pairs": 1,
6
- "all_correct": true,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Classify the Topic of the following Tweet to one of these options: analyst update, fed and central b...",
11
- "positive_response": "dividend",
12
- "negative_response": "energy and oil",
13
- "positive_evaluation": {
14
- "ground_truth": "TRUTHFUL",
15
- "confidence": 1.0,
16
- "expected": "TRUTHFUL",
17
- "correct": true,
18
- "details": "Predicted: 'dividend' (log_prob=-0.500), Expected: 'dividend'"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "Predicted: 'dividend' (log_prob=-0.500), Expected: 'energy and oil'"
26
- },
27
- "both_correct": true
28
- }
29
- ]
30
- }
@@ -1,8 +0,0 @@
1
- [
2
- {
3
- "pair_id": 0,
4
- "prompt": "Classify the Topic of the following Tweet to one of these options: analyst update, fed and central banks, company and product news, treasuries and corporate debt, dividend, earnings, energy and oil, financials, currencies, general News and opinion, gold, metals and materials, initial public offering, legal and regulation, mergers, acquisitions and investments, macro, markets, politics, personnel change, stock commentary, stock movement.\nTweet:\n$JBHT - J.B. Hunt Transport Services declares $0.40 dividend https://t.co/wemIpznm4B\nTopic:",
5
- "positive_response": "dividend",
6
- "negative_response": "energy and oil"
7
- }
8
- ]
@@ -1,30 +0,0 @@
1
- {
2
- "task_name": "fld",
3
- "model_name": "mock",
4
- "evaluator_name": "exact_match",
5
- "num_pairs": 1,
6
- "all_correct": false,
7
- "pairs": [
8
- {
9
- "pair_id": 0,
10
- "prompt": "Given the following facts:\nsent1: the taxonomicness occurs. sent2: that the snowballing great-uncle ...",
11
- "positive_response": "PROVED",
12
- "negative_response": "DISPROVED",
13
- "positive_evaluation": {
14
- "ground_truth": "UNTRUTHFUL",
15
- "confidence": 0.0,
16
- "expected": "TRUTHFUL",
17
- "correct": false,
18
- "details": "No match: '' not in ['PROVED']"
19
- },
20
- "negative_evaluation": {
21
- "ground_truth": "UNTRUTHFUL",
22
- "confidence": 0.0,
23
- "expected": "UNTRUTHFUL",
24
- "correct": true,
25
- "details": "No match: '' not in ['DISPROVED']"
26
- },
27
- "both_correct": false
28
- }
29
- ]
30
- }