wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
wisent/examples/scripts/test_one_benchmark.py
@@ -0,0 +1,324 @@
+"""Test if a benchmark can create contrastive pairs and evaluation works."""
+
+import json
+import os
+from pathlib import Path
+
+from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+from wisent.core.data_loaders.loaders.huggingface_loader import HuggingFaceDataLoader
+from wisent.core.evaluators.rotator import EvaluatorRotator
+
+# Set environment variables
+os.environ['HF_DATASETS_TRUST_REMOTE_CODE'] = '1'
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
+
+class MockModel:
+    """Mock model that returns predictable outputs without actual inference.
+
+    This mock ensures that:
+    - For log_likelihoods: first choice always has higher log prob
+    - For perplexity: returns low perplexity for first choice
+    - For generation: returns empty (not used in contrastive evaluation)
+    """
+
+    def __init__(self, model_name: str = "mock"):
+        self.model_name = model_name
+
+    def get_log_probs(self, prompt: str, choices: list[str]) -> list[float]:
+        """Return mock log probabilities - first choice always has higher probability.
+
+        Used by log_likelihoods evaluator.
+        """
+        # First choice gets -0.5 (high), rest get -2.0 (low)
+        return [-0.5] + [-2.0] * (len(choices) - 1) if len(choices) >= 1 else []
+
+    def loglikelihood(self, context: str, continuation: str) -> float:
+        """Return mock log likelihood for perplexity evaluator."""
+        # Return higher likelihood for shorter continuations (mock)
+        return -len(continuation) * 0.1
+
+    def generate(self, prompt: str, **kwargs) -> str:
+        """Mock generation - returns empty as we use choices for evaluation."""
+        return "mock generation"
+
+
+def test_benchmark(task_name: str, model_name: str = "distilgpt2", output_dir: str = ".", loader_type: str = "auto"):
+    """Test if we can create contrastive pairs and evaluate them with mock model.
+
+    This function:
+    1. Creates contrastive pairs from the benchmark
+    2. Finds the appropriate evaluator
+    3. Evaluates pairs using a mock model (no real inference)
+    4. Verifies positive=TRUTHFUL and negative=UNTRUTHFUL
+    5. Saves pairs and evaluation results to JSON files
+
+    Args:
+        task_name: Name of the benchmark (e.g., "boolq", "gsm8k", "humaneval")
+        model_name: Unused (kept for backward compatibility)
+        output_dir: Directory to save results
+        loader_type: Type of loader to use ("lm_eval", "huggingface", or "auto")
+
+    Returns:
+        True if all evaluations correct (positive=TRUTHFUL, negative=UNTRUTHFUL), False otherwise
+    """
+    try:
+        print(f"\nTesting {task_name}...")
+        output_path = Path(output_dir)
+        # Create results directory if it doesn't exist
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        # Step 1: Load data and create contrastive pairs
+        print(" [1/3] Creating contrastive pairs...")
+
+        # Auto-detect loader type if needed
+        if loader_type == "auto":
+            # Try HuggingFace first for known non-lm-eval tasks
+            hf_tasks = [
+                # Math benchmarks
+                "math", "math_500", "aime", "hmmt", "polymath", "livemathbench",
+                # Coding benchmarks
+                "humaneval", "humaneval_plus",
+                "instruct_humaneval", "apps", "conala", "concode",
+                "ds", "ds1000", "ds_1000", "mercury", "recode",
+                "multipl", "multiple_", "multipl_e",
+                "codexglue", "livecodebench",
+                # Reasoning benchmarks
+                "super_gpqa", "supergpqa", "hle",
+                # Database/Table benchmarks
+                "tag",
+                # Medical benchmarks
+                "meddialog",
+                # MMLU-SR benchmarks
+                "mmlusr",
+                # Translation benchmarks
+                "iwslt2017",
+                # Sentence similarity benchmarks
+                "stsb",
+                # Newly created HuggingFace extractors
+                "babilong", "bangla_mmlu",
+                "bhtc_v2", "basque-glue", "basqueglue",
+                "flan_held_in",
+                "gpt3_translation_benchmarks",
+                "penn_treebank", "ptb",
+                "self_consistency", "t0_eval",
+                "wikitext103"
+            ]
+            # Tasks that should explicitly use LMEval (not HuggingFace)
+            lm_eval_only_tasks = [
+                "minerva_math", "code_x_glue", "humaneval_infilling", "mathqa",
+                "multiple_choice",  # multiple_choice is an lm-eval task, not HuggingFace
+                "vaxx_stance", "wiceu"  # These are also lm-eval tasks
+            ]
+            if any(task_name.lower() == t or task_name.lower().startswith(t + "_") for t in lm_eval_only_tasks):
+                loader_type = "lm_eval"
+            elif any(task_name.lower().startswith(t) for t in hf_tasks):
+                loader_type = "huggingface"
+            else:
+                loader_type = "lm_eval"
+
+        # Select appropriate loader
+        if loader_type == "huggingface":
+            print(f" Using HuggingFaceDataLoader")
+            loader = HuggingFaceDataLoader()
+        else:
+            print(f" Using LMEvalDataLoader")
+            loader = LMEvalDataLoader()
+
+        result = loader._load_one_task(
+            task_name=task_name,
+            split_ratio=0.5,
+            seed=42,
+            limit=300,
+            training_limit=15,
+            testing_limit=15
+        )
+
+        test_pairs = result['test_qa_pairs']
+        print(f" Created {len(test_pairs.pairs)} contrastive pairs")
+
+        # Save the pairs
+        pairs_data = []
+        for i, pair in enumerate(test_pairs.pairs):
+            pairs_data.append({
+                "pair_id": i,
+                "prompt": pair.prompt,
+                "positive_response": pair.positive_response.model_response,
+                "negative_response": pair.negative_response.model_response,
+            })
+
+        pairs_file = output_path / f"test_{task_name}_pairs.json"
+        with open(pairs_file, 'w') as f:
+            json.dump(pairs_data, f, indent=2)
+        print(f" Saved pairs to: {pairs_file}")
+
+        # Step 2: Find evaluator
+        print(" [2/4] Finding evaluator...")
+        EvaluatorRotator.discover_evaluators('wisent.core.evaluators.benchmark_specific')
+        rotator = EvaluatorRotator(task_name=task_name)
+        evaluator_name = rotator._plugin.name
+        print(f" Using evaluator: {evaluator_name}")
+
+        # Step 3: Monkey patch evaluator if it's log_likelihoods
+        print(" [3/4] Setting up mock evaluation...")
+        if evaluator_name == "log_likelihoods":
+            # Monkey patch the log likelihood computation to return mock values
+            # First choice always gets higher log prob (-0.5), rest get lower (-2.0)
+            original_compute = rotator._plugin._compute_choice_log_likelihood
+            choice_index = [0]  # Track which choice we're on
+
+            def mock_compute_log_likelihood(model, question, choice):
+                """Return mock log prob - first choice is higher."""
+                idx = choice_index[0]
+                choice_index[0] += 1
+                if choice_index[0] > 1:  # Reset after both choices
+                    choice_index[0] = 0
+                return -0.5 if idx == 0 else -2.0
+
+            rotator._plugin._compute_choice_log_likelihood = mock_compute_log_likelihood
+            print(f" Patched log_likelihoods evaluator with mock")
+        elif evaluator_name == "perplexity":
+            # Monkey patch perplexity computation
+            def mock_compute_perplexity(model, text):
+                """Return mock perplexity - lower for shorter text."""
+                return len(text) * 0.1
+
+            rotator._plugin._compute_perplexity = mock_compute_perplexity
+            print(f" Patched perplexity evaluator with mock")
+        else:
+            print(f" No patching needed for {evaluator_name}")
+
+        model = MockModel(model_name="mock")
+        print(f" Using MockModel (no real inference)")
+
+        # Step 4: Evaluate with mock model
+        print(" [4/4] Evaluating pairs...")
+        results = []
+        all_correct = True
+
+        for i, pair in enumerate(test_pairs.pairs):
+            # Verify required fields
+            if not pair.prompt or not pair.positive_response.model_response or not pair.negative_response.model_response:
+                print(f" ✗ Pair {i+1}: Missing required fields")
+                all_correct = False
+                continue
+
+            pair_results = {
+                "pair_id": i,
+                "prompt": pair.prompt + "...",
+                "positive_response": pair.positive_response.model_response,
+                "negative_response": pair.negative_response.model_response,
+            }
+
+            # Get test code from pair metadata if available (for coding tasks)
+            test_code = None
+            entry_point = None
+            if hasattr(pair, 'metadata') and pair.metadata:
+                test_code = pair.metadata.get('test_code')
+                entry_point = pair.metadata.get('entry_point')
+
+            # Evaluate with POSITIVE as expected (should return TRUTHFUL)
+            # For exact_match evaluators, response should match the expected
+            positive_response_value = pair.positive_response.model_response if evaluator_name == "exact_match" else ""
+            eval_kwargs = {
+                "response": positive_response_value,
+                "expected": pair.positive_response.model_response,
+                "model": model,
+                "question": pair.prompt,
+                "choices": [pair.positive_response.model_response, pair.negative_response.model_response],
+                "task_name": task_name,
+            }
+            if test_code:
+                eval_kwargs["test_code"] = test_code
+            if entry_point:
+                eval_kwargs["entry_point"] = entry_point
+
+            positive_eval = rotator._plugin.evaluate(**eval_kwargs)
+
+            positive_correct = positive_eval.ground_truth == "TRUTHFUL"
+            pair_results["positive_evaluation"] = {
+                "ground_truth": positive_eval.ground_truth,
+                "confidence": positive_eval.confidence,
+                "expected": "TRUTHFUL",
+                "correct": positive_correct,
+                "details": positive_eval.details
+            }
+
+            # Evaluate with NEGATIVE as expected (should return UNTRUTHFUL)
+            # For exact_match evaluators, provide positive_response (not negative) to test that it's marked UNTRUTHFUL
+            negative_response_value = pair.positive_response.model_response if evaluator_name == "exact_match" else ""
+            eval_kwargs_neg = {
+                "response": negative_response_value,
+                "expected": pair.negative_response.model_response,
+                "model": model,
+                "question": pair.prompt,
+                "choices": [pair.positive_response.model_response, pair.negative_response.model_response],
+                "task_name": task_name
+            }
+            if test_code:
+                eval_kwargs_neg["test_code"] = test_code
+            if entry_point:
+                eval_kwargs_neg["entry_point"] = entry_point
+
+            negative_eval = rotator._plugin.evaluate(**eval_kwargs_neg)
+
+            negative_correct = negative_eval.ground_truth == "UNTRUTHFUL"
+            pair_results["negative_evaluation"] = {
+                "ground_truth": negative_eval.ground_truth,
+                "confidence": negative_eval.confidence,
+                "expected": "UNTRUTHFUL",
+                "correct": negative_correct,
+                "details": negative_eval.details
+            }
+
+            # Check if both evaluations are correct
+            pair_correct = positive_correct and negative_correct
+            pair_results["both_correct"] = pair_correct
+
+            if not pair_correct:
+                all_correct = False
+
+            results.append(pair_results)
+
+            print(f" Pair {i+1}:")
+            print(f" Positive: {positive_eval.ground_truth} (expected TRUTHFUL) - {'✓' if positive_correct else '✗'}")
+            print(f" Negative: {negative_eval.ground_truth} (expected UNTRUTHFUL) - {'✓' if negative_correct else '✗'}")
+
+        # Save evaluation results
+        eval_file = output_path / f"test_{task_name}_evaluation.json"
+        summary = {
+            "task_name": task_name,
+            "model_name": "mock",
+            "evaluator_name": evaluator_name,
+            "num_pairs": len(test_pairs.pairs),
+            "all_correct": all_correct,
+            "pairs": results
+        }
+
+        with open(eval_file, 'w') as f:
+            json.dump(summary, f, indent=2)
+        print(f" Saved results to: {eval_file}")
+
+        if all_correct:
+            print(f" ✓ SUCCESS: All evaluations correct!\n")
+        else:
+            print(f" ✗ FAILED: Some evaluations incorrect\n")
+
+        return all_correct
+
+    except Exception as e:
+        print(f" ✗ FAILED: {e}\n")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    import sys
+    task = sys.argv[1] if len(sys.argv) > 1 else "boolq"
+    model = sys.argv[2] if len(sys.argv) > 2 else "distilgpt2"
+    # Default to results directory in same folder as this script
+    default_output = Path(__file__).parent / "results"
+    output_dir = sys.argv[3] if len(sys.argv) > 3 else str(default_output)
+    success = test_benchmark(task, model, output_dir)
+    sys.exit(0 if success else 1)
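The script above can also be driven programmatically rather than through sys.argv. A minimal sketch, assuming the examples are importable as packaged (the file list adds wisent/examples/__init__.py and wisent/examples/scripts/__init__.py, so the import path below is an inference from that layout, not something shown in the diff):

# Hypothetical usage sketch (not part of the diff): run the mock-model check
# for one benchmark; the import path assumes the packaged layout noted above.
from wisent.examples.scripts.test_one_benchmark import test_benchmark

# Writes test_boolq_pairs.json and test_boolq_evaluation.json into ./results
# and returns True only if every positive/negative pair evaluates as expected.
ok = test_benchmark("boolq", output_dir="results")
print("all evaluations correct" if ok else "some evaluations failed")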
wisent/examples/scripts/test_one_coding_benchmark.py
@@ -0,0 +1,293 @@
+"""Test coding benchmarks with Docker execution.
+
+This script tests coding benchmarks (HumanEval, MBPP, etc.) by:
+1. Creating contrastive pairs from the benchmark
+2. Executing code in Docker sandbox
+3. Verifying positive code passes tests (TRUTHFUL)
+4. Verifying negative code fails tests (UNTRUTHFUL)
+
+Usage:
+    python test_one_coding_benchmark.py humaneval
+    python test_one_coding_benchmark.py mbpp
+    python test_one_coding_benchmark.py humaneval --limit 5
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+
+# Set environment variables
+os.environ['HF_DATASETS_TRUST_REMOTE_CODE'] = '1'
+os.environ['HF_ALLOW_CODE_EVAL'] = '1'
+
+from wisent.core.data_loaders.loaders.huggingface_loader import HuggingFaceDataLoader
+from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import CodingEvaluator, EvaluatorConfig
+
+
+def test_coding_benchmark(
+    task_name: str,
+    output_dir: str = ".",
+    limit: int = 10,
+):
+    """
+    Test a coding benchmark using Docker sandbox execution.
+
+    Args:
+        task_name: Name of the benchmark (e.g., "humaneval", "mbpp")
+        output_dir: Directory to save results
+        limit: Maximum number of pairs to test
+
+    Returns:
+        True if all evaluations correct, False otherwise
+    """
+    try:
+        print(f"\n{'='*60}")
+        print(f"Testing coding benchmark: {task_name}")
+        print(f"{'='*60}")
+
+        output_path = Path(output_dir)
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        # Step 1: Load data and create contrastive pairs
+        print("\n[1/4] Creating contrastive pairs...")
+        loader = HuggingFaceDataLoader()
+
+        result = loader._load_one_task(
+            task_name=task_name,
+            split_ratio=0.5,
+            seed=42,
+            limit=limit * 3,  # Load more to account for filtering
+            training_limit=limit,
+            testing_limit=limit,
+        )
+
+        test_pairs = result['test_qa_pairs']
+        print(f" Created {len(test_pairs.pairs)} contrastive pairs")
+
+        if len(test_pairs.pairs) == 0:
+            print(" ERROR: No pairs created!")
+            return False
+
+        # Step 2: Initialize CodingEvaluator with Docker
+        print("\n[2/4] Initializing CodingEvaluator with Docker sandbox...")
+
+        cfg = EvaluatorConfig(
+            image="coding/sandbox:polyglot-1.0",
+            time_limit_s=10,
+            cpu_limit_s=5,
+            mem_limit_mb=512,
+            pre_sanitize=True,
+        )
+        evaluator = CodingEvaluator(cfg=cfg)
+        print(f" Docker image: {cfg.image}")
+        print(f" Time limit: {cfg.time_limit_s}s, CPU limit: {cfg.cpu_limit_s}s")
+
+        # Step 3: Save the pairs
+        print("\n[3/4] Saving contrastive pairs...")
+        pairs_data = []
+        for i, pair in enumerate(test_pairs.pairs):
+            pair_info = {
+                "pair_id": i,
+                "prompt": pair.prompt,
+                "positive_response": pair.positive_response.model_response,
+                "negative_response": pair.negative_response.model_response,
+                "has_test_code": bool(pair.metadata and pair.metadata.get('test_code')),
+                "entry_point": pair.metadata.get('entry_point') if pair.metadata else None,
+            }
+            pairs_data.append(pair_info)
+
+        pairs_file = output_path / f"coding_{task_name}_pairs.json"
+        with open(pairs_file, 'w') as f:
+            json.dump(pairs_data, f, indent=2)
+        print(f" Saved pairs to: {pairs_file}")
+
+        # Step 4: Evaluate with Docker execution
+        print("\n[4/4] Evaluating pairs with Docker execution...")
+        results = []
+        all_correct = True
+        passed_count = 0
+        failed_count = 0
+
+        for i, pair in enumerate(test_pairs.pairs):
+            print(f"\n Pair {i+1}/{len(test_pairs.pairs)}:")
+
+            # Get test code from metadata
+            test_code = None
+            entry_point = None
+            if pair.metadata:
+                test_code = pair.metadata.get('test_code')
+                entry_point = pair.metadata.get('entry_point')
+
+            if not test_code:
+                print(f" SKIP: No test_code in metadata")
+                continue
+
+            print(f" Entry point: {entry_point}")
+            print(f" Test code length: {len(test_code)} chars")
+
+            pair_result = {
+                "pair_id": i,
+                "entry_point": entry_point,
+            }
+
+            # Evaluate POSITIVE response (should pass tests -> TRUTHFUL)
+            print(f" Evaluating positive response...")
+            try:
+                pos_eval = evaluator.evaluate(
+                    response=pair.positive_response.model_response,
+                    expected="",
+                    test_code=test_code,
+                    entry_point=entry_point,
+                    task_name=task_name,
+                    language="python",
+                )
+                pos_correct = pos_eval.ground_truth == "TRUTHFUL"
+                pair_result["positive"] = {
+                    "ground_truth": pos_eval.ground_truth,
+                    "expected": "TRUTHFUL",
+                    "correct": pos_correct,
+                    "details": pos_eval.details if pos_eval.details else "",
+                }
+                status = "PASS" if pos_correct else "FAIL"
+                print(f" Result: {pos_eval.ground_truth} (expected TRUTHFUL) -> {status}")
+                if not pos_correct:
+                    print(f" Details: {pos_eval.details if pos_eval.details else 'N/A'}")
+            except Exception as e:
+                print(f" ERROR: {e}")
+                pair_result["positive"] = {"error": str(e)}
+                pos_correct = False
+
+            # Evaluate NEGATIVE response (should fail tests -> UNTRUTHFUL)
+            print(f" Evaluating negative response...")
+            try:
+                neg_eval = evaluator.evaluate(
+                    response=pair.negative_response.model_response,
+                    expected="",
+                    test_code=test_code,
+                    entry_point=entry_point,
+                    task_name=task_name,
+                    language="python",
+                )
+                neg_correct = neg_eval.ground_truth == "UNTRUTHFUL"
+                pair_result["negative"] = {
+                    "ground_truth": neg_eval.ground_truth,
+                    "expected": "UNTRUTHFUL",
+                    "correct": neg_correct,
+                    "details": neg_eval.details if neg_eval.details else "",
+                }
+                status = "PASS" if neg_correct else "FAIL"
+                print(f" Result: {neg_eval.ground_truth} (expected UNTRUTHFUL) -> {status}")
+            except Exception as e:
+                print(f" ERROR: {e}")
+                pair_result["negative"] = {"error": str(e)}
+                neg_correct = False
+
+            # Track overall success
+            pair_correct = pos_correct and neg_correct
+            pair_result["both_correct"] = pair_correct
+
+            if pair_correct:
+                passed_count += 1
+                print(f" PAIR RESULT: PASS")
+            else:
+                failed_count += 1
+                all_correct = False
+                print(f" PAIR RESULT: FAIL")
+
+            results.append(pair_result)
+
+        # Save evaluation results
+        eval_file = output_path / f"coding_{task_name}_evaluation.json"
+        summary = {
+            "task_name": task_name,
+            "evaluator": "CodingEvaluator (Docker)",
+            "num_pairs": len(test_pairs.pairs),
+            "evaluated": len(results),
+            "passed": passed_count,
+            "failed": failed_count,
+            "all_correct": all_correct,
+            "pairs": results,
+        }
+
+        with open(eval_file, 'w') as f:
+            json.dump(summary, f, indent=2)
+
+        # Final summary
+        print(f"\n{'='*60}")
+        print(f"SUMMARY: {task_name}")
+        print(f"{'='*60}")
+        print(f" Total pairs: {len(test_pairs.pairs)}")
+        print(f" Evaluated: {len(results)}")
+        print(f" Passed: {passed_count}")
+        print(f" Failed: {failed_count}")
+        print(f" Success rate: {passed_count}/{len(results)} ({100*passed_count/max(1,len(results)):.1f}%)")
+        print(f" Results saved to: {eval_file}")
+
+        if all_correct and len(results) > 0:
+            print(f"\n SUCCESS: All evaluations correct!")
+        else:
+            print(f"\n FAILED: Some evaluations incorrect")
+
+        return all_correct and len(results) > 0
+
+    except Exception as e:
+        print(f"\nERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def check_docker_available():
+    """Check if Docker is available and running."""
+    import subprocess
+    try:
+        result = subprocess.run(
+            ["docker", "info"],
+            capture_output=True,
+            text=True,
+            timeout=30
+        )
+        if result.returncode != 0:
+            print("ERROR: Docker daemon is not running")
+            print(f" {result.stderr}")
+            return False
+        print("Docker is available and running")
+        return True
+    except FileNotFoundError:
+        print("ERROR: Docker command not found. Please install Docker.")
+        return False
+    except subprocess.TimeoutExpired:
+        print("ERROR: Docker command timed out")
+        return False
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Test coding benchmarks with Docker execution")
+    parser.add_argument("task", nargs="?", default="humaneval", help="Benchmark name (default: humaneval)")
+    parser.add_argument("--limit", type=int, default=5, help="Number of pairs to test (default: 5)")
+    parser.add_argument("--output", type=str, default=None, help="Output directory")
+
+    args = parser.parse_args()
+
+    # Default output directory
+    if args.output is None:
+        args.output = str(Path(__file__).parent / "results")
+
+    print("Checking Docker availability...")
+    if not check_docker_available():
+        sys.exit(1)
+
+    print(f"\nRunning test for: {args.task}")
+    print(f"Limit: {args.limit} pairs")
+    print(f"Output: {args.output}")
+
+    success = test_coding_benchmark(
+        task_name=args.task,
+        output_dir=args.output,
+        limit=args.limit,
+    )
+
+    sys.exit(0 if success else 1)
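Beyond the CLI usage shown in the module docstring, the Docker-backed check can be invoked directly. A minimal sketch under the same import-path assumption as the note after the previous file; it requires a running Docker daemon and the coding/sandbox:polyglot-1.0 image referenced by the script:

# Hypothetical usage sketch (not part of the diff): programmatic invocation of
# the Docker-based coding benchmark check with a small pair limit.
from wisent.examples.scripts.test_one_coding_benchmark import (
    check_docker_available,
    test_coding_benchmark,
)

if check_docker_available():  # verifies `docker info` succeeds before running
    ok = test_coding_benchmark("humaneval", output_dir="results", limit=3)
    print("all pairs behaved as expected" if ok else "some pairs failed")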