wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 from typing import Any
 from wisent.core.cli_logger import setup_logger
 
@@ -13,43 +14,45 @@ log = setup_logger(__name__)
 
 class MercuryExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for mercury dataset (code-to-code translation).
+    Extractor for Mercury - code efficiency benchmark.
 
-    Schema (code_x_glue_cc_code_to_code_trans):
-        - java: str (java code/prompt)
-        - cs: str (c# code/answer)
+    Dataset: Elfsong/Mercury
+    Paper: "Mercury: A Code Efficiency Benchmark for LLM Code Synthesis"
 
-    Note: This is a translation task, not code execution. Uses generation evaluator.
+    Mercury evaluates code efficiency by comparing different solutions
+    to the same problem based on runtime performance.
+
+    Schema:
+        - prompt: str (problem description)
+        - solutions: list[dict] with runtime and solution code
+        - test_cases: str (JSON with test inputs/outputs)
+        - difficulty: str
+
+    For code efficiency evaluation:
+        - Positive (correct) = Fastest solution
+        - Negative (incorrect) = Slowest solution
     """
 
-    evaluator_name = "generation"
+    evaluator_name = "code_efficiency"
 
     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from mercury examples.
-
-        Args:
-            limit: Optional maximum number of pairs to produce.
-
-        Returns:
-            A list of ContrastivePair objects.
+        Build contrastive pairs from Mercury examples.
         """
         max_items = self._normalize_limit(limit)
 
-        # Load dataset - using code_x_glue as alternative since tau/code_translation doesn't exist
         docs = self.load_dataset(
-            dataset_name="code_x_glue_cc_code_to_code_trans",
-            dataset_config="default",
-            split="train",
+            dataset_name="Elfsong/Mercury",
+            split="eval",
             limit=max_items,
         )
 
         pairs: list[ContrastivePair] = []
 
-        log.info(f"Extracting contrastive pairs from {len(docs)} mercury examples")
+        log.info(f"Extracting contrastive pairs from {len(docs)} Mercury examples")
 
         for doc in docs:
             pair = self._extract_pair_from_doc(doc)
@@ -59,53 +62,73 @@ class MercuryExtractor(HuggingFaceBenchmarkExtractor):
                     break
 
         if not pairs:
-            log.warning("No valid mercury pairs extracted")
+            log.warning("No valid Mercury pairs extracted")
 
         return pairs
 
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """
         Convert a single doc into a ContrastivePair.
-
-        Returns None when required fields are missing or malformed.
+
+        Uses fastest vs slowest solution as correct vs incorrect.
         """
         try:
-            question = doc.get("java", "").strip()
-            answer = doc.get("cs", "")
+            prompt = doc.get("prompt", "").strip()
+            solutions = doc.get("solutions", [])
+            difficulty = doc.get("difficulty", "")
+            slug_name = doc.get("slug_name", "")
+            pretty_content = doc.get("pretty_content", [])
 
-            if not question or not answer:
-                log.debug("Skipping: missing question or answer")
+            if not prompt or not solutions or len(solutions) < 2:
                 return None
 
-            # Convert answer to string
-            correct_answer = str(answer).strip()
+            # Sort solutions by runtime (fastest first)
+            # Runtime format is like "44ms", "36ms", etc.
+            def parse_runtime(sol):
+                runtime_str = sol.get("runtime", "999ms")
+                try:
+                    return int(runtime_str.replace("ms", ""))
+                except:
+                    return 999
+
+            sorted_solutions = sorted(solutions, key=parse_runtime)
+
+            fastest = sorted_solutions[0]
+            slowest = sorted_solutions[-1]
+
+            fastest_code = fastest.get("solution", "")
+            slowest_code = slowest.get("solution", "")
+
+            if not fastest_code or not slowest_code:
+                return None
+
+            # Use pretty_content if available for problem description
+            problem_desc = pretty_content[0] if pretty_content else prompt
 
-            # Create incorrect answer (modify or corrupt)
-            incorrect_answer = self._create_incorrect_answer(correct_answer)
+            formatted_question = f"""Code Efficiency Task:
 
-            # Format the question
-            formatted_question = f"Translate this Java code to C#:\n{question}"
+{problem_desc}
+
+Write an efficient Python solution."""
 
             metadata = {
                 "label": "mercury",
-                "source": "code_x_glue_cc_code_to_code_trans",
+                "source": "Elfsong/Mercury",
+                "slug_name": slug_name,
+                "difficulty": difficulty,
+                "fastest_runtime": fastest.get("runtime", ""),
+                "slowest_runtime": slowest.get("runtime", ""),
+                "is_code_efficiency_benchmark": True,
             }
 
             return self._build_pair(
                 question=formatted_question,
-                correct=correct_answer,
-                incorrect=incorrect_answer,
+                correct=f"```python\n{fastest_code}\n```",
+                incorrect=f"```python\n{slowest_code}\n```",
                 metadata=metadata,
             )
 
         except Exception as exc:
-            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            log.error(f"Error extracting Mercury pair: {exc}", exc_info=True)
             return None
 
-    def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
-        # For code, corrupt it slightly
-        if len(correct) > 10:
-            return correct[:len(correct)//2] + "# CORRUPTED" + correct[len(correct)//2:]
-        return f"{correct} # INCORRECT"
-
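The new Mercury pair construction reduces to sorting each problem's solutions by parsed runtime and pairing the extremes. A minimal standalone sketch of that selection step, run on an invented Mercury-style record (field names mirror the extractor; the values are mock data):

    # Sketch of the fastest-vs-slowest selection in _extract_pair_from_doc.
    doc = {
        "solutions": [
            {"runtime": "312ms", "solution": "def solve(nums): ...  # brute force"},
            {"runtime": "44ms", "solution": "def solve(nums): ...  # one pass"},
            {"runtime": "98ms", "solution": "def solve(nums): ...  # two pass"},
        ],
    }

    def parse_runtime(sol: dict) -> int:
        # Runtimes look like "44ms"; unparseable values sort last via 999.
        try:
            return int(sol.get("runtime", "999ms").replace("ms", ""))
        except ValueError:
            return 999

    ranked = sorted(doc["solutions"], key=parse_runtime)
    fastest, slowest = ranked[0], ranked[-1]
    print(fastest["runtime"], slowest["runtime"])  # -> 44ms 312ms

The fastest and slowest solution bodies then become the correct and incorrect completions, each wrapped in a fenced python block before being handed to _build_pair.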
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py

@@ -111,18 +111,8 @@ class OlympiadBenchExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} examples from OlympiadBench ({self.config})")
         except Exception as e:
-            log.warning(f"Failed to load OlympiadBench with config {self.config}: {e}")
-            # Try alternative config
-            try:
-                docs = self.load_dataset(
-                    dataset_name="lmms-lab/OlympiadBench",
-                    split="test",
-                    limit=max_items,
-                )
-                log.info(f"Loaded {len(docs)} examples from lmms-lab/OlympiadBench")
-            except Exception as e2:
-                log.error(f"Failed to load any OlympiadBench: {e2}")
-                return []
+            log.error(f"Failed to load Hothan/OlympiadBench: {e}")
+            return []
 
         pairs: list[ContrastivePair] = []
 
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from typing import Any
+from datasets import load_dataset
 from wisent.core.cli_logger import setup_logger
 
 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -10,73 +11,61 @@ __all__ = ["PlanBenchExtractor"]
 
 log = setup_logger(__name__)
 
-# PlanBench domains
-PLANBENCH_DOMAINS = [
-    "blocksworld",  # Classic blocks world planning
-    "logistics",  # Package delivery logistics
-]
-
-# PlanBench task types
-PLANBENCH_TASKS = [
-    "plan_generation",  # Generate a valid plan
-    "cost_optimal_planning",  # Generate cost-optimal plan
-    "plan_verification",  # Verify if a plan is valid
-    "goal_recognition",  # Recognize the goal from actions
-    "plan_execution_reasoning",  # Predict outcome of action execution
-    "action_reordering",  # Reorder actions for valid plan
+PLANBENCH_CONFIGS = [
+    "task_1_plan_generation",
+    "task_2_plan_optimality",
+    "task_3_plan_verification",
+    "task_5_plan_generalization",
+    "task_7_plan_execution",
+    "task_8_1_goal_shuffling",
+    "task_8_2_full_to_partial",
 ]
 
 
 class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for PlanBench - Planning and Reasoning Benchmark.
+    Extractor for PlanBench - Planning and Reasoning Benchmark (NeurIPS 2023).
 
     PlanBench evaluates LLMs on planning and reasoning about actions
     and change, using domains from the International Planning Competition.
 
-    Domains:
-    - Blocksworld: Classic blocks stacking problems
-    - Logistics: Package delivery with trucks and planes
-
-    Task Types:
-    - Plan generation and cost-optimal planning
-    - Plan verification
-    - Goal recognition
-    - Plan execution reasoning
-    - Action reordering
+    Dataset: tasksource/planbench (HuggingFace)
 
-    Dataset: GitHub karthikv792/LLMs-Planning
+    Available configs:
+    - task_1_plan_generation: Generate a valid plan
+    - task_2_plan_optimality: Generate cost-optimal plan
+    - task_3_plan_verification: Verify if a plan is valid
+    - task_5_plan_generalization: Generalize plan to new instances
+    - task_7_plan_execution: Predict execution outcome
+    - task_8_1_goal_shuffling: Handle shuffled goals
+    - task_8_2_full_to_partial: Full to partial observability
 
     For planning evaluation:
-    - Positive (correct) = Valid plan or correct reasoning
-    - Negative (incorrect) = Invalid plan or incorrect reasoning
+    - Positive (correct) = Valid plan matching ground truth
+    - Negative (incorrect) = Invalid or wrong plan
     """
 
-    # Evaluator that should be used for this benchmark
     evaluator_name = "planning_reasoning"
 
-    def __init__(self, domain: str = "blocksworld", task: str = "plan_generation"):
+    def __init__(self, config: str = "task_1_plan_generation"):
        """
        Initialize PlanBench extractor.

        Args:
-            domain: Planning domain ("blocksworld", "logistics")
-            task: Task type (e.g., "plan_generation", "plan_verification")
+            config: PlanBench task config (default: task_1_plan_generation)
        """
        super().__init__()
-        self.domain = domain
-        self.task = task
+        if config not in PLANBENCH_CONFIGS:
+            log.warning(f"Unknown config '{config}', using task_1_plan_generation")
+            config = "task_1_plan_generation"
+        self.config = config

    def extract_contrastive_pairs(
        self,
        limit: int | None = None,
    ) -> list[ContrastivePair]:
        """
-        Build contrastive pairs from PlanBench examples.
-
-        Creates pairs for planning evaluation:
-        - Positive (correct) = Valid planning solution
-        - Negative (incorrect) = Invalid planning solution
+        Build contrastive pairs from PlanBench.

        Args:
            limit: Optional maximum number of pairs to produce.
@@ -86,9 +75,8 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
        """
        max_items = self._normalize_limit(limit)

-        # PlanBench is on GitHub, create examples based on documented structure
-        docs = self._create_planbench_examples(max_items or 50)
-        log.info(f"Created {len(docs)} PlanBench examples ({self.domain}, {self.task})")
+        docs = self._load_planbench_data()
+        log.info(f"Loaded {len(docs)} PlanBench examples (config: {self.config})")

        pairs: list[ContrastivePair] = []

@@ -104,161 +92,60 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):

        return pairs

-    def _create_planbench_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create examples based on PlanBench structure."""
-        examples = []
-
-        if self.domain == "blocksworld":
-            examples = self._create_blocksworld_examples(count)
-        elif self.domain == "logistics":
-            examples = self._create_logistics_examples(count)
-        else:
-            examples = self._create_blocksworld_examples(count)
-
-        return examples
-
-    def _create_blocksworld_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create blocksworld planning examples."""
-        blocksworld_cases = [
-            {
-                "initial_state": "Block A is on the table. Block B is on Block A. Block C is on the table.",
-                "goal_state": "Block A is on Block B. Block B is on Block C.",
-                "valid_plan": [
-                    "1. Unstack B from A",
-                    "2. Put B on C",
-                    "3. Pick up A",
-                    "4. Stack A on B",
-                ],
-                "invalid_plan": [
-                    "1. Pick up A",  # Invalid - B is on A
-                    "2. Put A on B",
-                ],
-            },
-            {
-                "initial_state": "Block A is on Block B. Block B is on the table. Block C is on the table. The robot arm is empty.",
-                "goal_state": "Block B is on Block A. Block C is on Block B.",
-                "valid_plan": [
-                    "1. Unstack A from B",
-                    "2. Put A on the table",
-                    "3. Pick up B",
-                    "4. Stack B on A",
-                    "5. Pick up C",
-                    "6. Stack C on B",
-                ],
-                "invalid_plan": [
-                    "1. Stack B on A",  # Invalid - A is on B
-                ],
-            },
-            {
-                "initial_state": "Block A, B, and C are on the table. Block D is on Block A.",
-                "goal_state": "Block A is on Block B. Block B is on Block C. Block D is on Block A.",
-                "valid_plan": [
-                    "1. Unstack D from A",
-                    "2. Put D on table",
-                    "3. Pick up B",
-                    "4. Stack B on C",
-                    "5. Pick up A",
-                    "6. Stack A on B",
-                    "7. Pick up D",
-                    "8. Stack D on A",
-                ],
-                "invalid_plan": [
-                    "1. Pick up A",  # Invalid - D is on A
-                ],
-            },
-        ]
-
-        examples = []
-        for i in range(count):
-            case = blocksworld_cases[i % len(blocksworld_cases)].copy()
-            case["case_id"] = f"blocks_{i:03d}"
-            case["domain"] = "blocksworld"
-            examples.append(case)
-
-        return examples
-
-    def _create_logistics_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create logistics planning examples."""
-        logistics_cases = [
-            {
-                "initial_state": "Package P1 is in City A. Truck T1 is in City A. Package needs to go to City B.",
-                "goal_state": "Package P1 is in City B.",
-                "valid_plan": [
-                    "1. Load P1 onto T1 in City A",
-                    "2. Drive T1 from City A to City B",
-                    "3. Unload P1 from T1 in City B",
-                ],
-                "invalid_plan": [
-                    "1. Drive T1 from City A to City B",
-                    "2. Unload P1 from T1",  # Invalid - P1 was never loaded
-                ],
-            },
-            {
-                "initial_state": "Package P1 is in City A. Package P2 is in City B. Plane A1 is in City A. Goal: P1 in City C, P2 in City A.",
-                "goal_state": "Package P1 is in City C. Package P2 is in City A.",
-                "valid_plan": [
-                    "1. Load P1 onto Plane A1 in City A",
-                    "2. Fly A1 from City A to City B",
-                    "3. Load P2 onto A1 in City B",
-                    "4. Fly A1 from City B to City A",
-                    "5. Unload P2 in City A",
-                    "6. Fly A1 from City A to City C",
-                    "7. Unload P1 in City C",
-                ],
-                "invalid_plan": [
-                    "1. Fly A1 to City B",
-                    "2. Unload P1",  # P1 was never loaded
-                ],
-            },
-        ]
-
-        examples = []
-        for i in range(count):
-            case = logistics_cases[i % len(logistics_cases)].copy()
-            case["case_id"] = f"logistics_{i:03d}"
-            case["domain"] = "logistics"
-            examples.append(case)
-
-        return examples
+    def _load_planbench_data(self) -> list[dict[str, Any]]:
+        """Load PlanBench data from HuggingFace."""
+        try:
+            ds = load_dataset("tasksource/planbench", self.config, split="train")
+            examples = []
+            for i, item in enumerate(ds):
+                examples.append({
+                    "case_id": f"planbench_{self.config}_{i:04d}",
+                    "task": item.get("task", ""),
+                    "prompt_type": item.get("prompt_type", ""),
+                    "domain": item.get("domain", ""),
+                    "instance_id": item.get("instance_id", ""),
+                    "query": item.get("query", ""),
+                    "ground_truth_plan": item.get("ground_truth_plan", ""),
+                })
+            return examples
+        except Exception as e:
+            log.error(f"Failed to load PlanBench from HuggingFace: {e}")
+            raise RuntimeError(f"Cannot load PlanBench data: {e}")

    def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
        """
        Convert a single doc into a ContrastivePair.
+
+        PlanBench HuggingFace format:
+        {"task": "task_1_plan_generation", "prompt_type": "oneshot", "domain": "...",
+         "instance_id": 2, "query": "...", "ground_truth_plan": "..."}
        """
        try:
            case_id = doc.get("case_id", "")
-            initial_state = doc.get("initial_state", "").strip()
-            goal_state = doc.get("goal_state", "").strip()
-            valid_plan = doc.get("valid_plan", [])
-            invalid_plan = doc.get("invalid_plan", [])
-            domain = doc.get("domain", self.domain)
-
-            if not initial_state or not goal_state:
-                log.debug("Skipping: missing states")
-                return None
+            query = doc.get("query", "").strip()
+            ground_truth_plan = doc.get("ground_truth_plan", "").strip()
+            domain = doc.get("domain", "")
+            task = doc.get("task", "")

-            # Build the planning task prompt
-            task_prompt = self._build_planning_prompt(
-                initial_state, goal_state, domain
-            )
+            if not query or not ground_truth_plan:
+                log.debug("Skipping: missing query or ground_truth_plan")
+                return None

-            # Positive = valid plan
-            correct_response = self._create_valid_plan_response(valid_plan)
-            # Negative = invalid plan
-            incorrect_response = self._create_invalid_plan_response(invalid_plan)
+            correct_response = self._create_correct_response(ground_truth_plan)
+            incorrect_response = self._create_incorrect_response(ground_truth_plan)

            metadata = {
                "label": "planbench",
-                "source": "karthikv792/LLMs-Planning",
+                "source": "tasksource/planbench",
                "case_id": case_id,
                "domain": domain,
-                "task": self.task,
-                "plan_length": len(valid_plan),
+                "task": task,
+                "config": self.config,
                "is_planning_benchmark": True,
            }

            return self._build_pair(
-                question=task_prompt,
+                question=query,
                correct=correct_response,
                incorrect=incorrect_response,
                metadata=metadata,
@@ -268,50 +155,22 @@ class PlanBenchExtractor(HuggingFaceBenchmarkExtractor):
            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
            return None

-    def _build_planning_prompt(
-        self, initial_state: str, goal_state: str, domain: str
-    ) -> str:
-        """Build the planning task prompt."""
-        domain_desc = ""
-        if domain == "blocksworld":
-            domain_desc = (
-                "In this blocks world domain, you can:\n"
-                "- Pick up a block (only if nothing is on it and arm is empty)\n"
-                "- Put down a block on the table\n"
-                "- Stack a block on another (only if target block is clear)\n"
-                "- Unstack a block from another\n\n"
-            )
-        elif domain == "logistics":
-            domain_desc = (
-                "In this logistics domain, you can:\n"
-                "- Load packages onto trucks/planes (at same location)\n"
-                "- Unload packages from trucks/planes\n"
-                "- Drive trucks between locations in same city\n"
-                "- Fly planes between cities\n\n"
-            )
-
+    def _create_correct_response(self, ground_truth_plan: str) -> str:
+        """Create a response with the correct plan."""
        return (
-            f"{domain_desc}"
-            f"Initial State:\n{initial_state}\n\n"
-            f"Goal State:\n{goal_state}\n\n"
-            "Generate a valid sequence of actions to achieve the goal state from "
-            "the initial state. Ensure each action's preconditions are satisfied."
+            f"Here is the plan to achieve the goal:\n\n{ground_truth_plan}\n\n"
+            "Each action in this sequence has its preconditions satisfied."
        )

-    def _create_valid_plan_response(self, plan: list[str]) -> str:
-        """Create a response with a valid plan."""
-        plan_str = "\n".join(plan)
-        return (
-            f"Here is a valid plan to achieve the goal:\n\n{plan_str}\n\n"
-            "Each action in this sequence has its preconditions satisfied by the "
-            "previous actions, and executing them in order will achieve the goal state."
-        )
-
-    def _create_invalid_plan_response(self, plan: list[str]) -> str:
-        """Create a response with an invalid plan."""
-        plan_str = "\n".join(plan) if plan else "1. [Incomplete plan]"
+    def _create_incorrect_response(self, ground_truth_plan: str) -> str:
+        """Create an incorrect response (wrong/incomplete plan)."""
+        lines = ground_truth_plan.strip().split("\n")
+        if len(lines) > 1:
+            wrong_plan = "\n".join(reversed(lines[:2]))
+        else:
+            wrong_plan = "(noop)"
        return (
-            f"Here's my plan:\n\n{plan_str}\n\n"
+            f"Here's my plan:\n\n{wrong_plan}\n\n"
            "This should work to reach the goal."
        )
 
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py

@@ -130,13 +130,46 @@ class PolyMathExtractor(HuggingFaceBenchmarkExtractor):
             return None
 
     def _create_incorrect_answer(self, correct: str) -> str:
-        """Create an incorrect answer by modifying the correct one."""
+        """Create a meaningful incorrect answer using plausible wrong values."""
+        import random
+        import re
+        random.seed(hash(correct) % (2**32))
+
+        # Try symbolic parsing first
         try:
             parsed_correct = parse_latex(correct)
-            incorrect = str(latex(parsed_correct + 1))
-            return incorrect
+            transforms = [
+                parsed_correct * 2,
+                parsed_correct / 2,
+                parsed_correct - 1,
+                -parsed_correct,
+            ]
+            wrong = random.choice(transforms)
+            return str(latex(wrong))
         except Exception:
-            return f"{correct} + 1"
+            pass
+
+        # Try simple integer
+        try:
+            clean = correct.replace('$', '').replace(',', '').strip()
+            num = int(clean)
+            wrong_vals = [num * 2, num // 2 if num > 1 else num * 3, num - 1, -num]
+            return str(random.choice(wrong_vals))
+        except ValueError:
+            pass
+
+        # For fractions
+        frac_match = re.match(r'\\frac\{(\d+)\}\{(\d+)\}', correct)
+        if frac_match:
+            n, d = int(frac_match.group(1)), int(frac_match.group(2))
+            return random.choice([f"\\frac{{{d}}}{{{n}}}", f"\\frac{{{n*2}}}{{{d}}}"])
+
+        # For pi expressions
+        if '\\pi' in correct:
+            return correct.replace('\\pi', '2\\pi') if '2\\pi' not in correct else correct.replace('2\\pi', '\\pi')
+
+        # Fallback
+        return random.choice(['0', '1', '-1', '2'])
 
 
     @staticmethod
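One design note on the new PolyMath seeding: random.seed(hash(correct) % (2**32)) makes the wrong-answer choice deterministic per answer only within a single process, because Python salts str hashing per interpreter run (PYTHONHASHSEED). A sketch of a cross-run-stable variant using hashlib; the helper name is illustrative, not part of the package:

    import hashlib
    import random

    def stable_rng(correct: str) -> random.Random:
        # SHA-256 of the answer text yields the same seed in every process,
        # unlike the builtin hash(), which is salted per run.
        seed = int.from_bytes(hashlib.sha256(correct.encode()).digest()[:8], "big")
        return random.Random(seed)

    rng = stable_rng("\\frac{3}{4}")
    print(rng.choice(["\\frac{4}{3}", "\\frac{6}{4}"]))  # same pick on every run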