wisent 0.7.701__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/activation_cache.py +393 -0
  3. wisent/core/activations/activations.py +3 -3
  4. wisent/core/activations/activations_collector.py +9 -5
  5. wisent/core/activations/classifier_inference_strategy.py +12 -11
  6. wisent/core/activations/extraction_strategy.py +256 -84
  7. wisent/core/classifiers/classifiers/core/atoms.py +3 -2
  8. wisent/core/cli/__init__.py +2 -1
  9. wisent/core/cli/agent/apply_steering.py +5 -7
  10. wisent/core/cli/agent/train_classifier.py +19 -7
  11. wisent/core/cli/check_linearity.py +35 -3
  12. wisent/core/cli/cluster_benchmarks.py +4 -6
  13. wisent/core/cli/create_steering_vector.py +6 -4
  14. wisent/core/cli/diagnose_vectors.py +7 -4
  15. wisent/core/cli/estimate_unified_goodness_time.py +6 -4
  16. wisent/core/cli/generate_pairs_from_task.py +9 -56
  17. wisent/core/cli/geometry_search.py +137 -0
  18. wisent/core/cli/get_activations.py +1 -1
  19. wisent/core/cli/method_optimizer.py +4 -3
  20. wisent/core/cli/modify_weights.py +3 -2
  21. wisent/core/cli/optimize_sample_size.py +1 -1
  22. wisent/core/cli/optimize_steering.py +14 -16
  23. wisent/core/cli/optimize_weights.py +2 -1
  24. wisent/core/cli/preview_pairs.py +203 -0
  25. wisent/core/cli/steering_method_trainer.py +3 -3
  26. wisent/core/cli/tasks.py +19 -76
  27. wisent/core/cli/train_unified_goodness.py +3 -3
  28. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +4 -4
  29. wisent/core/contrastive_pairs/diagnostics/linearity.py +7 -0
  30. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +37 -347
  31. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +113 -136
  32. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +2 -12
  33. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +124 -504
  34. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +40 -63
  35. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +46 -89
  36. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +15 -4
  37. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +36 -20
  38. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +3 -45
  39. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +42 -4
  40. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +2 -112
  41. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +39 -4
  42. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +475 -525
  43. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +65 -42
  44. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +2 -12
  45. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +78 -219
  46. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +37 -4
  47. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +84 -69
  48. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +168 -160
  49. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +44 -25
  50. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +3 -103
  51. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +3 -97
  52. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +48 -182
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +3 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +19 -1
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +1 -3
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +1 -3
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +1 -3
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +2 -4
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +1 -3
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +1 -3
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +2 -2
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +1 -3
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +1 -3
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +1 -3
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +1 -3
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +1 -3
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +1 -3
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +1 -3
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +1 -3
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +1 -2
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +1 -2
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +2 -2
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +2 -2
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +36 -2
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +1 -3
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +1 -3
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +1 -3
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +1 -3
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +1 -3
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +1 -3
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +3 -5
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +1 -3
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +1 -3
  84. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +2 -2
  85. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +1 -3
  86. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +1 -3
  87. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +1 -3
  88. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +1 -3
  89. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +1 -3
  90. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +1 -3
  91. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +2 -2
  92. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +1 -3
  93. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +1 -3
  94. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +1 -3
  95. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +1 -3
  96. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +1 -3
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +1 -3
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +1 -3
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +1 -3
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +1 -3
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +1 -3
  102. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +1 -3
  103. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +1 -3
  104. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +2 -2
  105. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +1 -3
  106. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +3 -4
  107. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +1 -3
  108. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +1 -3
  109. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +1 -3
  110. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +2 -6
  111. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +1 -3
  112. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +1 -3
  113. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +1 -3
  114. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +1 -3
  115. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +2 -2
  116. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +1 -3
  117. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +1 -3
  118. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +1 -3
  119. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +1 -3
  120. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +1 -3
  121. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +1 -3
  122. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +1 -3
  123. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +1 -3
  124. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +1 -3
  125. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +1 -3
  126. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +1 -3
  127. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +1 -3
  128. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +1 -3
  129. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +1 -3
  130. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +1 -3
  131. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +1 -3
  132. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +1 -3
  133. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +2 -2
  134. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +1 -3
  135. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +1 -3
  136. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +1 -3
  137. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +1 -3
  138. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +1 -3
  139. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +1 -3
  140. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +2 -2
  141. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +2 -2
  142. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +5 -9
  143. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +63 -16
  144. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +1 -3
  145. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +1 -3
  146. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +1 -3
  147. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +1 -3
  148. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +1 -3
  149. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +1 -3
  150. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +1 -3
  151. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +1 -3
  152. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +1 -1
  153. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +1 -3
  154. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +1 -3
  155. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +5 -17
  156. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +1 -3
  157. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +1 -3
  158. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +1 -3
  159. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +1 -3
  160. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +1 -3
  161. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +1 -3
  162. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +1 -3
  163. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +2 -2
  164. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +2 -2
  165. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +1 -3
  166. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +1 -3
  167. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +2 -4
  168. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +2 -2
  169. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +2 -4
  170. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +1 -3
  171. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +1 -3
  172. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +1 -3
  173. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +2 -2
  174. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +2 -2
  175. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +1 -3
  176. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +1 -3
  177. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +1 -3
  178. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +3 -4
  179. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +2 -2
  180. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +2 -5
  181. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +2 -2
  182. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +2 -2
  183. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +1 -3
  184. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +1 -3
  185. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +1 -3
  186. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +1 -3
  187. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +4 -8
  188. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +4 -8
  189. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +2 -2
  190. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +1 -3
  191. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +1 -3
  192. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +1 -3
  193. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +2 -5
  194. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +1 -3
  195. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +2 -2
  196. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +1 -3
  197. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +1 -3
  198. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +1 -3
  199. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +1 -3
  200. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +1 -3
  201. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +2 -2
  202. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +1 -3
  203. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +1 -3
  204. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +1 -3
  205. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +2 -2
  206. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +1 -3
  207. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +1 -3
  208. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +2 -2
  209. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +2 -2
  210. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +2 -2
  211. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +2 -2
  212. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +2 -2
  213. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +2 -2
  214. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +1 -3
  215. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +2 -2
  216. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +2 -2
  217. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +1 -3
  218. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +2 -2
  219. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +1 -3
  220. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +2 -2
  221. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +1 -3
  222. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +2 -2
  223. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +1 -3
  224. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +1 -3
  225. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +1 -3
  226. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +1 -3
  227. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +1 -3
  228. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +2 -1
  229. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +4 -7
  230. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +2 -2
  231. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +1 -3
  232. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +2 -6
  233. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +1 -3
  234. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +1 -3
  235. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +1 -3
  236. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +2 -2
  237. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +1 -3
  238. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +1 -3
  239. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +1 -3
  240. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +1 -3
  241. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +1 -3
  242. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +1 -3
  243. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +1 -3
  244. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +1 -3
  245. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +1 -3
  246. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +1 -3
  247. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +1 -3
  248. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +1 -3
  249. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +2 -2
  250. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +1 -3
  251. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +1 -3
  252. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +1 -3
  253. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +1 -3
  254. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +1 -3
  255. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +0 -2
  256. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +1 -3
  257. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +2 -2
  258. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +2 -2
  259. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +3 -4
  260. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +2 -2
  261. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +1 -3
  262. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +2 -2
  263. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +2 -2
  264. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +1 -3
  265. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +1 -3
  266. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +1 -3
  267. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +2 -2
  268. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +2 -4
  269. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +2 -3
  270. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +2 -2
  271. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +1 -3
  272. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +173 -6
  273. wisent/core/data_loaders/loaders/lm_loader.py +12 -1
  274. wisent/core/geometry_runner.py +995 -0
  275. wisent/core/geometry_search_space.py +237 -0
  276. wisent/core/hyperparameter_optimizer.py +1 -1
  277. wisent/core/main.py +3 -0
  278. wisent/core/models/core/atoms.py +5 -3
  279. wisent/core/models/wisent_model.py +1 -1
  280. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  281. wisent/core/parser_arguments/check_linearity_parser.py +12 -2
  282. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +2 -2
  283. wisent/core/parser_arguments/generate_vector_from_task_parser.py +2 -2
  284. wisent/core/parser_arguments/geometry_search_parser.py +61 -0
  285. wisent/core/parser_arguments/main_parser.py +8 -0
  286. wisent/core/parser_arguments/train_unified_goodness_parser.py +2 -2
  287. wisent/core/steering.py +5 -3
  288. wisent/core/steering_methods/methods/hyperplane.py +2 -1
  289. wisent/core/synthetic/generators/nonsense_generator.py +30 -18
  290. wisent/core/trainers/steering_trainer.py +2 -2
  291. wisent/core/utils/device.py +27 -27
  292. wisent/core/utils/layer_combinations.py +70 -0
  293. wisent/examples/__init__.py +1 -0
  294. wisent/examples/scripts/__init__.py +1 -0
  295. wisent/examples/scripts/count_all_benchmarks.py +121 -0
  296. wisent/examples/scripts/discover_directions.py +469 -0
  297. wisent/examples/scripts/extract_benchmark_info.py +71 -0
  298. wisent/examples/scripts/generate_paper_data.py +384 -0
  299. wisent/examples/scripts/intervention_validation.py +626 -0
  300. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_evaluation.json +324 -0
  301. wisent/examples/scripts/results/test_AraDiCE_ArabicMMLU_lev_pairs.json +92 -0
  302. wisent/examples/scripts/results/test_aexams_IslamicStudies_evaluation.json +324 -0
  303. wisent/examples/scripts/results/test_aexams_IslamicStudies_pairs.json +92 -0
  304. wisent/examples/scripts/results/test_afrimgsm_pairs.json +92 -0
  305. wisent/examples/scripts/results/test_afrimmlu_evaluation.json +324 -0
  306. wisent/examples/scripts/results/test_afrimmlu_pairs.json +92 -0
  307. wisent/examples/scripts/search_all_short_names.py +31 -0
  308. wisent/examples/scripts/test_all_benchmarks.py +138 -0
  309. wisent/examples/scripts/test_all_benchmarks_new.py +28 -0
  310. wisent/examples/scripts/test_contrastive_pairs_all_supported.py +230 -0
  311. wisent/examples/scripts/test_nonsense_baseline.py +261 -0
  312. wisent/examples/scripts/test_one_benchmark.py +324 -0
  313. wisent/examples/scripts/test_one_coding_benchmark.py +293 -0
  314. wisent/examples/scripts/threshold_analysis.py +434 -0
  315. wisent/examples/scripts/visualization_gallery.py +582 -0
  316. wisent/parameters/lm_eval/broken_in_lm_eval.json +179 -2
  317. wisent/parameters/lm_eval/category_directions.json +137 -0
  318. wisent/parameters/lm_eval/repair_plan.json +282 -0
  319. wisent/parameters/lm_eval/weak_contrastive_pairs.json +38 -0
  320. wisent/parameters/lm_eval/working_benchmarks.json +206 -0
  321. wisent/parameters/lm_eval/working_benchmarks_categorized.json +236 -0
  322. wisent/tests/test_detector_accuracy.py +1 -1
  323. wisent/tests/visualize_geometry.py +1 -1
  324. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/METADATA +1 -1
  325. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/RECORD +329 -295
  326. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +0 -245
  327. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/WHEEL +0 -0
  328. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/entry_points.txt +0 -0
  329. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/licenses/LICENSE +0 -0
  330. {wisent-0.7.701.dist-info → wisent-0.7.901.dist-info}/top_level.txt +0 -0
wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py

@@ -2,15 +2,22 @@ from __future__ import annotations

 from typing import Any
 from wisent.core.cli_logger import setup_logger
+import requests
+import zipfile
+import json
+import io

 from wisent.core.contrastive_pairs.core.pair import ContrastivePair
 from wisent.core.contrastive_pairs.huggingface_pairs.atoms import HuggingFaceBenchmarkExtractor
 from wisent.core.errors import InvalidValueError

-__all__ = ["OJBenchExtractor", "TerminalBenchExtractor", "SciCodeExtractor"]
+__all__ = ["OJBenchExtractor", "NL2BashExtractor", "SciCodeExtractor"]

 log = setup_logger(__name__)

+# GitHub URL for SciCode data
+SCICODE_GITHUB_URL = "https://raw.githubusercontent.com/scicode-bench/scicode-bench.github.io/main/data/data.zip"
+

 class OJBenchExtractor(HuggingFaceBenchmarkExtractor):
     """
@@ -66,9 +73,9 @@ class OJBenchExtractor(HuggingFaceBenchmarkExtractor):
             )
             log.info(f"Loaded {len(docs)} examples from code_contests")
         except Exception as e:
-            log.warning(f"Failed to load code_contests: {e}")
-            # Create synthetic competitive programming examples
-            docs = self._create_synthetic_examples(max_items or 100)
+            log.error(f"Failed to load code_contests dataset: {e}")
+            log.error("OJBench requires deepmind/code_contests dataset. No synthetic data available.")
+            return []

         pairs: list[ContrastivePair] = []

@@ -84,256 +91,6 @@ class OJBenchExtractor(HuggingFaceBenchmarkExtractor):

         return pairs

-    def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create synthetic competitive programming examples."""
-        examples = [
-            {
-                "description": """Problem: Two Sum
-Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.
-
-Input: First line contains n (1 ≤ n ≤ 10^5) and target. Second line contains n space-separated integers.
-Output: Two indices (0-indexed) separated by space.
-
-Example:
-Input:
-4 9
-2 7 11 15
-Output:
-0 1""",
-                "correct_solution": """#include <bits/stdc++.h>
-using namespace std;
-
-int main() {
-    ios::sync_with_stdio(false);
-    cin.tie(nullptr);
-
-    int n, target;
-    cin >> n >> target;
-
-    vector<int> nums(n);
-    unordered_map<int, int> mp;
-
-    for (int i = 0; i < n; i++) {
-        cin >> nums[i];
-        int complement = target - nums[i];
-        if (mp.count(complement)) {
-            cout << mp[complement] << " " << i << endl;
-            return 0;
-        }
-        mp[nums[i]] = i;
-    }
-
-    return 0;
-}""",
-                "incorrect_solution": """#include <bits/stdc++.h>
-using namespace std;
-
-int main() {
-    int n, target;
-    cin >> n >> target;
-
-    vector<int> nums(n);
-    for (int i = 0; i < n; i++) cin >> nums[i];
-
-    // O(n^2) - will TLE on large inputs
-    for (int i = 0; i < n; i++) {
-        for (int j = 0; j < n; j++) { // Bug: should start from i+1
-            if (nums[i] + nums[j] == target) {
-                cout << i << " " << j << endl;
-                return 0;
-            }
-        }
-    }
-    return 0;
-}""",
-                "difficulty": "easy",
-            },
-            {
-                "description": """Problem: Maximum Subarray Sum
-Find the contiguous subarray with the largest sum.
-
-Input: First line contains n (1 ≤ n ≤ 10^6). Second line contains n integers (-10^9 ≤ a[i] ≤ 10^9).
-Output: Maximum subarray sum.
-
-Example:
-Input:
-8
--2 1 -3 4 -1 2 1 -5 4
-Output:
-6""",
-                "correct_solution": """#include <bits/stdc++.h>
-using namespace std;
-
-int main() {
-    ios::sync_with_stdio(false);
-    cin.tie(nullptr);
-
-    int n;
-    cin >> n;
-
-    long long maxSum = LLONG_MIN;
-    long long currentSum = 0;
-
-    for (int i = 0; i < n; i++) {
-        long long x;
-        cin >> x;
-        currentSum = max(x, currentSum + x);
-        maxSum = max(maxSum, currentSum);
-    }
-
-    cout << maxSum << endl;
-    return 0;
-}""",
-                "incorrect_solution": """#include <bits/stdc++.h>
-using namespace std;
-
-int main() {
-    int n;
-    cin >> n;
-
-    vector<int> a(n);
-    for (int i = 0; i < n; i++) cin >> a[i];
-
-    int maxSum = 0; // Bug: should be LLONG_MIN for negative arrays
-    int currentSum = 0;
-
-    for (int i = 0; i < n; i++) {
-        currentSum += a[i]; // Bug: doesn't handle Kadane's algorithm correctly
-        if (currentSum > maxSum) maxSum = currentSum;
-        if (currentSum < 0) currentSum = 0;
-    }
-
-    cout << maxSum << endl;
-    return 0;
-}""",
-                "difficulty": "medium",
-            },
-            {
-                "description": """Problem: Segment Tree Range Sum
-Given an array, support two operations:
-1. Update a[i] = x
-2. Query sum(l, r)
-
-Input: First line n, q. Second line is initial array. Next q lines are operations.
-Output: Answer for each query operation.
-
-Example:
-Input:
-5 3
-1 2 3 4 5
-2 1 3
-1 2 10
-2 1 3
-Output:
-6
-14""",
-                "correct_solution": """#include <bits/stdc++.h>
-using namespace std;
-
-class SegmentTree {
-    vector<long long> tree;
-    int n;
-
-public:
-    SegmentTree(vector<int>& arr) {
-        n = arr.size();
-        tree.resize(4 * n);
-        build(arr, 1, 0, n - 1);
-    }
-
-    void build(vector<int>& arr, int v, int tl, int tr) {
-        if (tl == tr) {
-            tree[v] = arr[tl];
-        } else {
-            int tm = (tl + tr) / 2;
-            build(arr, 2*v, tl, tm);
-            build(arr, 2*v+1, tm+1, tr);
-            tree[v] = tree[2*v] + tree[2*v+1];
-        }
-    }
-
-    void update(int v, int tl, int tr, int pos, int val) {
-        if (tl == tr) {
-            tree[v] = val;
-        } else {
-            int tm = (tl + tr) / 2;
-            if (pos <= tm) update(2*v, tl, tm, pos, val);
-            else update(2*v+1, tm+1, tr, pos, val);
-            tree[v] = tree[2*v] + tree[2*v+1];
-        }
-    }
-
-    long long query(int v, int tl, int tr, int l, int r) {
-        if (l > r) return 0;
-        if (l == tl && r == tr) return tree[v];
-        int tm = (tl + tr) / 2;
-        return query(2*v, tl, tm, l, min(r, tm)) +
-               query(2*v+1, tm+1, tr, max(l, tm+1), r);
-    }
-
-    void update(int pos, int val) { update(1, 0, n-1, pos, val); }
-    long long query(int l, int r) { return query(1, 0, n-1, l, r); }
-};
-
-int main() {
-    ios::sync_with_stdio(false);
-    cin.tie(nullptr);
-
-    int n, q;
-    cin >> n >> q;
-
-    vector<int> a(n);
-    for (int i = 0; i < n; i++) cin >> a[i];
-
-    SegmentTree st(a);
-
-    while (q--) {
-        int type, x, y;
-        cin >> type >> x >> y;
-        if (type == 1) {
-            st.update(x - 1, y);
-        } else {
-            cout << st.query(x - 1, y - 1) << "\\n";
-        }
-    }
-
-    return 0;
-}""",
-                "incorrect_solution": """#include <bits/stdc++.h>
-using namespace std;
-
-int main() {
-    int n, q;
-    cin >> n >> q;
-
-    vector<int> a(n);
-    for (int i = 0; i < n; i++) cin >> a[i];
-
-    // O(n) per query - will TLE
-    while (q--) {
-        int type, x, y;
-        cin >> type >> x >> y;
-        if (type == 1) {
-            a[x-1] = y;
-        } else {
-            int sum = 0;
-            for (int i = x-1; i < y; i++) sum += a[i];
-            cout << sum << "\\n";
-        }
-    }
-    return 0;
-}""",
-                "difficulty": "hard",
-            },
-        ]
-
-        result = []
-        for i in range(count):
-            example = examples[i % len(examples)].copy()
-            result.append(example)
-
-        return result
-
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
         """Convert a single doc into a ContrastivePair."""
         try:
@@ -436,38 +193,30 @@ int main() {



-class TerminalBenchExtractor(HuggingFaceBenchmarkExtractor):
+class NL2BashExtractor(HuggingFaceBenchmarkExtractor):
     """
-    Extractor for Terminal-Bench - terminal/CLI interaction benchmark.
+    Extractor for NL2Bash - Natural Language to Bash command generation.

-    Terminal-Bench evaluates LLMs' ability to interact with command-line
-    interfaces, execute shell commands, navigate filesystems, and perform
-    system administration tasks.
+    Dataset: jiacheng-ye/nl2bash on HuggingFace
+
+    NL2Bash evaluates LLMs' ability to translate natural language descriptions
+    into correct Bash shell commands. Tests command syntax, flag usage,
+    and understanding of CLI tools.

-    For terminal interaction evaluation:
-    - Positive (correct) = Correct commands with proper syntax and expected behavior
-    - Negative (incorrect) = Commands with errors, wrong syntax, or dangerous operations
+    For bash command generation evaluation:
+    - Positive (correct) = Correct bash command with proper syntax
+    - Negative (incorrect) = Command with errors, wrong syntax, or missing parts
     """

     # Evaluator that should be used for this benchmark
-    evaluator_name = "terminal_interaction"
-
-    def __init__(self, os_type: str = "linux"):
-        """
-        Initialize Terminal-Bench extractor.
-
-        Args:
-            os_type: Operating system type (linux, macos, windows)
-        """
-        super().__init__()
-        self.os_type = os_type
+    evaluator_name = "bash_generation"

     def extract_contrastive_pairs(
         self,
         limit: int | None = None,
     ) -> list[ContrastivePair]:
         """
-        Build contrastive pairs from Terminal-Bench examples.
+        Build contrastive pairs from NL2Bash dataset.

         Args:
             limit: Optional maximum number of pairs to produce.
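Editor's note: the renamed extractor keeps the public extract_contrastive_pairs(limit=...) entry point shown above. A minimal usage sketch follows; the no-argument constructor is an assumption based on the removal of the old __init__ in this diff and is not verified against the released wheel.

    from wisent.core.contrastive_pairs.huggingface_pairs.hf_task_extractors.coding_benchmarks import NL2BashExtractor

    # Hypothetical driver: pull a handful of NL2Bash contrastive pairs.
    extractor = NL2BashExtractor()                         # assumed no-arg constructor
    pairs = extractor.extract_contrastive_pairs(limit=5)   # returns [] if the dataset fails to load
    print(f"extracted {len(pairs)} pairs")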
@@ -477,20 +226,16 @@ class TerminalBenchExtractor(HuggingFaceBenchmarkExtractor):
         """
         max_items = self._normalize_limit(limit)

-        # Try loading NL2Bash dataset
-        docs = []
-
         try:
             docs = self.load_dataset(
                 dataset_name="jiacheng-ye/nl2bash",
                 split="test",
-                limit=max_items * 2 if max_items else None,
+                limit=max_items,
             )
             log.info(f"Loaded {len(docs)} examples from nl2bash")
         except Exception as e:
-            log.warning(f"Failed to load nl2bash: {e}")
-            # Create synthetic terminal examples
-            docs = self._create_synthetic_examples(max_items or 100)
+            log.error(f"Failed to load nl2bash dataset: {e}")
+            return []

         pairs: list[ContrastivePair] = []

@@ -502,116 +247,40 @@ class TerminalBenchExtractor(HuggingFaceBenchmarkExtractor):
                 break

         if not pairs:
-            log.warning("No valid Terminal-Bench pairs extracted")
+            log.warning("No valid NL2Bash pairs extracted")

         return pairs

-    def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create synthetic terminal interaction examples."""
-        examples = [
-            {
-                "nl": "Find all Python files in the current directory and subdirectories",
-                "correct_command": "find . -name '*.py' -type f",
-                "incorrect_command": "find *.py",  # Wrong syntax
-                "category": "file_search",
-            },
-            {
-                "nl": "Count the number of lines in all text files in the current directory",
-                "correct_command": "wc -l *.txt | tail -1",
-                "incorrect_command": "count lines *.txt",  # Not a real command
-                "category": "file_analysis",
-            },
-            {
-                "nl": "Create a compressed archive of the logs directory",
-                "correct_command": "tar -czvf logs.tar.gz logs/",
-                "incorrect_command": "zip logs/ archive",  # Wrong argument order
-                "category": "archiving",
-            },
-            {
-                "nl": "Show running processes sorted by memory usage",
-                "correct_command": "ps aux --sort=-%mem | head -20",
-                "incorrect_command": "ps memory",  # Invalid syntax
-                "category": "process_management",
-            },
-            {
-                "nl": "Find and kill all processes named 'python'",
-                "correct_command": "pkill -f python",
-                "incorrect_command": "kill python",  # kill needs PID, not name
-                "category": "process_management",
-            },
-            {
-                "nl": "Download a file from a URL and save it with a specific name",
-                "correct_command": "curl -o output.txt https://example.com/file.txt",
-                "incorrect_command": "download https://example.com/file.txt",  # Not a command
-                "category": "networking",
-            },
-            {
-                "nl": "Find files modified in the last 24 hours",
-                "correct_command": "find . -mtime -1 -type f",
-                "incorrect_command": "find . modified 24h",  # Wrong syntax
-                "category": "file_search",
-            },
-            {
-                "nl": "Replace all occurrences of 'foo' with 'bar' in a file in-place",
-                "correct_command": "sed -i 's/foo/bar/g' file.txt",
-                "incorrect_command": "replace foo bar file.txt",  # Not a command
-                "category": "text_processing",
-            },
-            {
-                "nl": "Check disk space usage for all mounted filesystems",
-                "correct_command": "df -h",
-                "incorrect_command": "disk space",  # Not a command
-                "category": "system_info",
-            },
-            {
-                "nl": "Create a new user named 'developer' with home directory",
-                "correct_command": "sudo useradd -m -s /bin/bash developer",
-                "incorrect_command": "create user developer",  # Not a command
-                "category": "user_management",
-            },
-        ]
-
-        result = []
-        for i in range(count):
-            example = examples[i % len(examples)].copy()
-            result.append(example)
-
-        return result
-
     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-        """Convert a single doc into a ContrastivePair."""
+        """Convert a single doc into a ContrastivePair.
+
+        nl2bash schema:
+        - nl: str (natural language description)
+        - bash: str (correct bash command)
+        """
         try:
-            # Handle nl2bash schema
-            nl = doc.get("nl", doc.get("description", "")).strip()
-            correct = doc.get("correct_command", doc.get("bash", "")).strip()
-            incorrect = doc.get("incorrect_command", "").strip()
-            category = doc.get("category", "general")
+            nl = doc.get("nl", "").strip()
+            correct = doc.get("bash", "").strip()

-            if not nl:
+            if not nl or not correct:
                 return None

-            if not correct:
-                return None
-
-            if not incorrect:
-                incorrect = self._create_incorrect_command(nl)
-
-            task_prompt = f"""Terminal Command Task:
+            task_prompt = f"""Bash Command Task:

 {nl}

-Provide the correct {self.os_type} terminal command to accomplish this task.
-The command should be safe, efficient, and follow best practices."""
+Provide the correct bash command to accomplish this task."""

-            correct_response = f"```bash\n{correct}\n```\n\nThis command correctly accomplishes the task."
-            incorrect_response = f"```bash\n{incorrect}\n```\n\nNote: This command may have syntax errors or may not work as intended."
+            # Create incorrect by corrupting the command
+            incorrect = self._create_incorrect_command(correct)
+
+            correct_response = f"```bash\n{correct}\n```"
+            incorrect_response = f"```bash\n{incorrect}\n```"

             metadata = {
-                "label": "terminal_bench",
-                "source": "terminal_bench",
-                "category": category,
-                "os_type": self.os_type,
-                "is_terminal_benchmark": True,
+                "label": "nl2bash",
+                "source": "jiacheng-ye/nl2bash",
+                "is_bash_benchmark": True,
             }

             return self._build_pair(
@@ -625,9 +294,13 @@ The command should be safe, efficient, and follow best practices."""
             log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
             return None

-    def _create_incorrect_command(self, description: str) -> str:
-        """Create a plausible but incorrect command."""
-        return "# Command with incorrect syntax or missing flags"
+    def _create_incorrect_command(self, correct: str) -> str:
+        """Create a plausible but incorrect command by corrupting the correct one."""
+        # Remove a flag or part of the command
+        parts = correct.split()
+        if len(parts) > 2:
+            return " ".join(parts[:-1])  # Remove last part
+        return correct + " --invalid-flag"
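Editor's note: the new corruption heuristic above is deterministic. A small standalone sketch of its behaviour on sample commands; the function body mirrors the method in the hunk, and the example inputs are illustrative.

    def corrupt_command(correct: str) -> str:
        """Standalone mirror of _create_incorrect_command for illustration."""
        parts = correct.split()
        if len(parts) > 2:
            return " ".join(parts[:-1])  # drop the last token
        return correct + " --invalid-flag"

    print(corrupt_command("find . -name '*.py' -type f"))  # -> find . -name '*.py' -type
    print(corrupt_command("df -h"))                         # -> df -h --invalid-flag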



@@ -635,11 +308,12 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
     """
     Extractor for SciCode - scientific computing code generation benchmark.

-    SciCode evaluates LLMs' ability to generate code for scientific computing
-    tasks including numerical methods, data analysis, and domain-specific
-    scientific computations.
+    GitHub: https://scicode-bench.github.io/
+    Paper: "SciCode: A Research Coding Benchmark Curated by Scientists"

-    Dataset: Various scientific computing datasets
+    SciCode evaluates LLMs' ability to generate code for scientific computing
+    tasks across Physics, Math, Material Science, Biology, and Chemistry.
+    Contains 338 subproblems from 80 main challenges.

     For scientific computing evaluation:
     - Positive (correct) = Scientifically accurate code with proper numerical methods
@@ -666,6 +340,8 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
         """
         Build contrastive pairs from SciCode examples.

+        Loads data from GitHub ZIP archive.
+
         Args:
             limit: Optional maximum number of pairs to produce.

@@ -673,16 +349,21 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):
             A list of ContrastivePair objects.
         """
         max_items = self._normalize_limit(limit)
+        pairs: list[ContrastivePair] = []

-        # Create synthetic scientific computing examples
-        docs = self._create_synthetic_examples(max_items or 100)
+        docs = self._load_from_github()
+
+        if not docs:
+            log.error("Failed to load SciCode data from GitHub")
+            return []

-        pairs: list[ContrastivePair] = []
+        log.info(f"Loaded {len(docs)} problems from SciCode GitHub")

         for doc in docs:
-            if self.domain and doc.get("domain") != self.domain:
+            # Filter by domain if specified
+            if self.domain and doc.get("domain", "").lower() != self.domain.lower():
                 continue
-
+
             pair = self._extract_pair_from_doc(doc)
             if pair is not None:
                 pairs.append(pair)
@@ -694,125 +375,55 @@ class SciCodeExtractor(HuggingFaceBenchmarkExtractor):

         return pairs

-    def _create_synthetic_examples(self, count: int) -> list[dict[str, Any]]:
-        """Create synthetic scientific computing examples."""
-        examples = [
-            {
-                "problem": "Implement numerical integration using Simpson's rule",
-                "domain": "mathematics",
-                "correct_solution": """import numpy as np
-
-def simpsons_rule(f, a, b, n):
-    '''
-    Integrate f(x) from a to b using Simpson's rule with n intervals.
-    n must be even.
-    '''
-    if n % 2 != 0:
-        raise InvalidValueError(param_name="n", actual=n, expected="even number for Simpson's rule")
-
-    h = (b - a) / n
-    x = np.linspace(a, b, n + 1)
-    y = f(x)
-
-    # Simpson's rule: h/3 * (y_0 + 4*y_1 + 2*y_2 + 4*y_3 + ... + y_n)
-    integral = y[0] + y[-1]
-    integral += 4 * np.sum(y[1:-1:2])  # odd indices
-    integral += 2 * np.sum(y[2:-1:2])  # even indices (except first and last)
-
-    return integral * h / 3
-
-# Example: Integrate sin(x) from 0 to pi (expected: 2.0)
-result = simpsons_rule(np.sin, 0, np.pi, 100)
-print(f"Integral of sin(x) from 0 to pi: {result:.10f}")""",
-                "incorrect_solution": """import numpy as np
-
-def simpsons_rule(f, a, b, n):
-    h = (b - a) / n
-    x = np.linspace(a, b, n)  # Bug: should be n+1 points
-    y = f(x)
-
-    # Wrong implementation - missing proper weighting
-    integral = np.sum(y) * h  # This is just rectangular rule
-
-    return integral""",
-            },
-            {
-                "problem": "Solve a system of ODEs using Runge-Kutta 4th order method",
-                "domain": "physics",
-                "correct_solution": """import numpy as np
-
-def rk4_step(f, t, y, h):
-    '''
-    Single step of RK4 method.
-    f: function f(t, y) returning dy/dt
-    t: current time
-    y: current state vector
-    h: step size
-    '''
-    k1 = h * f(t, y)
-    k2 = h * f(t + h/2, y + k1/2)
-    k3 = h * f(t + h/2, y + k2/2)
-    k4 = h * f(t + h, y + k3)
-
-    return y + (k1 + 2*k2 + 2*k3 + k4) / 6
-
-def solve_ode(f, y0, t_span, n_steps):
-    '''
-    Solve ODE system dy/dt = f(t, y) using RK4.
-    '''
-    t = np.linspace(t_span[0], t_span[1], n_steps + 1)
-    h = t[1] - t[0]
-
-    y = np.zeros((n_steps + 1, len(y0)))
-    y[0] = y0
-
-    for i in range(n_steps):
-        y[i+1] = rk4_step(f, t[i], y[i], h)
-
-    return t, y
-
-# Example: Simple harmonic oscillator
-def harmonic(t, y):
-    return np.array([y[1], -y[0]])
-
-t, y = solve_ode(harmonic, np.array([1.0, 0.0]), [0, 10], 1000)""",
-                "incorrect_solution": """import numpy as np
-
-def euler_step(f, t, y, h):
-    # Using Euler method instead of RK4 - much less accurate
-    return y + h * f(t, y)
-
-def solve_ode(f, y0, t_span, n_steps):
-    t = np.linspace(t_span[0], t_span[1], n_steps)  # Bug: should be n_steps+1
-    h = (t_span[1] - t_span[0]) / n_steps
-
-    y = [y0]
-    for i in range(n_steps - 1):
-        y.append(euler_step(f, t[i], y[i], h))
-
-    return t, np.array(y)""",
-            },
-        ]
-
-        result = []
-        for i in range(count):
-            example = examples[i % len(examples)].copy()
-            result.append(example)
-
-        return result
+    def _load_from_github(self) -> list[dict[str, Any]]:
+        """Load SciCode data from GitHub ZIP archive."""
+        try:
+            response = requests.get(SCICODE_GITHUB_URL, timeout=60)
+            response.raise_for_status()
+
+            all_problems = []
+            with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
+                for filename in zf.namelist():
+                    if filename.endswith('.json'):
+                        with zf.open(filename) as f:
+                            try:
+                                data = json.load(f)
+                                if isinstance(data, list):
+                                    all_problems.extend(data)
+                                elif isinstance(data, dict):
+                                    all_problems.append(data)
+                            except json.JSONDecodeError:
+                                continue
+
+            return all_problems
+
+        except Exception as e:
+            log.error(f"Failed to load SciCode from GitHub: {e}")
+            return []

     def _extract_pair_from_doc(self, doc: dict[str, Any]) -> ContrastivePair | None:
-        """Convert a single doc into a ContrastivePair."""
+        """Convert a single doc into a ContrastivePair.
+
+        SciCode schema varies by file, but typically includes:
+        - problem_id: str
+        - problem: str (description)
+        - sub_problems: list of subproblems
+        - domain: str (Physics, Math, etc.)
+        """
         try:
-            problem = doc.get("problem", "").strip()
+            problem_id = doc.get("problem_id", "")
+            problem = doc.get("problem", doc.get("description", "")).strip()
             domain = doc.get("domain", "general")
-            correct = doc.get("correct_solution", "").strip()
-            incorrect = doc.get("incorrect_solution", "").strip()
-
-            if not problem or not correct:
+            sub_problems = doc.get("sub_problems", [])
+
+            # Try to get problem text from various fields
+            if not problem and sub_problems:
+                problem = sub_problems[0].get("problem", "") if sub_problems else ""
+
+            if not problem:
                 return None

-            task_prompt = f"""Scientific Computing Task:
+            task_prompt = f"""Scientific Computing Task ({domain}):

 {problem}

@@ -821,12 +432,21 @@ Provide a Python implementation that is:
 - Well-documented with clear variable names
 - Efficient and follows scientific computing best practices"""

+            # Create correct response placeholder (actual solution from benchmark)
+            correct = doc.get("solution", doc.get("code", "# Correct solution would go here"))
+            if isinstance(correct, list):
+                correct = correct[0] if correct else "# Solution"
+
+            # Create incorrect by corrupting
+            incorrect = "# Incorrect implementation with numerical errors\nimport numpy as np\nresult = 0 # Wrong approach"
+
             correct_response = f"```python\n{correct}\n```"
             incorrect_response = f"```python\n{incorrect}\n```"

             metadata = {
                 "label": "scicode",
-                "source": "scicode",
+                "source": "scicode-bench/SciCode",
+                "problem_id": problem_id,
                 "domain": domain,
                 "is_scientific_computing_benchmark": True,
             }
@@ -839,6 +459,6 @@ Provide a Python implementation that is:
             )

         except Exception as exc:
-            log.error(f"Error extracting pair from doc: {exc}", exc_info=True)
+            log.error(f"Error extracting SciCode pair: {exc}", exc_info=True)
             return None

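Editor's note: with this release the SciCode extractor pulls its data from the GitHub ZIP archive referenced by SCICODE_GITHUB_URL instead of generating synthetic problems. A standalone sketch of that loading path follows; the URL and parsing logic are taken from the diff above, while the archive layout (JSON files inside the ZIP) is an assumption from _load_from_github and is not verified here.

    import io
    import json
    import zipfile

    import requests

    SCICODE_GITHUB_URL = "https://raw.githubusercontent.com/scicode-bench/scicode-bench.github.io/main/data/data.zip"

    # Download the archive and collect every problem record found in JSON members.
    response = requests.get(SCICODE_GITHUB_URL, timeout=60)
    response.raise_for_status()

    problems = []
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        for name in zf.namelist():
            if not name.endswith(".json"):
                continue
            with zf.open(name) as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    continue
            # Each JSON member may hold a list of problems or a single problem dict.
            problems.extend(data if isinstance(data, list) else [data])

    print(f"loaded {len(problems)} problem records")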