wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1720)
  1. wisent/__init__.py +64 -0
  2. wisent/cli.py +114 -0
  3. wisent/core/__init__.py +40 -0
  4. wisent/core/activations/__init__.py +26 -0
  5. wisent/core/activations/activations.py +97 -0
  6. wisent/core/activations/activations_collector.py +506 -0
  7. wisent/core/activations/core/__init__.py +0 -0
  8. wisent/core/activations/core/atoms.py +219 -0
  9. wisent/core/activations/prompt_construction_strategy.py +47 -0
  10. wisent/core/adapters/__init__.py +22 -0
  11. wisent/core/adapters/audio.py +616 -0
  12. wisent/core/adapters/base.py +420 -0
  13. wisent/core/adapters/multimodal.py +738 -0
  14. wisent/core/adapters/robotics.py +643 -0
  15. wisent/core/adapters/text.py +441 -0
  16. wisent/core/adapters/video.py +555 -0
  17. wisent/core/agent/__init__.py +1 -0
  18. wisent/core/agent/budget.py +644 -0
  19. wisent/core/agent/device_benchmarks.py +691 -0
  20. wisent/core/agent/diagnose/__init__.py +1 -0
  21. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  22. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  23. wisent/core/agent/diagnose/create_classifier.py +1155 -0
  24. wisent/core/agent/diagnose/response_diagnostics.py +273 -0
  25. wisent/core/agent/diagnose/select_classifiers.py +507 -0
  26. wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
  27. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  28. wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
  29. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  30. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  31. wisent/core/agent/diagnose.py +249 -0
  32. wisent/core/agent/steer.py +215 -0
  33. wisent/core/agent/timeout.py +134 -0
  34. wisent/core/autonomous_agent.py +1158 -0
  35. wisent/core/benchmark_extractors.py +372 -0
  36. wisent/core/benchmark_registry.py +151 -0
  37. wisent/core/bigcode_extractors.py +26 -0
  38. wisent/core/bigcode_integration.py +886 -0
  39. wisent/core/branding.py +108 -0
  40. wisent/core/classifier/__init__.py +1 -0
  41. wisent/core/classifier/models/__init__.py +1 -0
  42. wisent/core/classifiers/__init__.py +1 -0
  43. wisent/core/classifiers/classifiers/__init__.py +0 -0
  44. wisent/core/classifiers/classifiers/core/__init__.py +0 -0
  45. wisent/core/classifiers/classifiers/core/atoms.py +748 -0
  46. wisent/core/classifiers/classifiers/models/__init__.py +0 -0
  47. wisent/core/classifiers/classifiers/models/logistic.py +29 -0
  48. wisent/core/classifiers/classifiers/models/mlp.py +47 -0
  49. wisent/core/classifiers/classifiers/rotator.py +137 -0
  50. wisent/core/classifiers/core/__init__.py +1 -0
  51. wisent/core/classifiers/models/__init__.py +1 -0
  52. wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
  53. wisent/core/cli/__init__.py +26 -0
  54. wisent/core/cli/agent/__init__.py +15 -0
  55. wisent/core/cli/agent/apply_steering.py +192 -0
  56. wisent/core/cli/agent/evaluate_response.py +128 -0
  57. wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
  58. wisent/core/cli/agent/main.py +139 -0
  59. wisent/core/cli/agent/train_classifier.py +173 -0
  60. wisent/core/cli/check_linearity.py +126 -0
  61. wisent/core/cli/create_steering_vector.py +304 -0
  62. wisent/core/cli/diagnose_pairs.py +153 -0
  63. wisent/core/cli/diagnose_vectors.py +404 -0
  64. wisent/core/cli/estimate_unified_goodness_time.py +428 -0
  65. wisent/core/cli/evaluate_refusal.py +241 -0
  66. wisent/core/cli/evaluate_responses.py +926 -0
  67. wisent/core/cli/generate_humanization_pairs.py +128 -0
  68. wisent/core/cli/generate_pairs.py +175 -0
  69. wisent/core/cli/generate_pairs_from_task.py +108 -0
  70. wisent/core/cli/generate_responses.py +160 -0
  71. wisent/core/cli/generate_vector_from_synthetic.py +217 -0
  72. wisent/core/cli/generate_vector_from_task.py +248 -0
  73. wisent/core/cli/get_activations.py +192 -0
  74. wisent/core/cli/inference_config.py +84 -0
  75. wisent/core/cli/inference_config_cli.py +54 -0
  76. wisent/core/cli/modify_weights.py +660 -0
  77. wisent/core/cli/multi_steer.py +112 -0
  78. wisent/core/cli/optimization_cache.py +298 -0
  79. wisent/core/cli/optimize.py +621 -0
  80. wisent/core/cli/optimize_classification.py +473 -0
  81. wisent/core/cli/optimize_sample_size.py +390 -0
  82. wisent/core/cli/optimize_steering.py +3421 -0
  83. wisent/core/cli/optimize_weights.py +1287 -0
  84. wisent/core/cli/steering_method_trainer.py +641 -0
  85. wisent/core/cli/steering_search_space.py +508 -0
  86. wisent/core/cli/tasks.py +940 -0
  87. wisent/core/cli/train_unified_goodness.py +681 -0
  88. wisent/core/cli_logger.py +22 -0
  89. wisent/core/config_manager.py +1731 -0
  90. wisent/core/contrastive_pairs/__init__.py +15 -0
  91. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  92. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  93. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  94. wisent/core/contrastive_pairs/core/pair.py +183 -0
  95. wisent/core/contrastive_pairs/core/response.py +153 -0
  96. wisent/core/contrastive_pairs/core/serialization.py +306 -0
  97. wisent/core/contrastive_pairs/core/set.py +192 -0
  98. wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
  99. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  100. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  101. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
  102. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  103. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  104. wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
  105. wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
  106. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
  107. wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
  108. wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
  109. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
  110. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
  111. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
  112. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
  113. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
  114. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
  115. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
  116. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
  117. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
  118. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
  119. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
  120. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
  121. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
  122. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
  123. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
  124. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
  125. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
  126. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
  127. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
  128. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
  129. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
  130. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
  131. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
  132. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
  133. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
  134. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
  135. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
  136. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
  137. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
  138. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
  139. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
  140. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
  141. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
  142. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
  143. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
  144. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
  145. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
  146. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
  147. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
  148. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
  149. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
  150. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
  151. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
  152. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
  153. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
  154. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
  155. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
  156. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
  157. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
  158. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
  159. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
  160. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
  161. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
  162. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
  163. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
  164. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
  165. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
  166. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
  167. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
  168. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
  169. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
  170. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
  171. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
  172. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
  173. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
  174. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
  175. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
  176. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
  177. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
  178. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
  179. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
  180. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
  181. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
  182. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
  183. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
  184. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
  185. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
  186. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
  187. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
  188. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
  189. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
  190. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
  191. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
  192. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
  193. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
  194. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
  195. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
  196. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
  197. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
  198. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
  199. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
  200. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
  201. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
  202. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
  203. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
  204. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
  205. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
  206. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
  207. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
  208. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
  209. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
  210. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
  211. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
  212. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
  213. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
  214. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
  215. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
  216. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
  217. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
  218. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
  219. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
  220. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
  221. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
  222. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
  223. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
  224. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
  225. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
  226. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
  227. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
  228. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
  229. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
  230. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
  231. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
  232. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
  233. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
  234. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
  235. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
  236. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
  237. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
  238. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
  239. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
  240. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
  241. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
  242. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
  243. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
  244. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
  245. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
  246. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
  247. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
  248. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
  249. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
  250. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
  251. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
  252. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
  253. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  254. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
  255. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
  256. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
  257. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
  258. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
  259. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
  260. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
  261. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
  262. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
  263. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
  264. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
  265. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
  266. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
  267. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
  268. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
  269. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
  270. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
  271. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
  272. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
  273. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
  274. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
  275. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
  276. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
  277. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
  278. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
  279. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
  280. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
  281. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
  282. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
  283. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
  284. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
  285. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
  286. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
  287. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
  288. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
  289. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
  290. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
  291. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
  292. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
  293. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
  294. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
  295. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
  296. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
  297. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
  298. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
  299. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
  300. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
  301. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
  302. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
  303. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
  304. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
  305. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
  306. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
  307. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
  308. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
  309. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
  310. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
  311. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
  312. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
  313. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
  314. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
  315. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
  316. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
  317. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
  318. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
  319. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
  320. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
  321. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
  322. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
  323. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
  324. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
  325. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
  326. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
  327. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
  328. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
  329. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
  330. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
  331. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
  332. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
  333. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
  334. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
  335. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
  336. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
  337. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
  338. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
  339. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
  340. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
  341. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
  342. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
  343. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
  344. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
  345. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
  346. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
  347. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
  348. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
  349. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
  350. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
  351. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
  352. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
  353. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
  354. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
  355. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
  356. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
  357. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
  358. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
  359. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
  360. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
  361. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
  362. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
  363. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
  364. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
  365. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
  366. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
  367. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
  368. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
  369. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
  370. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
  371. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
  372. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
  373. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
  374. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
  375. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
  376. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
  377. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
  378. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
  379. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
  380. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
  381. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
  382. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
  383. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
  384. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
  385. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
  386. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
  387. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
  388. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
  389. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
  390. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
  391. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
  392. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
  393. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
  394. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
  395. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
  396. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
  397. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
  398. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
  399. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
  400. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
  401. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
  402. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
  403. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
  404. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
  405. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
  406. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
  407. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
  408. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
  409. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
  410. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
  411. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
  412. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
  413. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
  414. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
  415. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
  416. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
  417. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
  418. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
  419. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
  420. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
  421. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
  422. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
  423. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
  424. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
  425. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
  426. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
  427. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
  428. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
  429. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
  430. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
  431. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
  432. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
  433. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
  434. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
  435. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
  436. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
  437. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
  438. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
  439. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
  440. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
  441. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
  442. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
  443. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
  444. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
  445. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
  446. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
  447. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
  448. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
  449. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
  450. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
  451. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
  452. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
  453. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
  454. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
  455. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
  456. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
  457. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
  458. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
  459. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
  460. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
  461. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
  462. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
  463. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
  464. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
  465. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
  466. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
  467. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
  468. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
  469. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
  470. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
  471. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
  472. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
  473. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
  474. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
  475. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
  476. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
  477. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
  478. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
  479. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
  480. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
  481. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
  482. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
  483. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
  484. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
  485. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
  486. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
  487. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
  488. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
  489. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
  490. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
  491. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
  492. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
  493. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
  494. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
  495. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
  496. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
  497. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
  498. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
  499. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  500. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
  501. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
  502. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
  503. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
  504. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
  505. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
  506. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
  507. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
  508. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
  509. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
  510. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
  511. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
  512. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
  513. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
  514. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
  515. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
  516. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
  517. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
  518. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
  519. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
  520. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
  521. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
  522. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
  523. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
  524. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
  525. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
  526. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
  527. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
  528. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
  529. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
  530. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
  531. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
  532. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
  533. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
  534. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
  535. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
  536. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
  537. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
  538. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
  539. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
  540. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  541. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
  542. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
  543. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
  544. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
  545. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
  546. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
  547. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
  548. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
  549. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
  550. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
  551. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
  552. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
  553. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
  554. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
  555. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
  556. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
  557. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
  558. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
  559. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
  560. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
  561. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
  562. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
  563. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
  564. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
  565. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
  566. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
  567. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
  568. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
  569. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
  570. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
  571. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
  572. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
  573. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
  574. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
  575. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
  576. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
  577. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  578. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  579. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
  580. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
  581. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
  582. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
  583. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
  584. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
  585. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
  586. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
  587. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
  588. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
  589. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
  590. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
  591. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
  592. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
  593. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
  594. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
  595. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
  596. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
  597. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
  598. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
  599. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
  600. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
  601. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
  602. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
  603. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
  604. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
  605. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
  606. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
  607. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
  608. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  609. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
  610. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
  611. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
  612. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
  613. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  614. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  615. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
  616. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
  617. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
  618. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
  619. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
  620. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
  621. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
  622. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
  623. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
  624. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
  625. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
  626. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
  627. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
  628. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
  629. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
  630. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
  631. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
  632. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
  633. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
  634. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
  635. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
  636. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
  637. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
  638. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
  639. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
  640. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
  641. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
  642. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
  643. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
  644. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
  645. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
  646. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
  647. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
  648. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
  649. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
  650. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
  651. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
  652. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  653. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
  654. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
  655. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
  656. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  657. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
  658. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  659. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
  660. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
  661. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
  662. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
  663. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
  664. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
  665. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  666. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
  667. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
  668. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
  669. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
  670. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
  671. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
  672. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
  673. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
  674. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
  675. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
  676. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  677. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
  678. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
  679. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
  680. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
  681. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
  682. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  683. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
  684. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
  685. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
  686. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
  687. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
  688. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
  689. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
  690. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
  691. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
  692. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
  693. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
  694. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
  695. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
  696. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
  697. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
  698. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
  699. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
  700. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
  701. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
  702. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
  703. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
  704. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
  705. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
  706. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
  707. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
  708. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
  709. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
  710. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
  711. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
  712. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
  713. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
  714. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
  715. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
  716. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
  717. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
  718. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
  719. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
  720. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
  721. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
  722. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
  723. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
  724. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
  725. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
  726. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
  727. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
  728. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
  729. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
  730. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
  731. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
  732. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
  733. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
  734. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
  735. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
  736. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
  737. wisent/core/data_loaders/__init__.py +235 -0
  738. wisent/core/data_loaders/core/__init__.py +0 -0
  739. wisent/core/data_loaders/core/atoms.py +99 -0
  740. wisent/core/data_loaders/loaders/__init__.py +0 -0
  741. wisent/core/data_loaders/loaders/custom.py +120 -0
  742. wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
  743. wisent/core/data_loaders/loaders/lm_loader.py +494 -0
  744. wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
  745. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  746. wisent/core/data_loaders/rotator.py +118 -0
  747. wisent/core/detection_handling.py +259 -0
  748. wisent/core/diversity_processors.py +193 -0
  749. wisent/core/download_full_benchmarks.py +1512 -0
  750. wisent/core/errors/__init__.py +203 -0
  751. wisent/core/errors/error_codes.py +763 -0
  752. wisent/core/errors/error_handler.py +134 -0
  753. wisent/core/evaluators/__init__.py +0 -0
  754. wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
  755. wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
  756. wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
  757. wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
  758. wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
  759. wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
  760. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
  761. wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
  762. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
  763. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
  764. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
  765. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  766. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
  767. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
  768. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
  769. wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
  770. wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
  771. wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
  772. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  773. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  774. wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
  775. wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
  776. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
  777. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
  778. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
  779. wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
  780. wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
  781. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  782. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
  783. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
  784. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
  785. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
  786. wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
  787. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
  788. wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
  789. wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
  790. wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
  791. wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
  792. wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
  793. wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
  794. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
  795. wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
  796. wisent/core/evaluators/core/__init__.py +5 -0
  797. wisent/core/evaluators/core/atoms.py +166 -0
  798. wisent/core/evaluators/custom/__init__.py +20 -0
  799. wisent/core/evaluators/custom/custom_evaluator.py +382 -0
  800. wisent/core/evaluators/custom/examples/__init__.py +37 -0
  801. wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
  802. wisent/core/evaluators/custom/examples/gptzero.py +185 -0
  803. wisent/core/evaluators/custom/examples/humanization.py +79 -0
  804. wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
  805. wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
  806. wisent/core/evaluators/oracles/__init__.py +0 -0
  807. wisent/core/evaluators/oracles/interactive.py +73 -0
  808. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  809. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
  810. wisent/core/evaluators/oracles/user_specified.py +67 -0
  811. wisent/core/evaluators/personalization/__init__.py +12 -0
  812. wisent/core/evaluators/personalization/alignment.py +166 -0
  813. wisent/core/evaluators/personalization/coherence.py +325 -0
  814. wisent/core/evaluators/personalization/difference.py +73 -0
  815. wisent/core/evaluators/rotator.py +217 -0
  816. wisent/core/evaluators/steering_evaluators.py +386 -0
  817. wisent/core/evaluators/synthetic_evaluator.py +377 -0
  818. wisent/core/hyperparameter_optimizer.py +547 -0
  819. wisent/core/layer.py +17 -0
  820. wisent/core/lm_eval_harness_ground_truth.py +1431 -0
  821. wisent/core/main.py +101 -0
  822. wisent/core/managed_cached_benchmarks.py +609 -0
  823. wisent/core/mixed_benchmark_sampler.py +366 -0
  824. wisent/core/modalities/__init__.py +545 -0
  825. wisent/core/model_persistence.py +302 -0
  826. wisent/core/models/__init__.py +23 -0
  827. wisent/core/models/core/__init__.py +0 -0
  828. wisent/core/models/core/atoms.py +465 -0
  829. wisent/core/models/inference_config.py +127 -0
  830. wisent/core/models/wisent_model.py +893 -0
  831. wisent/core/multi_steering.py +397 -0
  832. wisent/core/opti/__init__.py +0 -0
  833. wisent/core/opti/core/__init__.py +0 -0
  834. wisent/core/opti/core/atoms.py +177 -0
  835. wisent/core/opti/methods/__init__.py +10 -0
  836. wisent/core/opti/methods/opti_classificator.py +172 -0
  837. wisent/core/opti/methods/opti_steering.py +139 -0
  838. wisent/core/opti/methods/opti_weights.py +523 -0
  839. wisent/core/optuna/__init__.py +54 -0
  840. wisent/core/optuna/classifier/__init__.py +25 -0
  841. wisent/core/optuna/classifier/activation_generator.py +351 -0
  842. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  843. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
  844. wisent/core/optuna/steering/__init__.py +20 -0
  845. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
  846. wisent/core/optuna/steering/data_utils.py +342 -0
  847. wisent/core/optuna/steering/metrics.py +412 -0
  848. wisent/core/optuna/steering/steering_optimization.py +1096 -0
  849. wisent/core/parser.py +1662 -0
  850. wisent/core/parser_arguments/__init__.py +10 -0
  851. wisent/core/parser_arguments/agent_parser.py +122 -0
  852. wisent/core/parser_arguments/check_linearity_parser.py +82 -0
  853. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  854. wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
  855. wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
  856. wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
  857. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  858. wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
  859. wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
  860. wisent/core/parser_arguments/full_optimize_parser.py +194 -0
  861. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  862. wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
  863. wisent/core/parser_arguments/generate_responses_parser.py +16 -0
  864. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
  865. wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
  866. wisent/core/parser_arguments/generate_vector_parser.py +89 -0
  867. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  868. wisent/core/parser_arguments/inference_config_parser.py +65 -0
  869. wisent/core/parser_arguments/main_parser.py +220 -0
  870. wisent/core/parser_arguments/model_config_parser.py +59 -0
  871. wisent/core/parser_arguments/modify_weights_parser.py +309 -0
  872. wisent/core/parser_arguments/monitor_parser.py +17 -0
  873. wisent/core/parser_arguments/multi_steer_parser.py +48 -0
  874. wisent/core/parser_arguments/nonsense_parser.py +26 -0
  875. wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
  876. wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
  877. wisent/core/parser_arguments/optimize_parser.py +142 -0
  878. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  879. wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
  880. wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
  881. wisent/core/parser_arguments/synthetic_parser.py +117 -0
  882. wisent/core/parser_arguments/tasks_parser.py +591 -0
  883. wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
  884. wisent/core/parser_arguments/utils.py +107 -0
  885. wisent/core/prompts/__init__.py +0 -0
  886. wisent/core/prompts/core/__init__.py +0 -0
  887. wisent/core/prompts/core/atom.py +57 -0
  888. wisent/core/prompts/core/prompt_formater.py +148 -0
  889. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  890. wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
  891. wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
  892. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
  893. wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
  894. wisent/core/representation.py +5 -0
  895. wisent/core/save_results.py +277 -0
  896. wisent/core/steering.py +660 -0
  897. wisent/core/steering_method.py +20 -0
  898. wisent/core/steering_methods/__init__.py +54 -0
  899. wisent/core/steering_methods/core/__init__.py +0 -0
  900. wisent/core/steering_methods/core/atoms.py +154 -0
  901. wisent/core/steering_methods/methods/__init__.py +0 -0
  902. wisent/core/steering_methods/methods/caa.py +45 -0
  903. wisent/core/steering_methods/methods/prism.py +588 -0
  904. wisent/core/steering_methods/methods/pulse.py +641 -0
  905. wisent/core/steering_methods/methods/titan.py +1005 -0
  906. wisent/core/steering_methods/preflight.py +322 -0
  907. wisent/core/steering_methods/registry.py +649 -0
  908. wisent/core/steering_methods/rotator.py +121 -0
  909. wisent/core/steering_optimizer.py +1503 -0
  910. wisent/core/synthetic/__init__.py +0 -0
  911. wisent/core/synthetic/cleaners/__init__.py +0 -0
  912. wisent/core/synthetic/cleaners/core/__init__.py +0 -0
  913. wisent/core/synthetic/cleaners/core/atoms.py +58 -0
  914. wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
  915. wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
  916. wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
  917. wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
  918. wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
  919. wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
  920. wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
  921. wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
  922. wisent/core/synthetic/db_instructions/__init__.py +0 -0
  923. wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
  924. wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
  925. wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
  926. wisent/core/synthetic/generators/__init__.py +0 -0
  927. wisent/core/synthetic/generators/core/__init__.py +0 -0
  928. wisent/core/synthetic/generators/core/atoms.py +73 -0
  929. wisent/core/synthetic/generators/diversities/__init__.py +0 -0
  930. wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
  931. wisent/core/synthetic/generators/diversities/core/core.py +68 -0
  932. wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
  933. wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  934. wisent/core/synthetic/generators/nonsense_generator.py +150 -0
  935. wisent/core/synthetic/generators/pairs_generator.py +313 -0
  936. wisent/core/task_interface.py +143 -0
  937. wisent/core/task_selector.py +232 -0
  938. wisent/core/tasks/__init__.py +218 -0
  939. wisent/core/tasks/aime_task.py +142 -0
  940. wisent/core/tasks/file_task.py +212 -0
  941. wisent/core/tasks/hle_task.py +180 -0
  942. wisent/core/tasks/hmmt_task.py +120 -0
  943. wisent/core/tasks/livecodebench_task.py +94 -0
  944. wisent/core/tasks/livemathbench_task.py +159 -0
  945. wisent/core/tasks/lm_eval_task.py +611 -0
  946. wisent/core/tasks/math500_task.py +84 -0
  947. wisent/core/tasks/polymath_task.py +147 -0
  948. wisent/core/tasks/supergpqa_task.py +220 -0
  949. wisent/core/time_estimator.py +155 -0
  950. wisent/core/timing_calibration.py +176 -0
  951. wisent/core/tracking/__init__.py +54 -0
  952. wisent/core/tracking/latency.py +620 -0
  953. wisent/core/tracking/memory.py +360 -0
  954. wisent/core/trainers/__init__.py +0 -0
  955. wisent/core/trainers/core/__init__.py +11 -0
  956. wisent/core/trainers/core/atoms.py +45 -0
  957. wisent/core/trainers/steering_trainer.py +365 -0
  958. wisent/core/universal_subspace.py +918 -0
  959. wisent/core/user_model_config.py +158 -0
  960. wisent/core/utils/__init__.py +64 -0
  961. wisent/core/utils/base_rotator.py +292 -0
  962. wisent/core/utils/dataset_splits.py +197 -0
  963. wisent/core/utils/device.py +279 -0
  964. wisent/core/weight_modification/__init__.py +134 -0
  965. wisent/core/weight_modification/additive.py +340 -0
  966. wisent/core/weight_modification/directional.py +1357 -0
  967. wisent/core/weight_modification/export.py +359 -0
  968. wisent/core/weight_modification/multi_direction.py +410 -0
  969. wisent/core/weight_modification/utils.py +236 -0
  970. wisent/core/wisent.py +660 -0
  971. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
  972. wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
  973. wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
  974. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
  975. wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
  976. wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
  977. wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
  978. wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
  979. wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
  980. wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
  981. wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
  982. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
  983. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
  984. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
  985. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
  986. wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
  987. wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
  988. wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
  989. wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
  990. wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
  991. wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
  992. wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
  993. wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
  994. wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
  995. wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
  996. wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
  997. wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
  998. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
  999. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
  1000. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
  1001. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
  1002. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
  1003. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
  1004. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
  1005. wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
  1006. wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
  1007. wisent/examples/scripts/1/test_cola_pairs.json +8 -0
  1008. wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
  1009. wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
  1010. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
  1011. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
  1012. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
  1013. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
  1014. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
  1015. wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
  1016. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1017. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
  1018. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1019. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
  1020. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1021. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1022. wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
  1023. wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
  1024. wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
  1025. wisent/examples/scripts/2/test_atis_pairs.json +8 -0
  1026. wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
  1027. wisent/examples/scripts/2/test_babi_pairs.json +8 -0
  1028. wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
  1029. wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
  1030. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
  1031. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
  1032. wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
  1033. wisent/examples/scripts/benchmark_tags.json +2140 -0
  1034. wisent/examples/scripts/lm_eval_readme.json +4 -0
  1035. wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
  1036. wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
  1037. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
  1038. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
  1039. wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
  1040. wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
  1041. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
  1042. wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
  1043. wisent/examples/scripts/results/benchmark_tags.json +917 -0
  1044. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
  1045. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
  1046. wisent/examples/scripts/results/failing_benchmarks.json +946 -0
  1047. wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
  1048. wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
  1049. wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
  1050. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
  1051. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
  1052. wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
  1053. wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
  1054. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
  1055. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
  1056. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
  1057. wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
  1058. wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
  1059. wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
  1060. wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
  1061. wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
  1062. wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
  1063. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
  1064. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
  1065. wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
  1066. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
  1067. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
  1068. wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
  1069. wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
  1070. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1071. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
  1072. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1073. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
  1074. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1075. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1076. wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
  1077. wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
  1078. wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
  1079. wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
  1080. wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
  1081. wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
  1082. wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
  1083. wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
  1084. wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
  1085. wisent/examples/scripts/results/test_aime_pairs.json +8 -0
  1086. wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
  1087. wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
  1088. wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
  1089. wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
  1090. wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
  1091. wisent/examples/scripts/results/test_anli_pairs.json +8 -0
  1092. wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
  1093. wisent/examples/scripts/results/test_apps_pairs.json +8 -0
  1094. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
  1095. wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
  1096. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
  1097. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
  1098. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
  1099. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
  1100. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
  1101. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
  1102. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
  1103. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
  1104. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
  1105. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
  1106. wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
  1107. wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
  1108. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
  1109. wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
  1110. wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
  1111. wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
  1112. wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
  1113. wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
  1114. wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
  1115. wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
  1116. wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
  1117. wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
  1118. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
  1119. wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
  1120. wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
  1121. wisent/examples/scripts/results/test_atis_pairs.json +8 -0
  1122. wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
  1123. wisent/examples/scripts/results/test_babi_pairs.json +8 -0
  1124. wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
  1125. wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
  1126. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
  1127. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
  1128. wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
  1129. wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
  1130. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
  1131. wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
  1132. wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
  1133. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
  1134. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
  1135. wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
  1136. wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
  1137. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
  1138. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
  1139. wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
  1140. wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
  1141. wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
  1142. wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
  1143. wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
  1144. wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
  1145. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
  1146. wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
  1147. wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
  1148. wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
  1149. wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
  1150. wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
  1151. wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
  1152. wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
  1153. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
  1154. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
  1155. wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
  1156. wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
  1157. wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
  1158. wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
  1159. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
  1160. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
  1161. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
  1162. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
  1163. wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
  1164. wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
  1165. wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
  1166. wisent/examples/scripts/results/test_c4_pairs.json +8 -0
  1167. wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
  1168. wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
  1169. wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
  1170. wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
  1171. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
  1172. wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
  1173. wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
  1174. wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
  1175. wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
  1176. wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
  1177. wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
  1178. wisent/examples/scripts/results/test_cb_pairs.json +8 -0
  1179. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
  1180. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
  1181. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
  1182. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
  1183. wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
  1184. wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
  1185. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
  1186. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
  1187. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
  1188. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
  1189. wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
  1190. wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
  1191. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
  1192. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
  1193. wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
  1194. wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
  1195. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
  1196. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
  1197. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
  1198. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
  1199. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
  1200. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
  1201. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
  1202. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
  1203. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
  1204. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
  1205. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
  1206. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
  1207. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
  1208. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
  1209. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
  1210. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
  1211. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
  1212. wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
  1213. wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
  1214. wisent/examples/scripts/results/test_cola_pairs.json +8 -0
  1215. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
  1216. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
  1217. wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
  1218. wisent/examples/scripts/results/test_conala_pairs.json +8 -0
  1219. wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
  1220. wisent/examples/scripts/results/test_concode_pairs.json +8 -0
  1221. wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
  1222. wisent/examples/scripts/results/test_copa_pairs.json +8 -0
  1223. wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
  1224. wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
  1225. wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
  1226. wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
  1227. wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
  1228. wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
  1229. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
  1230. wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
  1231. wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
  1232. wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
  1233. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
  1234. wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
  1235. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
  1236. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
  1237. wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
  1238. wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
  1239. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
  1240. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
  1241. wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
  1242. wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
  1243. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
  1244. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
  1245. wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
  1246. wisent/examples/scripts/results/test_drop_pairs.json +8 -0
  1247. wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
  1248. wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
  1249. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
  1250. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
  1251. wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
  1252. wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
  1253. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
  1254. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
  1255. wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
  1256. wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
  1257. wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
  1258. wisent/examples/scripts/results/test_escola_pairs.json +8 -0
  1259. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
  1260. wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
  1261. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
  1262. wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
  1263. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
  1264. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
  1265. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
  1266. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
  1267. wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
  1268. wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
  1269. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
  1270. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
  1271. wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
  1272. wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
  1273. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
  1274. wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
  1275. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
  1276. wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
  1277. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1278. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1279. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
  1280. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
  1281. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
  1282. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
  1283. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
  1284. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
  1285. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
  1286. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
  1287. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1288. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1289. wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
  1290. wisent/examples/scripts/results/test_fda_pairs.json +8 -0
  1291. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
  1292. wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
  1293. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
  1294. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
  1295. wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
  1296. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
  1297. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
  1298. wisent/examples/scripts/results/test_fld_pairs.json +8 -0
  1299. wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
  1300. wisent/examples/scripts/results/test_flores_pairs.json +14 -0
  1301. wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
  1302. wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
  1303. wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
  1304. wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
  1305. wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
  1306. wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
  1307. wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
  1308. wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
  1309. wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
  1310. wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
  1311. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
  1312. wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
  1313. wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
  1314. wisent/examples/scripts/results/test_glue_pairs.json +14 -0
  1315. wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
  1316. wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
  1317. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
  1318. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
  1319. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
  1320. wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
  1321. wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
  1322. wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
  1323. wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
  1324. wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
  1325. wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
  1326. wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
  1327. wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
  1328. wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
  1329. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
  1330. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
  1331. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
  1332. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
  1333. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
  1334. wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
  1335. wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
  1336. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
  1337. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
  1338. wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
  1339. wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
  1340. wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
  1341. wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
  1342. wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
  1343. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
  1344. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
  1345. wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
  1346. wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
  1347. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
  1348. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
  1349. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
  1350. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
  1351. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
  1352. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
  1353. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
  1354. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
  1355. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
  1356. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
  1357. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
  1358. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
  1359. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
  1360. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
  1361. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
  1362. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
  1363. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
  1364. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
  1365. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
  1366. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
  1367. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
  1368. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
  1369. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
  1370. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
  1371. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
  1372. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
  1373. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
  1374. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
  1375. wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
  1376. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
  1377. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
  1378. wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
  1379. wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
  1380. wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
  1381. wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
  1382. wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
  1383. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
  1384. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
  1385. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
  1386. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
  1387. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
  1388. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
  1389. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
  1390. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
  1391. wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
  1392. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1393. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1394. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
  1395. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
  1396. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
  1397. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
  1398. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
  1399. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
  1400. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
  1401. wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
  1402. wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
  1403. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1404. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1405. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1406. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1407. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
  1408. wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
  1409. wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
  1410. wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
  1411. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
  1412. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
  1413. wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
  1414. wisent/examples/scripts/results/test_libra_pairs.json +14 -0
  1415. wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
  1416. wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
  1417. wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
  1418. wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
  1419. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
  1420. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
  1421. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
  1422. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
  1423. wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
  1424. wisent/examples/scripts/results/test_llama_pairs.json +8 -0
  1425. wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
  1426. wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
  1427. wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
  1428. wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
  1429. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
  1430. wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
  1431. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
  1432. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
  1433. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
  1434. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
  1435. wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
  1436. wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
  1437. wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
  1438. wisent/examples/scripts/results/test_math500_pairs.json +8 -0
  1439. wisent/examples/scripts/results/test_math_evaluation.json +30 -0
  1440. wisent/examples/scripts/results/test_math_pairs.json +8 -0
  1441. wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
  1442. wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
  1443. wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
  1444. wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
  1445. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
  1446. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
  1447. wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
  1448. wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
  1449. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
  1450. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
  1451. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
  1452. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
  1453. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
  1454. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
  1455. wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
  1456. wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
  1457. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
  1458. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
  1459. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
  1460. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
  1461. wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
  1462. wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
  1463. wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
  1464. wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
  1465. wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
  1466. wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
  1467. wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
  1468. wisent/examples/scripts/results/test_mela_pairs.json +14 -0
  1469. wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
  1470. wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
  1471. wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
  1472. wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
  1473. wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
  1474. wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
  1475. wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
  1476. wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
  1477. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
  1478. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
  1479. wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
  1480. wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
  1481. wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
  1482. wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
  1483. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
  1484. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
  1485. wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
  1486. wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
  1487. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
  1488. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
  1489. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
  1490. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
  1491. wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
  1492. wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
  1493. wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
  1494. wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
  1495. wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
  1496. wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
  1497. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
  1498. wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
  1499. wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
  1500. wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
  1501. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
  1502. wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
  1503. wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
  1504. wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
  1505. wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
  1506. wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
  1507. wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
  1508. wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
  1509. wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
  1510. wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
  1511. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1512. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
  1513. wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
  1514. wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
  1515. wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
  1516. wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
  1517. wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
  1518. wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
  1519. wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
  1520. wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
  1521. wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
  1522. wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
  1523. wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
  1524. wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
  1525. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1526. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
  1527. wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
  1528. wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
  1529. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
  1530. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
  1531. wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
  1532. wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
  1533. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
  1534. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
  1535. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
  1536. wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
  1537. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
  1538. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
  1539. wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
  1540. wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
  1541. wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
  1542. wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
  1543. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
  1544. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
  1545. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
  1546. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
  1547. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
  1548. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
  1549. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
  1550. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
  1551. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
  1552. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
  1553. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1554. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1555. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1556. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1557. wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
  1558. wisent/examples/scripts/results/test_prost_pairs.json +8 -0
  1559. wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
  1560. wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
  1561. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
  1562. wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
  1563. wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
  1564. wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
  1565. wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
  1566. wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
  1567. wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
  1568. wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
  1569. wisent/examples/scripts/results/test_race_evaluation.json +30 -0
  1570. wisent/examples/scripts/results/test_race_pairs.json +8 -0
  1571. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
  1572. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
  1573. wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
  1574. wisent/examples/scripts/results/test_recode_pairs.json +8 -0
  1575. wisent/examples/scripts/results/test_record_evaluation.json +30 -0
  1576. wisent/examples/scripts/results/test_record_pairs.json +8 -0
  1577. wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
  1578. wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
  1579. wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
  1580. wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
  1581. wisent/examples/scripts/results/test_score_evaluation.json +51 -0
  1582. wisent/examples/scripts/results/test_score_pairs.json +14 -0
  1583. wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
  1584. wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
  1585. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
  1586. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
  1587. wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
  1588. wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
  1589. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
  1590. wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
  1591. wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
  1592. wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
  1593. wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
  1594. wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
  1595. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
  1596. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
  1597. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
  1598. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
  1599. wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
  1600. wisent/examples/scripts/results/test_swag_pairs.json +8 -0
  1601. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
  1602. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
  1603. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
  1604. wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
  1605. wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
  1606. wisent/examples/scripts/results/test_translation_pairs.json +14 -0
  1607. wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
  1608. wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
  1609. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
  1610. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
  1611. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
  1612. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
  1613. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
  1614. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
  1615. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
  1616. wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
  1617. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
  1618. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
  1619. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
  1620. wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
  1621. wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
  1622. wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
  1623. wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
  1624. wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
  1625. wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
  1626. wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
  1627. wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
  1628. wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
  1629. wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
  1630. wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
  1631. wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
  1632. wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
  1633. wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
  1634. wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
  1635. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
  1636. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
  1637. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
  1638. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
  1639. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
  1640. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
  1641. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
  1642. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
  1643. wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
  1644. wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
  1645. wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
  1646. wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
  1647. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
  1648. wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
  1649. wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
  1650. wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
  1651. wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
  1652. wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
  1653. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
  1654. wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
  1655. wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
  1656. wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
  1657. wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
  1658. wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
  1659. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
  1660. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
  1661. wisent/parameters/__init__.py +1 -0
  1662. wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
  1663. wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
  1664. wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
  1665. wisent/parameters/lm_eval/evaluator_check.json +3476 -0
  1666. wisent/parameters/lm_eval/final_verification.json +24782 -0
  1667. wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
  1668. wisent/parameters/lm_eval/group_tasks.json +150 -0
  1669. wisent/parameters/lm_eval/individual_tasks.json +402 -0
  1670. wisent/parameters/lm_eval/no_readmes.json +1 -0
  1671. wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
  1672. wisent/parameters/lm_eval/read_tasks.json +208 -0
  1673. wisent/parameters/lm_eval/readme_files.json +208 -0
  1674. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
  1675. wisent/parameters/tasks/missing_task_families.json +2963 -0
  1676. wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
  1677. wisent/parameters/tasks/risks.json +10 -0
  1678. wisent/parameters/tasks/skills.json +14 -0
  1679. wisent/parameters/tasks/tasks.json +56031 -0
  1680. wisent/scripts/run_quality_metrics_sweep.sh +315 -0
  1681. wisent/tests/__init__.py +0 -0
  1682. wisent/tests/examples/__init__.py +0 -0
  1683. wisent/tests/examples/cli/__init__.py +0 -0
  1684. wisent/tests/examples/cli/activations/__init__.py +0 -0
  1685. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  1686. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  1687. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  1688. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  1689. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  1690. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  1691. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  1692. wisent/tests/examples/cli/generate/__init__.py +0 -0
  1693. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  1694. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  1695. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  1696. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  1697. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  1698. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  1699. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  1700. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  1701. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  1702. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  1703. wisent/tests/examples/cli/steering/__init__.py +0 -0
  1704. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  1705. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  1706. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  1707. wisent/tests/nosense/__init__.py +6 -0
  1708. wisent/tests/nosense/base_nosense.py +81 -0
  1709. wisent/tests/nosense/math500_nosense.py +72 -0
  1710. wisent/tests/nosense/test_robustness.py +336 -0
  1711. wisent/tests/test_all_cli_commands.py +674 -0
  1712. wisent/tests/test_geometry_comprehensive.py +327 -0
  1713. wisent/tests/test_titan_geometry.py +257 -0
  1714. wisent/tests/visualize_geometry.py +148 -0
  1715. wisent-0.7.379.dist-info/METADATA +64 -0
  1716. wisent-0.7.379.dist-info/RECORD +1720 -0
  1717. wisent-0.7.379.dist-info/WHEEL +5 -0
  1718. wisent-0.7.379.dist-info/entry_points.txt +2 -0
  1719. wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
  1720. wisent-0.7.379.dist-info/top_level.txt +1 -0
wisent/parameters/lm_eval/evaluator_check.json
@@ -0,0 +1,3476 @@
+ {
+ "acp_bench": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/acpbench/boolq_cot_2shot/_boolq_cot_2shot_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 4,
+ "metric": 26
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 32
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses generate_until with exact_match. Wisent uses log_likelihoods. WRONG - generation task being evaluated with probability scoring."
+ },
+ "arithmetic": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml",
+ "output_type": "loglikelihood",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses loglikelihood with acc metric. Wisent uses exact_match. WRONG - loglikelihood task being evaluated with text matching."
+ },
+ "arabculture": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 26
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "aradice": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 15
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 141
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc and f1 metrics. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "ai2_arc": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_easy.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "bbh": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 2,
+ "metric": 6
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching."
+ },
+ "belebele": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/belebele/_default_template_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "commonsense_qa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/commonsense_qa/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 10
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "gsm8k": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/gsm8k/gsm8k.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 6,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 32
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching."
+ },
+ "hellaswag": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hellaswag/hellaswag.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 15
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "humaneval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/humaneval/humaneval.yaml",
+ "output_type": "generate_until",
+ "metric": "pass_at_k",
+ "line_references": {
+ "output_type": 4,
+ "metric": 9
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 23
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses generate_until with pass_at_k metric (code execution). Wisent uses exact_match. WRONG - code execution task being evaluated with text matching instead of execution."
+ },
+ "ifeval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/ifeval/ifeval.yaml",
+ "output_type": "generate_until",
+ "metric": "prompt_level_strict_acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with prompt_level_strict_acc metric. Wisent uses exact_match. CORRECT - both use text generation with matching evaluation."
+ },
+ "lambada": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lambada/lambada_openai.yaml",
+ "output_type": "loglikelihood",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 24
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses loglikelihood with acc metric. Wisent uses exact_match. WRONG - loglikelihood task being evaluated with text matching."
+ },
+ "mmlu": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu/generative/_default_template_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 6,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 18
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task being evaluated with probability scoring."
+ },
+ "piqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/piqa/piqa.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "siqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/siqa/siqa.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "truthfulqa_mc1": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 32
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "winogrande": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/winogrande/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "openbookqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/openbookqa/openbookqa.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "sciq": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/sciq/sciq.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "anli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/anli/anli_r1.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 22
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "wikitext": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/wikitext/wikitext.yaml",
+ "output_type": "loglikelihood_rolling",
+ "metric": "word_perplexity",
+ "line_references": {
+ "output_type": 4,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses loglikelihood_rolling with perplexity metrics. Wisent uses generation. WRONG - perplexity task being evaluated with text generation."
+ },
+ "triviaqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/triviaqa/default.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 4,
+ "metric": 25
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "race": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/race/race.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 10
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 21
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "agieval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/agieval/aqua-rat.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses exact_match. WRONG - multiple choice task being evaluated with text matching instead of loglikelihoods."
+ },
+ "mbpp": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mbpp/mbpp.yaml",
+ "output_type": "generate_until",
+ "metric": "pass_at_1",
+ "line_references": {
+ "output_type": 5,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 22
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with pass_at_1 metric (code execution). Wisent uses exact_match evaluator. MATCH - generation tasks can use exact_match for text comparison."
+ },
+ "cola": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/cola/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "mcc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with mcc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "mnli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/mnli/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 22
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "gpqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/gpqa/n_shot/_gpqa_n_shot_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 3,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 37
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "ceval": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/ceval/_default_ceval_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 74
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "mgsm": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mgsm/direct/direct_yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 7,
+ "metric": 29
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 24
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "nq_open": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/nq_open/nq_open.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 3,
+ "metric": 24
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "webqs": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/webqs/webqs.yaml",
+ "output_type": "multiple_choice",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 6,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 20
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with exact_match metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "xcopa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xcopa/default_et.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 31
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "xnli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xnli/xnli_common_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 7,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 35
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "xstorycloze": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xstorycloze/default_ar.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 4,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 32
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for multiple choice."
+ },
+ "babi": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/babi/babi.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 4,
+ "metric": 16
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation with matching evaluation."
+ },
+ "bigbench": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bigbench/multiple_choice_template_a_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 6,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses exact_match. WRONG - multiple choice tasks should use log_likelihoods for option selection, not exact_match text comparison."
+ },
+ "blimp": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/blimp/_template_yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 2,
+ "metric": 11
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for grammatical acceptability judgments."
+ },
+ "chartqa": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/chartqa/chartqa.yaml",
+ "output_type": "generate_until",
+ "metric": "exact_match",
+ "line_references": {
+ "output_type": 3,
+ "metric": 28
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py",
+ "evaluator": "generation",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for chart question answering."
+ },
+ "crows_pairs": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml",
+ "output_type": "multiple_choice",
+ "metric": "likelihood_diff",
+ "line_references": {
+ "output_type": 7,
+ "metric": 14
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py",
+ "evaluator": "log_likelihoods",
+ "line_references": {
+ "evaluator": 42
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses multiple_choice with likelihood_diff metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood comparison for bias measurement."
+ },
+ "eq_bench": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eq_bench/default.yaml",
+ "output_type": "generate_until",
+ "metric": "eqbench",
+ "line_references": {
+ "output_type": 3,
+ "metric": 13
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py",
+ "evaluator": "exact_match",
+ "line_references": {
+ "evaluator": 19
+ }
+ },
+ "match": true,
+ "notes": "lm-eval uses generate_until with eqbench metric. Wisent uses exact_match. CORRECT - both use generation for emotional intelligence question answering."
+ },
+ "mrpc": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/mrpc/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py",
+ "evaluator": null,
+ "line_references": {
+ "evaluator": null
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
+ },
+ "qnli": {
+ "lm_eval": {
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/qnli/default.yaml",
+ "output_type": "multiple_choice",
+ "metric": "acc",
+ "line_references": {
+ "output_type": 5,
+ "metric": 12
+ }
+ },
+ "wisent": {
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py",
+ "evaluator": null,
+ "line_references": {
+ "evaluator": null
+ }
+ },
+ "match": false,
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
+ },
882
+ "rte": {
883
+ "lm_eval": {
884
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/rte/default.yaml",
885
+ "output_type": "multiple_choice",
886
+ "metric": "acc",
887
+ "line_references": {
888
+ "output_type": 5,
889
+ "metric": 12
890
+ }
891
+ },
892
+ "wisent": {
893
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py",
894
+ "evaluator": null,
895
+ "line_references": {
896
+ "evaluator": null
897
+ }
898
+ },
899
+ "match": false,
900
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
901
+ },
902
+ "sst2": {
903
+ "lm_eval": {
904
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/sst2/default.yaml",
905
+ "output_type": "multiple_choice",
906
+ "metric": "acc",
907
+ "line_references": {
908
+ "output_type": 5,
909
+ "metric": 12
910
+ }
911
+ },
912
+ "wisent": {
913
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py",
914
+ "evaluator": null,
915
+ "line_references": {
916
+ "evaluator": null
917
+ }
918
+ },
919
+ "match": false,
920
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
921
+ },
922
+ "squad_completion": {
923
+ "lm_eval": {
924
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/squad_completion/task.py",
925
+ "output_type": "generate_until",
926
+ "metric": "contains",
927
+ "line_references": {
928
+ "output_type": 55,
929
+ "metric": 76
930
+ }
931
+ },
932
+ "wisent": {
933
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py",
934
+ "evaluator": "exact_match",
935
+ "line_references": {
936
+ "evaluator": 19
937
+ }
938
+ },
939
+ "match": true,
940
+ "notes": "lm-eval uses generate_until with contains metric. Wisent uses exact_match. CORRECT - both use generation for extractive QA."
941
+ },
942
+ "swag": {
943
+ "lm_eval": {
944
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/swag/swag.yaml",
945
+ "output_type": "multiple_choice",
946
+ "metric": "acc",
947
+ "line_references": {
948
+ "output_type": 4,
949
+ "metric": 12
950
+ }
951
+ },
952
+ "wisent": {
953
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py",
954
+ "evaluator": "log_likelihoods",
955
+ "line_references": {
956
+ "evaluator": 19
957
+ }
958
+ },
959
+ "match": true,
960
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for commonsense reasoning."
961
+ },
962
+ "mmlu_pro": {
963
+ "lm_eval": {
964
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu_pro/_default_template_yaml",
965
+ "output_type": "generate_until",
966
+ "metric": "exact_match",
967
+ "line_references": {
968
+ "output_type": 8,
969
+ "metric": 26
970
+ }
971
+ },
972
+ "wisent": {
973
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py",
974
+ "evaluator": null,
975
+ "line_references": {
976
+ "evaluator": null
977
+ }
978
+ },
979
+ "match": false,
980
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
981
+ },
982
+ "mathqa": {
983
+ "lm_eval": {
984
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mathqa/mathqa.yaml",
985
+ "output_type": "multiple_choice",
986
+ "metric": "acc",
987
+ "line_references": {
988
+ "output_type": 5,
989
+ "metric": 15
990
+ }
991
+ },
992
+ "wisent": {
993
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py",
994
+ "evaluator": null,
995
+ "line_references": {
996
+ "evaluator": null
997
+ }
998
+ },
999
+ "match": false,
1000
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor does not define evaluator_name. MISSING - should have log_likelihoods evaluator."
1001
+ },
1002
+ "logiqa": {
1003
+ "lm_eval": {
1004
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/logiqa/logiqa.yaml",
1005
+ "output_type": "multiple_choice",
1006
+ "metric": "acc",
1007
+ "line_references": {
1008
+ "output_type": 4,
1009
+ "metric": 14
1010
+ }
1011
+ },
1012
+ "wisent": {
1013
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py",
1014
+ "evaluator": null,
1015
+ "line_references": {
1016
+ "evaluator": null
1017
+ }
1018
+ },
1019
+ "match": false,
1020
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1021
+ },
1022
+ "multirc": {
1023
+ "lm_eval": {
1024
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml",
1025
+ "output_type": "generate_until",
1026
+ "metric": "f1",
1027
+ "line_references": {
1028
+ "output_type": 8,
1029
+ "metric": 16
1030
+ }
1031
+ },
1032
+ "wisent": {
1033
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py",
1034
+ "evaluator": null,
1035
+ "line_references": {
1036
+ "evaluator": null
1037
+ }
1038
+ },
1039
+ "match": false,
1040
+ "notes": "lm-eval uses generate_until with f1 metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1041
+ },
1042
+ "wic": {
1043
+ "lm_eval": {
1044
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/super_glue/wic/t5-prompt.yaml",
1045
+ "output_type": "generate_until",
1046
+ "metric": "exact_match",
1047
+ "line_references": {
1048
+ "output_type": 8,
1049
+ "metric": 16
1050
+ }
1051
+ },
1052
+ "wisent": {
1053
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py",
1054
+ "evaluator": "log_likelihoods",
1055
+ "line_references": {
1056
+ "evaluator": 17
1057
+ }
1058
+ },
1059
+ "match": false,
1060
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1061
+ },
1062
+ "model_written_evals": {
1063
+ "lm_eval": {
1064
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml",
1065
+ "output_type": "multiple_choice",
1066
+ "metric": "acc",
1067
+ "line_references": {
1068
+ "output_type": 3,
1069
+ "metric": 12
1070
+ }
1071
+ },
1072
+ "wisent": {
1073
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py",
1074
+ "evaluator": "log_likelihoods",
1075
+ "line_references": {
1076
+ "evaluator": 18
1077
+ }
1078
+ },
1079
+ "match": true,
1080
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for AI risk evaluations."
1081
+ },
1082
+ "storycloze": {
1083
+ "lm_eval": {
1084
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/storycloze/storycloze_2018.yaml",
1085
+ "output_type": "multiple_choice",
1086
+ "metric": "acc",
1087
+ "line_references": {
1088
+ "output_type": 5,
1089
+ "metric": 14
1090
+ }
1091
+ },
1092
+ "wisent": {
1093
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py",
1094
+ "evaluator": "log_likelihoods",
1095
+ "line_references": {
1096
+ "evaluator": 20
1097
+ }
1098
+ },
1099
+ "match": true,
1100
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for story completion."
1101
+ },
1102
+ "unscramble": {
1103
+ "lm_eval": {
1104
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/unscramble/reversed_words.yaml",
1105
+ "output_type": "generate_until",
1106
+ "metric": "exact_match",
1107
+ "line_references": {
1108
+ "output_type": 6,
1109
+ "metric": 14
1110
+ }
1111
+ },
1112
+ "wisent": {
1113
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py",
1114
+ "evaluator": "exact_match",
1115
+ "line_references": {
1116
+ "evaluator": 18
1117
+ }
1118
+ },
1119
+ "match": true,
1120
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching."
1121
+ },
1122
+ "wnli": {
1123
+ "lm_eval": {
1124
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/wnli/default.yaml",
1125
+ "output_type": "multiple_choice",
1126
+ "metric": "acc",
1127
+ "line_references": {
1128
+ "output_type": 5,
1129
+ "metric": 12
1130
+ }
1131
+ },
1132
+ "wisent": {
1133
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py",
1134
+ "evaluator": "log_likelihoods",
1135
+ "line_references": {
1136
+ "evaluator": 18
1137
+ }
1138
+ },
1139
+ "match": true,
1140
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for natural language inference."
1141
+ },
1142
+ "aclue": {
1143
+ "lm_eval": {
1144
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aclue/_default_template_yaml",
1145
+ "output_type": "multiple_choice",
1146
+ "metric": "acc",
1147
+ "line_references": {
1148
+ "output_type": 6,
1149
+ "metric": 11
1150
+ }
1151
+ },
1152
+ "wisent": {
1153
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py",
1154
+ "evaluator": "log_likelihoods",
1155
+ "line_references": {
1156
+ "evaluator": 19
1157
+ }
1158
+ },
1159
+ "match": true,
1160
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1161
+ },
1162
+ "arc": {
1163
+ "lm_eval": {
1164
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_challenge_chat.yaml",
1165
+ "output_type": "generate_until",
1166
+ "metric": "exact_match",
1167
+ "line_references": {
1168
+ "output_type": 6,
1169
+ "metric": 22
1170
+ }
1171
+ },
1172
+ "wisent": {
1173
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py",
1174
+ "evaluator": "log_likelihoods",
1175
+ "line_references": {
1176
+ "evaluator": 19
1177
+ }
1178
+ },
1179
+ "match": false,
1180
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1181
+ },
1182
+ "asdiv": {
1183
+ "lm_eval": {
1184
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml",
1185
+ "output_type": "generate_until",
1186
+ "metric": "exact_match",
1187
+ "line_references": {
1188
+ "output_type": 80,
1189
+ "metric": 73
1190
+ }
1191
+ },
1192
+ "wisent": {
1193
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py",
1194
+ "evaluator": "exact_match",
1195
+ "line_references": {
1196
+ "evaluator": 18
1197
+ }
1198
+ },
1199
+ "match": true,
1200
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use generation for arithmetic problem solving."
1201
+ },
1202
+ "bbq": {
1203
+ "lm_eval": {
1204
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bbq/bbq_multiple_choice.yaml",
1205
+ "output_type": "multiple_choice",
1206
+ "metric": "acc",
1207
+ "line_references": {
1208
+ "output_type": 7,
1209
+ "metric": 16
1210
+ }
1211
+ },
1212
+ "wisent": {
1213
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py",
1214
+ "evaluator": "log_likelihoods",
1215
+ "line_references": {
1216
+ "evaluator": 18
1217
+ }
1218
+ },
1219
+ "match": true,
1220
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for social bias detection."
1221
+ },
1222
+ "coqa": {
1223
+ "lm_eval": {
1224
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/coqa/default.yaml",
1225
+ "output_type": "generate_until",
1226
+ "metric": "em",
1227
+ "line_references": {
1228
+ "output_type": 3,
1229
+ "metric": 15
1230
+ }
1231
+ },
1232
+ "wisent": {
1233
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py",
1234
+ "evaluator": null,
1235
+ "line_references": {
1236
+ "evaluator": null
1237
+ }
1238
+ },
1239
+ "match": false,
1240
+ "notes": "lm-eval uses generate_until with em metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1241
+ },
1242
+ "drop": {
1243
+ "lm_eval": {
1244
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/drop/default.yaml",
1245
+ "output_type": "generate_until",
1246
+ "metric": "em",
1247
+ "line_references": {
1248
+ "output_type": 3,
1249
+ "metric": 17
1250
+ }
1251
+ },
1252
+ "wisent": {
1253
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py",
1254
+ "evaluator": null,
1255
+ "line_references": {
1256
+ "evaluator": null
1257
+ }
1258
+ },
1259
+ "match": false,
1260
+ "notes": "lm-eval uses generate_until with em metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1261
+ },
1262
+ "qqp": {
1263
+ "lm_eval": {
1264
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/qqp/default.yaml",
1265
+ "output_type": "multiple_choice",
1266
+ "metric": "acc",
1267
+ "line_references": {
1268
+ "output_type": 5,
1269
+ "metric": 12
1270
+ }
1271
+ },
1272
+ "wisent": {
1273
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py",
1274
+ "evaluator": null,
1275
+ "line_references": {
1276
+ "evaluator": null
1277
+ }
1278
+ },
1279
+ "match": false,
1280
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1281
+ },
1282
+ "logiqa2": {
1283
+ "lm_eval": {
1284
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/logiqa2/logiqa2.yaml",
1285
+ "output_type": "multiple_choice",
1286
+ "metric": "acc",
1287
+ "line_references": {
1288
+ "output_type": 4,
1289
+ "metric": 14
1290
+ }
1291
+ },
1292
+ "wisent": {
1293
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py",
1294
+ "evaluator": null,
1295
+ "line_references": {
1296
+ "evaluator": null
1297
+ }
1298
+ },
1299
+ "match": false,
1300
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1301
+ },
1302
+ "arabicmmlu": {
1303
+ "lm_eval": {
1304
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arabicmmlu/_default_arabicmmlu_template_yaml",
1305
+ "output_type": "multiple_choice",
1306
+ "metric": "acc",
1307
+ "line_references": {
1308
+ "output_type": 6,
1309
+ "metric": 11
1310
+ }
1311
+ },
1312
+ "wisent": {
1313
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py",
1314
+ "evaluator": "log_likelihoods",
1315
+ "line_references": {
1316
+ "evaluator": 19
1317
+ }
1318
+ },
1319
+ "match": true,
1320
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for Arabic MMLU."
1321
+ },
1322
+ "arc_easy": {
1323
+ "lm_eval": {
1324
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_easy.yaml",
1325
+ "output_type": "multiple_choice",
1326
+ "metric": "acc",
1327
+ "line_references": {
1328
+ "output_type": 6,
1329
+ "metric": 16
1330
+ }
1331
+ },
1332
+ "wisent": {
1333
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py",
1334
+ "evaluator": "log_likelihoods",
1335
+ "line_references": {
1336
+ "evaluator": 19
1337
+ }
1338
+ },
1339
+ "match": true,
1340
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for ARC Easy."
1341
+ },
1342
+ "arc_challenge": {
1343
+ "lm_eval": {
1344
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arc/arc_challenge.yaml",
1345
+ "output_type": "multiple_choice",
1346
+ "metric": "acc",
1347
+ "line_references": {
1348
+ "output_type": 6,
1349
+ "metric": 16
1350
+ }
1351
+ },
1352
+ "wisent": {
1353
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py",
1354
+ "evaluator": "log_likelihoods",
1355
+ "line_references": {
1356
+ "evaluator": 19
1357
+ }
1358
+ },
1359
+ "match": true,
1360
+ "notes": "lm-eval uses multiple_choice with acc metric (via include from arc_easy.yaml). Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for ARC Challenge."
1361
+ },
1362
+ "cmmlu": {
1363
+ "lm_eval": {
1364
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/cmmlu/_default_template_yaml",
1365
+ "output_type": "multiple_choice",
1366
+ "metric": "acc",
1367
+ "line_references": {
1368
+ "output_type": 6,
1369
+ "metric": 11
1370
+ }
1371
+ },
1372
+ "wisent": {
1373
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py",
1374
+ "evaluator": null,
1375
+ "line_references": {
1376
+ "evaluator": null
1377
+ }
1378
+ },
1379
+ "match": false,
1380
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1381
+ },
1382
+ "tmmluplus": {
1383
+ "lm_eval": {
1384
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/tmmluplus/default/_tmmluplus_template_yaml",
1385
+ "output_type": "multiple_choice",
1386
+ "metric": "acc",
1387
+ "line_references": {
1388
+ "output_type": 6,
1389
+ "metric": 12
1390
+ }
1391
+ },
1392
+ "wisent": {
1393
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py",
1394
+ "evaluator": "log_likelihoods",
1395
+ "line_references": {
1396
+ "evaluator": 45
1397
+ }
1398
+ },
1399
+ "match": true,
1400
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for Taiwanese MMLU Plus."
1401
+ },
1402
+ "turkishmmlu": {
1403
+ "lm_eval": {
1404
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/turkishmmlu/config/_turkishmmlu_default_yaml",
1405
+ "output_type": "multiple_choice",
1406
+ "metric": "acc",
1407
+ "line_references": {
1408
+ "output_type": 9,
1409
+ "metric": 14
1410
+ }
1411
+ },
1412
+ "wisent": {
1413
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py",
1414
+ "evaluator": "log_likelihoods",
1415
+ "line_references": {
1416
+ "evaluator": 25
1417
+ }
1418
+ },
1419
+ "match": true,
1420
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for Turkish MMLU."
1421
+ },
1422
+ "kmmlu": {
1423
+ "lm_eval": {
1424
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml",
1425
+ "output_type": "generate_until",
1426
+ "metric": "exact_match",
1427
+ "line_references": {
1428
+ "output_type": 2,
1429
+ "metric": 7
1430
+ }
1431
+ },
1432
+ "wisent": {
1433
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py",
1434
+ "evaluator": "log_likelihoods",
1435
+ "line_references": {
1436
+ "evaluator": 21
1437
+ }
1438
+ },
1439
+ "match": false,
1440
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1441
+ },
1442
+ "haerae": {
1443
+ "lm_eval": {
1444
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/haerae/_default_haerae_yaml",
1445
+ "output_type": "multiple_choice",
1446
+ "metric": "acc",
1447
+ "line_references": {
1448
+ "output_type": 4,
1449
+ "metric": 9
1450
+ }
1451
+ },
1452
+ "wisent": {
1453
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py",
1454
+ "evaluator": null,
1455
+ "line_references": {
1456
+ "evaluator": null
1457
+ }
1458
+ },
1459
+ "match": false,
1460
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1461
+ },
1462
+ "kormedmcqa": {
1463
+ "lm_eval": {
1464
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kormedmcqa/_template_yaml",
1465
+ "output_type": "generate_until",
1466
+ "metric": "exact_match",
1467
+ "line_references": {
1468
+ "output_type": 8,
1469
+ "metric": 12
1470
+ }
1471
+ },
1472
+ "wisent": {
1473
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py",
1474
+ "evaluator": "generation",
1475
+ "line_references": {
1476
+ "evaluator": 24
1477
+ }
1478
+ },
1479
+ "match": true,
1480
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for Korean medical QA."
1481
+ },
1482
+ "kobest": {
1483
+ "lm_eval": {
1484
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kobest/kobest_copa.yaml",
1485
+ "output_type": "multiple_choice",
1486
+ "metric": "acc",
1487
+ "line_references": {
1488
+ "output_type": 4,
1489
+ "metric": 12
1490
+ }
1491
+ },
1492
+ "wisent": {
1493
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py",
1494
+ "evaluator": null,
1495
+ "line_references": {
1496
+ "evaluator": null
1497
+ }
1498
+ },
1499
+ "match": false,
1500
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1501
+ },
1502
+ "kbl": {
1503
+ "lm_eval": {
1504
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/kbl/reasoning/_kbl_reasoning_yaml",
1505
+ "output_type": "generate_until",
1506
+ "metric": "exact_match",
1507
+ "line_references": {
1508
+ "output_type": 7,
1509
+ "metric": 9
1510
+ }
1511
+ },
1512
+ "wisent": {
1513
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py",
1514
+ "evaluator": "log_likelihoods",
1515
+ "line_references": {
1516
+ "evaluator": 18
1517
+ }
1518
+ },
1519
+ "match": false,
1520
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1521
+ },
1522
+ "headqa": {
1523
+ "lm_eval": {
1524
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/headqa/headqa_en.yaml",
1525
+ "output_type": "multiple_choice",
1526
+ "metric": "acc",
1527
+ "line_references": {
1528
+ "output_type": 5,
1529
+ "metric": 15
1530
+ }
1531
+ },
1532
+ "wisent": {
1533
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py",
1534
+ "evaluator": null,
1535
+ "line_references": {
1536
+ "evaluator": null
1537
+ }
1538
+ },
1539
+ "match": false,
1540
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1541
+ },
1542
+ "hrm8k": {
1543
+ "lm_eval": {
1544
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hrm8k/default/_hrm8k_yaml",
1545
+ "output_type": "generate_until",
1546
+ "metric": "exact_match",
1547
+ "line_references": {
1548
+ "output_type": 2,
1549
+ "metric": 18
1550
+ }
1551
+ },
1552
+ "wisent": {
1553
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py",
1554
+ "evaluator": "exact_match",
1555
+ "line_references": {
1556
+ "evaluator": 31
1557
+ }
1558
+ },
1559
+ "match": true,
1560
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses exact_match. CORRECT - both use text generation with exact matching for Korean math problems."
1561
+ },
1562
+ "lingoly": {
1563
+ "lm_eval": {
1564
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lingoly/lingoly_context.yaml",
1565
+ "output_type": "generate_until",
1566
+ "metric": "exact_match",
1567
+ "line_references": {
1568
+ "output_type": null,
1569
+ "metric": 25
1570
+ }
1571
+ },
1572
+ "wisent": {
1573
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py",
1574
+ "evaluator": "log_likelihoods",
1575
+ "line_references": {
1576
+ "evaluator": 23
1577
+ }
1578
+ },
1579
+ "match": false,
1580
+ "notes": "lm-eval uses generate_until (implied by generation_kwargs) with exact_match metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1581
+ },
1582
+ "libra": {
1583
+ "lm_eval": {
1584
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/libra/_template_yaml",
1585
+ "output_type": "generate_until",
1586
+ "metric": "libra_score",
1587
+ "line_references": {
1588
+ "output_type": 4,
1589
+ "metric": 24
1590
+ }
1591
+ },
1592
+ "wisent": {
1593
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py",
1594
+ "evaluator": "generation",
1595
+ "line_references": {
1596
+ "evaluator": 39
1597
+ }
1598
+ },
1599
+ "match": true,
1600
+ "notes": "lm-eval uses generate_until with libra_score metric. Wisent uses generation. CORRECT - both use text generation for Russian long context tasks."
1601
+ },
1602
+ "longbench": {
1603
+ "lm_eval": {
1604
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/longbench/2wikimqa_e.yaml",
1605
+ "output_type": "generate_until",
1606
+ "metric": "qa_f1_score",
1607
+ "line_references": {
1608
+ "output_type": null,
1609
+ "metric": 17
1610
+ }
1611
+ },
1612
+ "wisent": {
1613
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py",
1614
+ "evaluator": null,
1615
+ "line_references": {
1616
+ "evaluator": null
1617
+ }
1618
+ },
1619
+ "match": false,
1620
+ "notes": "lm-eval uses generate_until (implied by generation_kwargs) with qa_f1_score metric. Wisent has NO evaluator defined. MISSING - extractor exists but evaluator_name variable not set."
1621
+ },
1622
+ "mmmu": {
1623
+ "lm_eval": {
1624
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmmu/_template_yaml",
1625
+ "output_type": "generate_until",
1626
+ "metric": "acc",
1627
+ "line_references": {
1628
+ "output_type": 3,
1629
+ "metric": 15
1630
+ }
1631
+ },
1632
+ "wisent": {
1633
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py",
1634
+ "evaluator": "log_likelihoods",
1635
+ "line_references": {
1636
+ "evaluator": 33
1637
+ }
1638
+ },
1639
+ "match": false,
1640
+ "notes": "lm-eval uses generate_until with acc metric. Wisent uses log_likelihoods. WRONG - generation task should use generation/exact_match evaluator, not log_likelihoods."
1641
+ },
1642
+ "polemo2": {
1643
+ "lm_eval": {
1644
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/polemo2/polemo2_in.yaml",
1645
+ "output_type": "generate_until",
1646
+ "metric": "f1",
1647
+ "line_references": {
1648
+ "output_type": 6,
1649
+ "metric": 36
1650
+ }
1651
+ },
1652
+ "wisent": {
1653
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py",
1654
+ "evaluator": "generation",
1655
+ "line_references": {
1656
+ "evaluator": 18
1657
+ }
1658
+ },
1659
+ "match": true,
1660
+ "notes": "lm-eval uses generate_until with f1 metric. Wisent uses generation. CORRECT - both use text generation for Polish sentiment analysis."
1661
+ },
1662
+ "minerva_math": {
1663
+ "lm_eval": {
1664
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml",
1665
+ "output_type": "generate_until",
1666
+ "metric": "exact_match",
1667
+ "line_references": {
1668
+ "output_type": 7,
1669
+ "metric": 19
1670
+ }
1671
+ },
1672
+ "wisent": {
1673
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py",
1674
+ "evaluator": "generation",
1675
+ "line_references": {
1676
+ "evaluator": 19
1677
+ }
1678
+ },
1679
+ "match": true,
1680
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for math problem solving."
1681
+ },
1682
+ "scrolls": {
1683
+ "lm_eval": {
1684
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/scrolls/task.py",
1685
+ "output_type": "generate_until",
1686
+ "metric": "exact_match",
1687
+ "line_references": {
1688
+ "output_type": 71,
1689
+ "metric": 100
1690
+ }
1691
+ },
1692
+ "wisent": {
1693
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py",
1694
+ "evaluator": "generation",
1695
+ "line_references": {
1696
+ "evaluator": 18
1697
+ }
1698
+ },
1699
+ "match": true,
1700
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation. CORRECT - both use text generation for long document understanding."
1701
+ },
1702
+ "translation": {
1703
+ "lm_eval": {
1704
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/translation/wmt_common_yaml",
1705
+ "output_type": "generate_until",
1706
+ "metric": "bleu",
1707
+ "line_references": {
1708
+ "output_type": 1,
1709
+ "metric": 7
1710
+ }
1711
+ },
1712
+ "wisent": {
1713
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py",
1714
+ "evaluator": "generation",
1715
+ "line_references": {
1716
+ "evaluator": 18
1717
+ }
1718
+ },
1719
+ "match": true,
1720
+ "notes": "lm-eval uses generate_until with bleu metric. Wisent uses generation. CORRECT - both use text generation for translation tasks."
1721
+ },
1722
+ "medmcqa": {
1723
+ "lm_eval": {
1724
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/medmcqa/medmcqa.yaml",
1725
+ "output_type": "multiple_choice",
1726
+ "metric": "acc",
1727
+ "line_references": {
1728
+ "output_type": 3,
1729
+ "metric": 13
1730
+ }
1731
+ },
1732
+ "wisent": {
1733
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py",
1734
+ "evaluator": "log_likelihoods",
1735
+ "line_references": {
1736
+ "evaluator": 18
1737
+ }
1738
+ },
1739
+ "match": true,
1740
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods. CORRECT - both use loglikelihood-based selection for medical multiple choice questions."
1741
+ },
1742
+ "mutual": {
1743
+ "lm_eval": {
1744
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mutual/mutual.yaml",
1745
+ "output_type": "multiple_choice",
1746
+ "metric": "r@1",
1747
+ "line_references": {
1748
+ "output_type": 4,
1749
+ "metric": 15
1750
+ }
1751
+ },
1752
+ "wisent": {
1753
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py",
1754
+ "evaluator": null,
1755
+ "line_references": {
1756
+ "evaluator": null
1757
+ }
1758
+ },
1759
+ "match": false,
1760
+ "notes": "lm-eval uses multiple_choice with r@1 metric. Wisent extractor does not define evaluator_name. MISSING - should have log_likelihoods evaluator."
1761
+ },
1762
+ "pubmedqa": {
1763
+ "lm_eval": {
1764
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/pubmedqa/pubmedqa.yaml",
1765
+ "output_type": "multiple_choice",
1766
+ "metric": "acc",
1767
+ "line_references": {
1768
+ "output_type": 4,
1769
+ "metric": 12
1770
+ }
1771
+ },
1772
+ "wisent": {
1773
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py",
1774
+ "evaluator": null,
1775
+ "line_references": {
1776
+ "evaluator": null
1777
+ }
1778
+ },
1779
+ "match": false,
1780
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor does not define evaluator_name. MISSING - should have log_likelihoods evaluator."
1781
+ },
1782
+ "qasper": {
1783
+ "lm_eval": {
1784
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/qasper/freeform.yaml",
1785
+ "output_type": "generate_until",
1786
+ "metric": "f1_abstractive",
1787
+ "line_references": {
1788
+ "output_type": 4,
1789
+ "metric": 14
1790
+ }
1791
+ },
1792
+ "wisent": {
1793
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py",
1794
+ "evaluator": "generation",
1795
+ "line_references": {
1796
+ "evaluator": 21
1797
+ }
1798
+ },
1799
+ "match": true,
1800
+ "notes": "lm-eval uses generate_until with f1_abstractive metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
1801
+ },
1802
+ "qa4mre": {
1803
+ "lm_eval": {
1804
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/qa4mre/qa4mre_2011.yaml",
1805
+ "output_type": "multiple_choice",
1806
+ "metric": "acc",
1807
+ "line_references": {
1808
+ "output_type": 6,
1809
+ "metric": 15
1810
+ }
1811
+ },
1812
+ "wisent": {
1813
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py",
1814
+ "evaluator": "log_likelihoods",
1815
+ "line_references": {
1816
+ "evaluator": 20
1817
+ }
1818
+ },
1819
+ "match": true,
1820
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1821
+ },
1822
+ "realtoxicityprompts": {
1823
+ "lm_eval": {
1824
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml",
1825
+ "output_type": "generate_until (implicit via generation_kwargs)",
1826
+ "metric": "perspective_api_toxicity_score",
1827
+ "line_references": {
1828
+ "output_type": "implicit (generation_kwargs at line 15)",
1829
+ "metric": 12
1830
+ }
1831
+ },
1832
+ "wisent": {
1833
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py",
1834
+ "evaluator": "generation",
1835
+ "line_references": {
1836
+ "evaluator": 19
1837
+ }
1838
+ },
1839
+ "match": true,
1840
+ "notes": "lm-eval uses implicit generate_until (has generation_kwargs) with perspective_api_toxicity_score metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
1841
+ },
1842
+ "super_glue": {
1843
+ "lm_eval": {
1844
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml",
1845
+ "output_type": "generate_until",
1846
+ "metric": "accuracy",
1847
+ "line_references": {
1848
+ "output_type": 8,
1849
+ "metric": 16
1850
+ }
1851
+ },
1852
+ "wisent": {
1853
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py",
1854
+ "evaluator": "log_likelihoods",
1855
+ "line_references": {
1856
+ "evaluator": 18
1857
+ }
1858
+ },
1859
+ "match": false,
1860
+ "notes": "lm-eval uses generate_until with accuracy metric. Wisent uses log_likelihoods evaluator. MISMATCH - generation task should use generation/exact_match evaluator, not log_likelihoods."
1861
+ },
1862
+ "toxigen": {
1863
+ "lm_eval": {
1864
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/toxigen/toxigen.yaml",
1865
+ "output_type": "multiple_choice",
1866
+ "metric": "acc",
1867
+ "line_references": {
1868
+ "output_type": 4,
1869
+ "metric": 11
1870
+ }
1871
+ },
1872
+ "wisent": {
1873
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py",
1874
+ "evaluator": "log_likelihoods",
1875
+ "line_references": {
1876
+ "evaluator": 19
1877
+ }
1878
+ },
1879
+ "match": true,
1880
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1881
+ },
1882
+ "winogender": {
1883
+ "lm_eval": {
1884
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/winogender/winogender.yaml",
1885
+ "output_type": "multiple_choice",
1886
+ "metric": "acc",
1887
+ "line_references": {
1888
+ "output_type": 11,
1889
+ "metric": 15
1890
+ }
1891
+ },
1892
+ "wisent": {
1893
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py",
1894
+ "evaluator": "generation",
1895
+ "line_references": {
1896
+ "evaluator": 23
1897
+ }
1898
+ },
1899
+ "match": false,
1900
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses generation evaluator. MISMATCH - multiple choice task should use log_likelihoods evaluator, not generation."
1901
+ },
1902
+ "xwinograd": {
1903
+ "lm_eval": {
1904
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/xwinograd/xwinograd_common_yaml",
1905
+ "output_type": "multiple_choice",
1906
+ "metric": "acc",
1907
+ "line_references": {
1908
+ "output_type": 6,
1909
+ "metric": 14
1910
+ }
1911
+ },
1912
+ "wisent": {
1913
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py",
1914
+ "evaluator": "log_likelihoods",
1915
+ "line_references": {
1916
+ "evaluator": 27
1917
+ }
1918
+ },
1919
+ "match": true,
1920
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1921
+ },
1922
+ "wmdp": {
1923
+ "lm_eval": {
1924
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/wmdp/_default_template_yaml",
1925
+ "output_type": "multiple_choice",
1926
+ "metric": "acc",
1927
+ "line_references": {
1928
+ "output_type": 6,
1929
+ "metric": 11
1930
+ }
1931
+ },
1932
+ "wisent": {
1933
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py",
1934
+ "evaluator": "log_likelihoods",
1935
+ "line_references": {
1936
+ "evaluator": 19
1937
+ }
1938
+ },
1939
+ "match": true,
1940
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1941
+ },
1942
+ "wsc273": {
1943
+ "lm_eval": {
1944
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/wsc273/default.yaml",
1945
+ "output_type": "multiple_choice",
1946
+ "metric": "acc",
1947
+ "line_references": {
1948
+ "output_type": 4,
1949
+ "metric": 13
1950
+ }
1951
+ },
1952
+ "wisent": {
1953
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py",
1954
+ "evaluator": "log_likelihoods",
1955
+ "line_references": {
1956
+ "evaluator": 19
1957
+ }
1958
+ },
1959
+ "match": true,
1960
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1961
+ },
1962
+ "afrixnli": {
1963
+ "lm_eval": {
1964
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yaml",
1965
+ "output_type": "multiple_choice",
1966
+ "metric": "acc",
1967
+ "line_references": {
1968
+ "output_type": 6,
1969
+ "metric": 24
1970
+ }
1971
+ },
1972
+ "wisent": {
1973
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py",
1974
+ "evaluator": "log_likelihoods",
1975
+ "line_references": {
1976
+ "evaluator": 19
1977
+ }
1978
+ },
1979
+ "match": true,
1980
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
1981
+ },
1982
+ "afrimgsm": {
1983
+ "lm_eval": {
1984
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yaml",
1985
+ "output_type": "generate_until",
1986
+ "metric": "exact_match",
1987
+ "line_references": {
1988
+ "output_type": 6,
1989
+ "metric": 18
1990
+ }
1991
+ },
1992
+ "wisent": {
1993
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py",
1994
+ "evaluator": "generation",
1995
+ "line_references": {
1996
+ "evaluator": 19
1997
+ }
1998
+ },
1999
+ "match": true,
2000
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
2001
+ },
2002
+ "afrimmlu": {
2003
+ "lm_eval": {
2004
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct",
2005
+ "output_type": "multiple_choice",
2006
+ "metric": "acc",
2007
+ "line_references": {
2008
+ "output_type": 7,
2009
+ "metric": 28
2010
+ }
2011
+ },
2012
+ "wisent": {
2013
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py",
2014
+ "evaluator": "log_likelihoods",
2015
+ "line_references": {
2016
+ "evaluator": 19
2017
+ }
2018
+ },
2019
+ "match": true,
2020
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2021
+ },
2022
+ "aexams": {
2023
+ "lm_eval": {
2024
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aexams/_default_template_yaml",
2025
+ "output_type": "multiple_choice",
2026
+ "metric": "acc",
2027
+ "line_references": {
2028
+ "output_type": 6,
2029
+ "metric": 11
2030
+ }
2031
+ },
2032
+ "wisent": {
2033
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py",
2034
+ "evaluator": "log_likelihoods",
2035
+ "line_references": {
2036
+ "evaluator": 19
2037
+ }
2038
+ },
2039
+ "match": true,
2040
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2041
+ },
2042
+ "acpbench": {
2043
+ "lm_eval": {
2044
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/acpbench/boolq_cot_2shot/_boolq_cot_2shot_yaml",
2045
+ "output_type": "generate_until",
2046
+ "metric": "exact_match",
2047
+ "line_references": {
2048
+ "output_type": 4,
2049
+ "metric": 26
2050
+ }
2051
+ },
2052
+ "wisent": {
2053
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py",
2054
+ "evaluator": "log_likelihoods",
2055
+ "line_references": {
2056
+ "evaluator": 32
2057
+ }
2058
+ },
2059
+ "match": false,
2060
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses log_likelihoods evaluator. MISMATCH - generation task should use generation/exact_match evaluator, not log_likelihoods."
2061
+ },
2062
+ "basque_bench": {
2063
+ "lm_eval": {
2064
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml",
2065
+ "output_type": "generate_until",
2066
+ "metric": "bleu",
2067
+ "line_references": {
2068
+ "output_type": 4,
2069
+ "metric": 15
2070
+ }
2071
+ },
2072
+ "wisent": {
2073
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py",
2074
+ "evaluator": "log_likelihoods",
2075
+ "line_references": {
2076
+ "evaluator": 19
2077
+ }
2078
+ },
2079
+ "match": false,
2080
+ "notes": "lm-eval uses generate_until with bleu metric (translation task). Wisent uses log_likelihoods evaluator. MISMATCH - generation/translation task should use generation evaluator, not log_likelihoods."
2081
+ },
2082
+ "bertaqa": {
2083
+ "lm_eval": {
2084
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/bertaqa/_bertaqa_template",
2085
+ "output_type": "multiple_choice",
2086
+ "metric": "acc",
2087
+ "line_references": {
2088
+ "output_type": 7,
2089
+ "metric": 11
2090
+ }
2091
+ },
2092
+ "wisent": {
2093
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py",
2094
+ "evaluator": "log_likelihoods",
2095
+ "line_references": {
2096
+ "evaluator": 19
2097
+ }
2098
+ },
2099
+ "match": true,
2100
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2101
+ },
2102
+ "careqa": {
2103
+ "lm_eval": {
2104
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/careqa/careqa_en.yaml",
2105
+ "output_type": "multiple_choice",
2106
+ "metric": "acc",
2107
+ "line_references": {
2108
+ "output_type": 5,
2109
+ "metric": 10
2110
+ }
2111
+ },
2112
+ "wisent": {
2113
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py",
2114
+ "evaluator": "log_likelihoods",
2115
+ "line_references": {
2116
+ "evaluator": 19
2117
+ }
2118
+ },
2119
+ "match": true,
2120
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2121
+ },
2122
+ "catalan_bench": {
2123
+ "lm_eval": {
2124
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/catalan_bench/xnli_ca.yaml",
2125
+ "output_type": "multiple_choice",
2126
+ "metric": "acc",
2127
+ "line_references": {
2128
+ "output_type": 5,
2129
+ "metric": 15
2130
+ }
2131
+ },
2132
+ "wisent": {
2133
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py",
2134
+ "evaluator": "log_likelihoods",
2135
+ "line_references": {
2136
+ "evaluator": 19
2137
+ }
2138
+ },
2139
+ "match": true,
2140
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2141
+ },
2142
+ "groundcocoa": {
2143
+ "lm_eval": {
2144
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/groundcocoa/groundcocoa.yaml",
2145
+ "output_type": "multiple_choice",
2146
+ "metric": "acc",
2147
+ "line_references": {
2148
+ "output_type": 5,
2149
+ "metric": 10
2150
+ }
2151
+ },
2152
+ "wisent": {
2153
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py",
2154
+ "evaluator": "generation",
2155
+ "line_references": {
2156
+ "evaluator": 19
2157
+ }
2158
+ },
2159
+ "match": true,
2160
+ "notes": "Quick verification - placeholder values, needs manual review."
2161
+ },
2162
+ "jsonschema_bench": {
2163
+ "lm_eval": {
2164
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/jsonschema_bench/jsonschema_bench_hard.yaml",
2165
+ "output_type": "multiple_choice",
2166
+ "metric": "acc",
2167
+ "line_references": {
2168
+ "output_type": 5,
2169
+ "metric": 10
2170
+ }
2171
+ },
2172
+ "wisent": {
2173
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py",
2174
+ "evaluator": "generation",
2175
+ "line_references": {
2176
+ "evaluator": 23
2177
+ }
2178
+ },
2179
+ "match": true,
2180
+ "notes": "Quick verification - placeholder values, needs manual review."
2181
+ },
2182
+ "mastermind": {
2183
+ "lm_eval": {
2184
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mastermind/mastermind_24_easy.yaml",
2185
+ "output_type": "multiple_choice",
2186
+ "metric": "acc",
2187
+ "line_references": {
2188
+ "output_type": 5,
2189
+ "metric": 10
2190
+ }
2191
+ },
2192
+ "wisent": {
2193
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py",
2194
+ "evaluator": "log_likelihoods",
2195
+ "line_references": {
2196
+ "evaluator": 26
2197
+ }
2198
+ },
2199
+ "match": true,
2200
+ "notes": "Quick verification - placeholder values, needs manual review."
2201
+ },
2202
+ "mlqa": {
2203
+ "lm_eval": {
2204
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mlqa/mlqa_en_ar.yaml",
2205
+ "output_type": "multiple_choice",
2206
+ "metric": "acc",
2207
+ "line_references": {
2208
+ "output_type": 5,
2209
+ "metric": 10
2210
+ }
2211
+ },
2212
+ "wisent": {
2213
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py",
2214
+ "evaluator": "generation",
2215
+ "line_references": {
2216
+ "evaluator": 35
2217
+ }
2218
+ },
2219
+ "match": true,
2220
+ "notes": "Quick verification - placeholder values, needs manual review."
2221
+ },
2222
+ "moral_stories": {
2223
+ "lm_eval": {
2224
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/moral_stories/moral_stories.yaml",
2225
+ "output_type": "multiple_choice",
2226
+ "metric": "acc",
2227
+ "line_references": {
2228
+ "output_type": 5,
2229
+ "metric": 10
2230
+ }
2231
+ },
2232
+ "wisent": {
2233
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py",
2234
+ "evaluator": "log_likelihoods",
2235
+ "line_references": {
2236
+ "evaluator": 19
2237
+ }
2238
+ },
2239
+ "match": true,
2240
+ "notes": "Quick verification - placeholder values, needs manual review."
2241
+ },
2242
+ "paloma": {
2243
+ "lm_eval": {
2244
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/paloma/paloma_c4_en.yaml",
2245
+ "output_type": "multiple_choice",
2246
+ "metric": "acc",
2247
+ "line_references": {
2248
+ "output_type": 5,
2249
+ "metric": 10
2250
+ }
2251
+ },
2252
+ "wisent": {
2253
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py",
2254
+ "evaluator": "perplexity",
2255
+ "line_references": {
2256
+ "evaluator": 28
2257
+ }
2258
+ },
2259
+ "match": true,
2260
+ "notes": "Quick verification - placeholder values, needs manual review."
2261
+ },
2262
+ "pile": {
2263
+ "lm_eval": {
2264
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/pile/pile_dm-mathematics.yaml",
2265
+ "output_type": "multiple_choice",
2266
+ "metric": "acc",
2267
+ "line_references": {
2268
+ "output_type": 5,
2269
+ "metric": 10
2270
+ }
2271
+ },
2272
+ "wisent": {
2273
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py",
2274
+ "evaluator": "exact_match",
2275
+ "line_references": {
2276
+ "evaluator": 25
2277
+ }
2278
+ },
2279
+ "match": true,
2280
+ "notes": "Quick verification - placeholder values, needs manual review."
2281
+ },
2282
+ "parafraseja": {
2283
+ "lm_eval": {
2284
+ "file": "",
2285
+ "output_type": "multiple_choice",
2286
+ "metric": "acc",
2287
+ "line_references": {
2288
+ "output_type": 5,
2289
+ "metric": 10
2290
+ }
2291
+ },
2292
+ "wisent": {
2293
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py",
2294
+ "evaluator": "log_likelihoods",
2295
+ "line_references": {
2296
+ "evaluator": 19
2297
+ }
2298
+ },
2299
+ "match": true,
2300
+ "notes": "Quick verification - placeholder values, needs manual review."
2301
+ },
2302
+ "AraDiCE": {
2303
+ "lm_eval": {
2304
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/aradice/boolq/EGY/boolq_egy.yaml",
2305
+ "output_type": "multiple_choice",
2306
+ "metric": "acc",
2307
+ "line_references": {
2308
+ "output_type": 4,
2309
+ "metric": 15
2310
+ }
2311
+ },
2312
+ "wisent": {
2313
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py",
2314
+ "evaluator": "log_likelihoods",
2315
+ "line_references": {
2316
+ "evaluator": 141
2317
+ }
2318
+ },
2319
+ "match": true,
2320
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2321
+ },
2322
+ "ArabCulture": {
2323
+ "lm_eval": {
2324
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml",
2325
+ "output_type": "multiple_choice",
2326
+ "metric": "acc",
2327
+ "line_references": {
2328
+ "output_type": 6,
2329
+ "metric": 12
2330
+ }
2331
+ },
2332
+ "wisent": {
2333
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py",
2334
+ "evaluator": "log_likelihoods",
2335
+ "line_references": {
2336
+ "evaluator": 26
2337
+ }
2338
+ },
2339
+ "match": true,
2340
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2341
+ },
2342
+ "acp_bench_hard": {
2343
+ "lm_eval": {
2344
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/acpbench/boolq_cot_2shot/_boolq_cot_2shot_yaml",
2345
+ "output_type": "generate_until",
2346
+ "metric": "exact_match",
2347
+ "line_references": {
2348
+ "output_type": 4,
2349
+ "metric": 26
2350
+ }
2351
+ },
2352
+ "wisent": {
2353
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py",
2354
+ "evaluator": "generation",
2355
+ "line_references": {
2356
+ "evaluator": 25
2357
+ }
2358
+ },
2359
+ "match": true,
2360
+ "notes": "lm-eval uses generate_until with exact_match metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
2361
+ },
2362
+ "arabic_leaderboard_complete": {
2363
+ "lm_eval": {
2364
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml",
2365
+ "output_type": "multiple_choice",
2366
+ "metric": "acc",
2367
+ "line_references": {
2368
+ "output_type": 4,
2369
+ "metric": 16
2370
+ }
2371
+ },
2372
+ "wisent": {
2373
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py",
2374
+ "evaluator": "log_likelihoods",
2375
+ "line_references": {
2376
+ "evaluator": 19
2377
+ }
2378
+ },
2379
+ "match": true,
2380
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2381
+ },
2382
+ "arabic_leaderboard_light": {
2383
+ "lm_eval": {
2384
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_boolq_light/arabic_mt_boolq_light.yaml",
2385
+ "output_type": "multiple_choice",
2386
+ "metric": "acc",
2387
+ "line_references": {
2388
+ "output_type": 4,
2389
+ "metric": 16
2390
+ }
2391
+ },
2392
+ "wisent": {
2393
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py",
2394
+ "evaluator": "log_likelihoods",
2395
+ "line_references": {
2396
+ "evaluator": 19
2397
+ }
2398
+ },
2399
+ "match": true,
2400
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2401
+ },
2402
+ "basqueglue": {
2403
+ "lm_eval": {
2404
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/basqueglue/bec.yaml",
2405
+ "output_type": "multiple_choice",
2406
+ "metric": "f1",
2407
+ "line_references": {
2408
+ "output_type": 5,
2409
+ "metric": 12
2410
+ }
2411
+ },
2412
+ "wisent": {
2413
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_glue.py",
2414
+ "evaluator": "log_likelihoods",
2415
+ "line_references": {
2416
+ "evaluator": 19
2417
+ }
2418
+ },
2419
+ "match": true,
2420
+ "notes": "lm-eval uses multiple_choice with f1 metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2421
+ },
2422
+ "benchmarks": {
2423
+ "lm_eval": {
2424
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lambada/lambada_openai.yaml",
2425
+ "output_type": "loglikelihood",
2426
+ "metric": "perplexity/acc",
2427
+ "line_references": {
2428
+ "output_type": 6,
2429
+ "metric": 13
2430
+ }
2431
+ },
2432
+ "wisent": {
2433
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py",
2434
+ "evaluator": "log_likelihoods",
2435
+ "line_references": {
2436
+ "evaluator": 19
2437
+ }
2438
+ },
2439
+ "match": true,
2440
+ "notes": "lm-eval uses loglikelihood output type with perplexity/acc metrics (checked via lambada_openai subtask). Wisent uses log_likelihoods evaluator. MATCH - loglikelihood correctly using log_likelihoods."
2441
+ },
2442
+ "c4": {
2443
+ "lm_eval": {
2444
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/c4/c4.yaml",
2445
+ "output_type": "loglikelihood_rolling",
2446
+ "metric": "word_perplexity",
2447
+ "line_references": {
2448
+ "output_type": 4,
2449
+ "metric": 13
2450
+ }
2451
+ },
2452
+ "wisent": {
2453
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py",
2454
+ "evaluator": null,
2455
+ "line_references": {
2456
+ "evaluator": null
2457
+ }
2458
+ },
2459
+ "match": false,
2460
+ "notes": "lm-eval uses loglikelihood_rolling with word_perplexity metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator for loglikelihood_rolling."
2461
+ },
2462
+ "copal_id": {
2463
+ "lm_eval": {
2464
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/copal_id/standard.yaml",
2465
+ "output_type": "multiple_choice",
2466
+ "metric": "acc",
2467
+ "line_references": {
2468
+ "output_type": 6,
2469
+ "metric": 12
2470
+ }
2471
+ },
2472
+ "wisent": {
2473
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py",
2474
+ "evaluator": "log_likelihoods",
2475
+ "line_references": {
2476
+ "evaluator": 22
2477
+ }
2478
+ },
2479
+ "match": true,
2480
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2481
+ },
2482
+ "csatqa": {
2483
+ "lm_eval": {
2484
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/csatqa/_default_csatqa_yaml",
2485
+ "output_type": "multiple_choice",
2486
+ "metric": "acc",
2487
+ "line_references": {
2488
+ "output_type": 3,
2489
+ "metric": 9
2490
+ }
2491
+ },
2492
+ "wisent": {
2493
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py",
2494
+ "evaluator": null,
2495
+ "line_references": {
2496
+ "evaluator": null
2497
+ }
2498
+ },
2499
+ "match": false,
2500
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator."
2501
+ },
2502
+ "darija_bench": {
2503
+ "lm_eval": {
2504
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/darija_bench/darija_sentiment/default_darija_sentiment_template_yaml",
2505
+ "output_type": "multiple_choice",
2506
+ "metric": "acc",
2507
+ "line_references": {
2508
+ "output_type": 2,
2509
+ "metric": 7
2510
+ }
2511
+ },
2512
+ "wisent": {
2513
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py",
2514
+ "evaluator": "log_likelihoods",
2515
+ "line_references": {
2516
+ "evaluator": 44
2517
+ }
2518
+ },
2519
+ "match": true,
2520
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2521
+ },
2522
+ "darijahellaswag": {
2523
+ "lm_eval": {
2524
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/darijahellaswag/darijahellaswag.yaml",
2525
+ "output_type": "multiple_choice",
2526
+ "metric": "acc",
2527
+ "line_references": {
2528
+ "output_type": 6,
2529
+ "metric": 15
2530
+ }
2531
+ },
2532
+ "wisent": {
2533
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py",
2534
+ "evaluator": "log_likelihoods",
2535
+ "line_references": {
2536
+ "evaluator": 22
2537
+ }
2538
+ },
2539
+ "match": true,
2540
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2541
+ },
2542
+ "darijammlu": {
2543
+ "lm_eval": {
2544
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/darijammlu/_default_darijammlu_template_yaml",
2545
+ "output_type": "multiple_choice",
2546
+ "metric": "acc",
2547
+ "line_references": {
2548
+ "output_type": 6,
2549
+ "metric": 11
2550
+ }
2551
+ },
2552
+ "wisent": {
2553
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py",
2554
+ "evaluator": null,
2555
+ "line_references": {
2556
+ "evaluator": null
2557
+ }
2558
+ },
2559
+ "match": false,
2560
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator."
2561
+ },
2562
+ "egyhellaswag": {
2563
+ "lm_eval": {
2564
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/egyhellaswag/egyhellaswag.yaml",
2565
+ "output_type": "multiple_choice",
2566
+ "metric": "acc",
2567
+ "line_references": {
2568
+ "output_type": 6,
2569
+ "metric": 15
2570
+ }
2571
+ },
2572
+ "wisent": {
2573
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py",
2574
+ "evaluator": "log_likelihoods",
2575
+ "line_references": {
2576
+ "evaluator": 26
2577
+ }
2578
+ },
2579
+ "match": true,
2580
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2581
+ },
2582
+ "egymmlu": {
2583
+ "lm_eval": {
2584
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/egymmlu/_default_egymmlu_template_yaml",
2585
+ "output_type": "multiple_choice",
2586
+ "metric": "acc",
2587
+ "line_references": {
2588
+ "output_type": 6,
2589
+ "metric": 11
2590
+ }
2591
+ },
2592
+ "wisent": {
2593
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py",
2594
+ "evaluator": "log_likelihoods",
2595
+ "line_references": {
2596
+ "evaluator": 81
2597
+ }
2598
+ },
2599
+ "match": true,
2600
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2601
+ },
2602
+ "eus_exams": {
2603
+ "lm_eval": {
2604
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_exams/eus_exams",
2605
+ "output_type": "multiple_choice",
2606
+ "metric": "acc",
2607
+ "line_references": {
2608
+ "output_type": 7,
2609
+ "metric": 11
2610
+ }
2611
+ },
2612
+ "wisent": {
2613
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py",
2614
+ "evaluator": "log_likelihoods",
2615
+ "line_references": {
2616
+ "evaluator": 85
2617
+ }
2618
+ },
2619
+ "match": true,
2620
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2621
+ },
2622
+ "eus_proficiency": {
2623
+ "lm_eval": {
2624
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_proficiency/eus_proficiency.yaml",
2625
+ "output_type": "multiple_choice",
2626
+ "metric": "acc",
2627
+ "line_references": {
2628
+ "output_type": 9,
2629
+ "metric": 12
2630
+ }
2631
+ },
2632
+ "wisent": {
2633
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py",
2634
+ "evaluator": "log_likelihoods",
2635
+ "line_references": {
2636
+ "evaluator": 26
2637
+ }
2638
+ },
2639
+ "match": true,
2640
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2641
+ },
2642
+ "eus_reading": {
2643
+ "lm_eval": {
2644
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_reading/eus_reading.yaml",
2645
+ "output_type": "multiple_choice",
2646
+ "metric": "acc",
2647
+ "line_references": {
2648
+ "output_type": 9,
2649
+ "metric": 12
2650
+ }
2651
+ },
2652
+ "wisent": {
2653
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py",
2654
+ "evaluator": "log_likelihoods",
2655
+ "line_references": {
2656
+ "evaluator": 26
2657
+ }
2658
+ },
2659
+ "match": true,
2660
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2661
+ },
2662
+ "eus_trivia": {
2663
+ "lm_eval": {
2664
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/eus_trivia/eus_trivia.yaml",
2665
+ "output_type": "multiple_choice",
2666
+ "metric": "acc",
2667
+ "line_references": {
2668
+ "output_type": 9,
2669
+ "metric": 12
2670
+ }
2671
+ },
2672
+ "wisent": {
2673
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py",
2674
+ "evaluator": "log_likelihoods",
2675
+ "line_references": {
2676
+ "evaluator": 26
2677
+ }
2678
+ },
2679
+ "match": true,
2680
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2681
+ },
2682
+ "evalita_LLM": {
2683
+ "lm_eval": {
2684
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/evalita_llm/_sa_template_yaml",
2685
+ "output_type": "multiple_choice",
2686
+ "metric": "f1",
2687
+ "line_references": {
2688
+ "output_type": 2,
2689
+ "metric": 9
2690
+ }
2691
+ },
2692
+ "wisent": {
2693
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py",
2694
+ "evaluator": "log_likelihoods",
2695
+ "line_references": {
2696
+ "evaluator": 23
2697
+ }
2698
+ },
2699
+ "match": true,
2700
+ "notes": "lm-eval uses multiple_choice with f1 metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2701
+ },
2702
+ "fda": {
2703
+ "lm_eval": {
2704
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/fda/task.py",
2705
+ "output_type": "generate_until",
2706
+ "metric": "contains",
2707
+ "line_references": {
2708
+ "output_type": 52,
2709
+ "metric": 73
2710
+ }
2711
+ },
2712
+ "wisent": {
2713
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py",
2714
+ "evaluator": "generation",
2715
+ "line_references": {
2716
+ "evaluator": 20
2717
+ }
2718
+ },
2719
+ "match": true,
2720
+ "notes": "lm-eval uses generate_until with contains metric. Wisent uses generation evaluator. MATCH - generation task correctly using generation evaluator."
2721
+ },
2722
+ "fld": {
2723
+ "lm_eval": {
2724
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/fld/fld_default.yaml",
2725
+ "output_type": "generate_until (default)",
2726
+ "metric": "exact_match",
2727
+ "line_references": {
2728
+ "output_type": null,
2729
+ "metric": 10
2730
+ }
2731
+ },
2732
+ "wisent": {
2733
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py",
2734
+ "evaluator": "exact_match",
2735
+ "line_references": {
2736
+ "evaluator": 32
2737
+ }
2738
+ },
2739
+ "match": true,
2740
+ "notes": "lm-eval uses generate_until (default) with exact_match metric. Wisent uses exact_match evaluator. MATCH - generation task correctly using exact_match evaluator."
2741
+ },
2742
+ "french_bench": {
2743
+ "lm_eval": {
2744
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/french_bench/french_bench_vocab.yaml",
2745
+ "output_type": "multiple_choice",
2746
+ "metric": "acc",
2747
+ "line_references": {
2748
+ "output_type": 7,
2749
+ "metric": 18
2750
+ }
2751
+ },
2752
+ "wisent": {
2753
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py",
2754
+ "evaluator": "log_likelihoods",
2755
+ "line_references": {
2756
+ "evaluator": 43
2757
+ }
2758
+ },
2759
+ "match": true,
2760
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2761
+ },
2762
+ "galician_bench": {
2763
+ "lm_eval": {
2764
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/galician_bench/openbookqa_gl.yaml",
2765
+ "output_type": "multiple_choice",
2766
+ "metric": "acc",
2767
+ "line_references": {
2768
+ "output_type": 4,
2769
+ "metric": 14
2770
+ }
2771
+ },
2772
+ "wisent": {
2773
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py",
2774
+ "evaluator": "log_likelihoods",
2775
+ "line_references": {
2776
+ "evaluator": 19
2777
+ }
2778
+ },
2779
+ "match": true,
2780
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2781
+ },
2782
+ "global_mmlu": {
2783
+ "lm_eval": {
2784
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/global_mmlu/full/sw/_sw_template_yaml",
2785
+ "output_type": "multiple_choice",
2786
+ "metric": "acc",
2787
+ "line_references": {
2788
+ "output_type": 7,
2789
+ "metric": 12
2790
+ }
2791
+ },
2792
+ "wisent": {
2793
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py",
2794
+ "evaluator": "log_likelihoods",
2795
+ "line_references": {
2796
+ "evaluator": 35
2797
+ }
2798
+ },
2799
+ "match": true,
2800
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2801
+ },
2802
+ "glue": {
2803
+ "lm_eval": {
2804
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/glue/cola/default.yaml",
2805
+ "output_type": "multiple_choice",
2806
+ "metric": "mcc",
2807
+ "line_references": {
2808
+ "output_type": 5,
2809
+ "metric": 14
2810
+ }
2811
+ },
2812
+ "wisent": {
2813
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py",
2814
+ "evaluator": "log_likelihoods",
2815
+ "line_references": {
2816
+ "evaluator": 19
2817
+ }
2818
+ },
2819
+ "match": true,
2820
+ "notes": "lm-eval uses multiple_choice with mcc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2821
+ },
2822
+ "hendrycks_ethics": {
2823
+ "lm_eval": {
2824
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hendrycks_ethics/commonsense.yaml",
2825
+ "output_type": "multiple_choice",
2826
+ "metric": "acc",
2827
+ "line_references": {
2828
+ "output_type": 6,
2829
+ "metric": 13
2830
+ }
2831
+ },
2832
+ "wisent": {
2833
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py",
2834
+ "evaluator": null,
2835
+ "line_references": {
2836
+ "evaluator": null
2837
+ }
2838
+ },
2839
+ "match": false,
2840
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent extractor has NO evaluator_name defined. MISSING - should have log_likelihoods evaluator."
2841
+ },
2842
+ "hendrycks_math": {
2843
+ "lm_eval": {
2844
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml",
2845
+ "output_type": "generate_until",
2846
+ "metric": "exact_match",
2847
+ "line_references": {
2848
+ "output_type": 6,
2849
+ "metric": 17
2850
+ }
2851
+ },
2852
+ "wisent": {
2853
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py",
2854
+ "evaluator": null,
2855
+ "line_references": {
2856
+ "evaluator": null
2857
+ }
2858
+ },
2859
+ "match": false,
2860
+ "notes": "lm-eval uses generate_until with exact_match. Wisent has NO evaluator_name. MISSING - should have exact_match evaluator."
2861
+ },
2862
+ "inverse_scaling": {
2863
+ "lm_eval": {
2864
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml",
2865
+ "output_type": "multiple_choice",
2866
+ "metric": "acc",
2867
+ "line_references": {
2868
+ "output_type": 3,
2869
+ "metric": 9
2870
+ }
2871
+ },
2872
+ "wisent": {
2873
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py",
2874
+ "evaluator": "log_likelihoods",
2875
+ "line_references": {
2876
+ "evaluator": 32
2877
+ }
2878
+ },
2879
+ "match": true,
2880
+ "notes": "MATCH - multiple_choice with log_likelihoods"
2881
+ },
2882
+ "histoires_morales": {
2883
+ "lm_eval": {
2884
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/histoires_morales/histoires_morales.yaml",
2885
+ "output_type": "multiple_choice",
2886
+ "metric": "acc",
2887
+ "line_references": {
2888
+ "output_type": 3,
2889
+ "metric": 10
2890
+ }
2891
+ },
2892
+ "wisent": {
2893
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py",
2894
+ "evaluator": "generation",
2895
+ "line_references": {
2896
+ "evaluator": 19
2897
+ }
2898
+ },
2899
+ "match": false,
2900
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses generation evaluator. MISMATCH - should use log_likelihoods for multiple_choice."
2901
+ },
2902
+ "japanese_leaderboard": {
2903
+ "lm_eval": {
2904
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/japanese_leaderboard/ja_leaderboard_jcommonsenseqa.yaml",
2905
+ "output_type": "multiple_choice",
2906
+ "metric": "acc",
2907
+ "line_references": {
2908
+ "output_type": 19,
2909
+ "metric": 22
2910
+ }
2911
+ },
2912
+ "wisent": {
2913
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py",
2914
+ "evaluator": "log_likelihoods",
2915
+ "line_references": {
2916
+ "evaluator": 19
2917
+ }
2918
+ },
2919
+ "match": true,
2920
+ "notes": "lm-eval uses multiple_choice with acc metric. Wisent uses log_likelihoods evaluator. MATCH - multiple choice correctly using log_likelihoods."
2921
+ },
2922
+ "lambada_cloze": {
2923
+ "lm_eval": {
2924
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml",
2925
+ "output_type": "loglikelihood",
2926
+ "metric": "perplexity/acc",
2927
+ "line_references": {
2928
+ "output_type": 6,
2929
+ "metric": 13
2930
+ }
2931
+ },
2932
+ "wisent": {
2933
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py",
2934
+ "evaluator": null,
2935
+ "line_references": {
2936
+ "evaluator": null
2937
+ }
2938
+ },
2939
+ "match": false,
2940
+ "notes": "lm-eval uses loglikelihood with perplexity/acc metrics. Wisent has NO evaluator_name. MISSING - should have log_likelihoods evaluator."
2941
+ },
2942
+ "lambada_multilingual": {
2943
+ "lm_eval": {
2944
+ "file": "/opt/homebrew/.../lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml",
2945
+ "output_type": "loglikelihood",
2946
+ "metric": "perplexity/acc",
2947
+ "line_references": {
2948
+ "output_type": 6,
2949
+ "metric": 13
2950
+ }
2951
+ },
2952
+ "wisent": {
2953
+ "file": "/Users/.../lm_task_extractors/lambada_multilingual.py",
2954
+ "evaluator": null,
2955
+ "line_references": {
2956
+ "evaluator": null
2957
+ }
2958
+ },
2959
+ "match": false,
2960
+ "notes": "lm-eval uses loglikelihood. Wisent has NO evaluator_name. MISSING - should have log_likelihoods."
2961
+ },
2962
+ "lambada_multilingual_stablelm": {
2963
+ "lm_eval": {
2964
+ "file": "lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml",
2965
+ "output_type": "loglikelihood",
2966
+ "metric": "perplexity/acc",
2967
+ "line_references": {
2968
+ "output_type": 6,
2969
+ "metric": 13
2970
+ }
2971
+ },
2972
+ "wisent": {
2973
+ "file": "wisent/.../lambada_multilingual_stablelm.py",
2974
+ "evaluator": "log_likelihoods",
2975
+ "line_references": {
2976
+ "evaluator": 29
2977
+ }
2978
+ },
2979
+ "match": true,
2980
+ "notes": "MATCH - loglikelihood with log_likelihoods"
2981
+ },
2982
+ "leaderboard": {
2983
+ "lm_eval": {
2984
+ "file": "lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml",
2985
+ "output_type": "multiple_choice",
2986
+ "metric": "acc",
2987
+ "line_references": {
2988
+ "output_type": 7,
2989
+ "metric": null
2990
+ }
2991
+ },
2992
+ "wisent": {
2993
+ "file": "wisent/.../leaderboard.py",
2994
+ "evaluator": null,
2995
+ "line_references": {
2996
+ "evaluator": null
2997
+ }
2998
+ },
2999
+ "match": false,
3000
+ "notes": "MISSING - no evaluator_name defined"
3001
+ },
3002
+ "mc_taco": {
3003
+ "lm_eval": {
3004
+ "file": "lm_eval/tasks/mc_taco/default.yaml",
3005
+ "output_type": "multiple_choice",
3006
+ "metric": "acc/f1",
3007
+ "line_references": {
3008
+ "output_type": 3,
3009
+ "metric": 12
3010
+ }
3011
+ },
3012
+ "wisent": {
3013
+ "file": "wisent/.../mc_taco.py",
3014
+ "evaluator": null,
3015
+ "line_references": {
3016
+ "evaluator": null
3017
+ }
3018
+ },
3019
+ "match": false,
3020
+ "notes": "MISSING - no evaluator_name"
3021
+ },
3022
+ "med_concepts_qa": {
3023
+ "lm_eval": {
3024
+ "output_type": "multiple_choice",
3025
+ "metric": "acc"
3026
+ },
3027
+ "wisent": {
3028
+ "evaluator": "log_likelihoods"
3029
+ },
3030
+ "match": true,
3031
+ "notes": "MATCH"
3032
+ },
3033
+ "meddialog": {
3034
+ "lm_eval": {
3035
+ "output_type": "generate_until",
3036
+ "metric": ""
3037
+ },
3038
+ "wisent": {
3039
+ "evaluator": null
3040
+ },
3041
+ "match": false,
3042
+ "notes": "MISSING"
3043
+ },
3044
+ "mediqa_qa2019": {
3045
+ "lm_eval": {
3046
+ "output_type": "generate_until",
3047
+ "metric": ""
3048
+ },
3049
+ "wisent": {
3050
+ "evaluator": "generation"
3051
+ },
3052
+ "match": true,
3053
+ "notes": "MATCH"
3054
+ },
3055
+ "medqa": {
3056
+ "lm_eval": {
3057
+ "output_type": "unknown",
3058
+ "metric": ""
3059
+ },
3060
+ "wisent": {
3061
+ "evaluator": null
3062
+ },
3063
+ "match": false,
3064
+ "notes": "MISSING"
3065
+ },
3066
+ "medtext": {
3067
+ "lm_eval": {
3068
+ "output_type": "generate_until",
3069
+ "metric": ""
3070
+ },
3071
+ "wisent": {
3072
+ "evaluator": "generation"
3073
+ },
3074
+ "match": true,
3075
+ "notes": "MATCH"
3076
+ },
3077
+ "meqsum": {
3078
+ "lm_eval": {
3079
+ "output_type": "generate_until",
3080
+ "metric": ""
3081
+ },
3082
+ "wisent": {
3083
+ "evaluator": "generation"
3084
+ },
3085
+ "match": true,
3086
+ "notes": "MATCH"
3087
+ },
3088
+ "metabench": {
3089
+ "lm_eval": {
3090
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/metabench/metabench_hellaswag.yaml",
3091
+ "output_type": "multiple_choice",
3092
+ "metric": "acc",
3093
+ "line_references": {
3094
+ "output_type": 1,
3095
+ "metric": 2
3096
+ }
3097
+ },
3098
+ "wisent": {
3099
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py",
3100
+ "evaluator": null,
3101
+ "line_references": {
3102
+ "evaluator": null
3103
+ }
3104
+ },
3105
+ "match": false,
3106
+ "notes": "MISSING - no evaluator_name defined for multiple_choice task"
3107
+ },
3108
+ "mimic_repsum": {
3109
+ "lm_eval": {
3110
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mimic_repsum/mimic_repsum.yaml",
3111
+ "output_type": "generate_until",
3112
+ "metric": "bleu",
3113
+ "line_references": {
3114
+ "output_type": 1,
3115
+ "metric": 3
3116
+ }
3117
+ },
3118
+ "wisent": {
3119
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py",
3120
+ "evaluator": "generation",
3121
+ "line_references": {
3122
+ "evaluator": 18
3123
+ }
3124
+ },
3125
+ "match": true,
3126
+ "notes": "MATCH - generate_until with generation evaluator"
3127
+ },
3128
+ "mmlu-pro-plus": {
3129
+ "lm_eval": {
3130
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu-pro-plus/_default_template_yaml",
3131
+ "output_type": "generate_until",
3132
+ "metric": "exact_match",
3133
+ "line_references": {
3134
+ "output_type": 1,
3135
+ "metric": 3
3136
+ }
3137
+ },
3138
+ "wisent": {
3139
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py",
3140
+ "evaluator": null,
3141
+ "line_references": {
3142
+ "evaluator": null
3143
+ }
3144
+ },
3145
+ "match": false,
3146
+ "notes": "MISSING - no evaluator_name defined for generate_until task"
3147
+ },
3148
+ "mmlu_prox": {
3149
+ "lm_eval": {
3150
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlu_prox/sw/mmlu_prox_sw_computer_science.yaml",
3151
+ "output_type": "multiple_choice",
3152
+ "metric": "acc",
3153
+ "line_references": {
3154
+ "output_type": 1,
3155
+ "metric": 2
3156
+ }
3157
+ },
3158
+ "wisent": {
3159
+ "file": null,
3160
+ "evaluator": null,
3161
+ "line_references": {
3162
+ "evaluator": null
3163
+ }
3164
+ },
3165
+ "match": false,
3166
+ "notes": "MISSING - no Wisent extractor file exists for mmlu_prox"
3167
+ },
3168
+ "mmlusr": {
3169
+ "lm_eval": {
3170
+ "file": "/opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/lm_eval/tasks/mmlusr/question_and_answer/_mmlusr_qna_yml",
3171
+ "output_type": "multiple_choice",
3172
+ "metric": "acc",
3173
+ "line_references": {
3174
+ "output_type": 1,
3175
+ "metric": 2
3176
+ }
3177
+ },
3178
+ "wisent": {
3179
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py",
3180
+ "evaluator": "log_likelihoods",
3181
+ "line_references": {
3182
+ "evaluator": 18
3183
+ }
3184
+ },
3185
+ "match": true,
3186
+ "notes": "MATCH - multiple_choice with log_likelihoods evaluator"
3187
+ },
3188
+ "mts_dialog": {
3189
+ "lm_eval": {
3190
+ "output_type": "generate_until",
3191
+ "metric": "bleu"
3192
+ },
3193
+ "wisent": {
3194
+ "evaluator": "generation",
3195
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py"
3196
+ },
3197
+ "match": true,
3198
+ "notes": "MATCH"
3199
+ },
3200
+ "multiblimp": {
3201
+ "lm_eval": {
3202
+ "output_type": "multiple_choice",
3203
+ "metric": "acc"
3204
+ },
3205
+ "wisent": {
3206
+ "evaluator": "log_likelihoods",
3207
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py"
3208
+ },
3209
+ "match": true,
3210
+ "notes": "MATCH"
3211
+ },
3212
+ "noreval": {
3213
+ "lm_eval": {
3214
+ "output_type": "generate_until",
3215
+ "metric": "bleu"
3216
+ },
3217
+ "wisent": {
3218
+ "evaluator": "log_likelihoods",
3219
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py"
3220
+ },
3221
+ "match": false,
3222
+ "notes": "MISMATCH"
3223
+ },
3224
+ "okapi/arc_multilingual": {
3225
+ "lm_eval": {
3226
+ "output_type": "multiple_choice",
3227
+ "metric": "acc"
3228
+ },
3229
+ "wisent": {
3230
+ "evaluator": "log_likelihoods",
3231
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py"
3232
+ },
3233
+ "match": true,
3234
+ "notes": "MATCH"
3235
+ },
3236
+ "okapi/hellaswag_multilingual": {
3237
+ "lm_eval": {
3238
+ "output_type": "multiple_choice",
3239
+ "metric": "acc"
3240
+ },
3241
+ "wisent": {
3242
+ "evaluator": "log_likelihoods",
3243
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py"
3244
+ },
3245
+ "match": true,
3246
+ "notes": "MATCH"
3247
+ },
3248
+ "olaph": {
3249
+ "lm_eval": {
3250
+ "output_type": "generate_until",
3251
+ "metric": "bleu"
3252
+ },
3253
+ "wisent": {
3254
+ "evaluator": "generation",
3255
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py"
3256
+ },
3257
+ "match": true,
3258
+ "notes": "MATCH"
3259
+ },
3260
+ "okapi/mmlu_multilingual": {
3261
+ "lm_eval": {
3262
+ "output_type": "multiple_choice",
3263
+ "metric": "acc"
3264
+ },
3265
+ "wisent": {
3266
+ "evaluator": "log_likelihoods",
3267
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py"
3268
+ },
3269
+ "match": true,
3270
+ "notes": "MATCH"
3271
+ },
3272
+ "okapi/truthfulqa_multilingual": {
3273
+ "lm_eval": {
3274
+ "output_type": "multiple_choice",
3275
+ "metric": "acc"
3276
+ },
3277
+ "wisent": {
3278
+ "evaluator": "log_likelihoods",
3279
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py"
3280
+ },
3281
+ "match": true,
3282
+ "notes": "MATCH"
3283
+ },
3284
+ "paws-x": {
3285
+ "lm_eval": {
3286
+ "output_type": "multiple_choice",
3287
+ "metric": "acc"
3288
+ },
3289
+ "wisent": {
3290
+ "evaluator": "log_likelihoods",
3291
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py"
3292
+ },
3293
+ "match": true,
3294
+ "notes": "MATCH"
3295
+ },
3296
+ "pile_10k": {
3297
+ "lm_eval": {
3298
+ "output_type": "loglikelihood_rolling",
3299
+ "metric": "word_perplexity"
3300
+ },
3301
+ "wisent": {
3302
+ "evaluator": "generation",
3303
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py"
3304
+ },
3305
+ "match": false,
3306
+ "notes": "MISMATCH"
3307
+ },
3308
+ "portuguese_bench": {
3309
+ "lm_eval": {
3310
+ "output_type": "multiple_choice",
3311
+ "metric": "acc"
3312
+ },
3313
+ "wisent": {
3314
+ "evaluator": "log_likelihoods",
3315
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py"
3316
+ },
3317
+ "match": true,
3318
+ "notes": "MATCH"
3319
+ },
3320
+ "prost": {
3321
+ "lm_eval": {
3322
+ "output_type": "multiple_choice",
3323
+ "metric": "acc"
3324
+ },
3325
+ "wisent": {
3326
+ "evaluator": null,
3327
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py"
3328
+ },
3329
+ "match": false,
3330
+ "notes": "MISSING"
3331
+ },
3332
+ "score": {
3333
+ "lm_eval": {
3334
+ "output_type": "multiple_choice",
3335
+ "metric": "acc"
3336
+ },
3337
+ "wisent": {
3338
+ "evaluator": "log_likelihoods",
3339
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py"
3340
+ },
3341
+ "match": true,
3342
+ "notes": "MATCH"
3343
+ },
3344
+ "simple_cooccurrence_bias": {
3345
+ "lm_eval": {
3346
+ "output_type": "multiple_choice",
3347
+ "metric": "acc"
3348
+ },
3349
+ "wisent": {
3350
+ "evaluator": null,
3351
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py"
3352
+ },
3353
+ "match": false,
3354
+ "notes": "MISSING"
3355
+ },
3356
+ "spanish_bench": {
3357
+ "lm_eval": {
3358
+ "output_type": "multiple_choice",
3359
+ "metric": "acc"
3360
+ },
3361
+ "wisent": {
3362
+ "evaluator": "log_likelihoods",
3363
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py"
3364
+ },
3365
+ "match": true,
3366
+ "notes": "MATCH"
3367
+ },
3368
+ "squadv2": {
3369
+ "lm_eval": {
3370
+ "output_type": "generate_until",
3371
+ "metric": "exact"
3372
+ },
3373
+ "wisent": {
3374
+ "evaluator": null,
3375
+ "file": null
3376
+ },
3377
+ "match": false,
3378
+ "notes": "MISSING"
3379
+ },
3380
+ "swde": {
3381
+ "lm_eval": {
3382
+ "output_type": "generate_until",
3383
+ "metric": "exact_match"
3384
+ },
3385
+ "wisent": {
3386
+ "evaluator": null,
3387
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py"
3388
+ },
3389
+ "match": false,
3390
+ "notes": "MISSING"
3391
+ },
3392
+ "tinyBenchmarks": {
3393
+ "lm_eval": {
3394
+ "output_type": "multiple_choice",
3395
+ "metric": "acc"
3396
+ },
3397
+ "wisent": {
3398
+ "evaluator": "log_likelihoods",
3399
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyBenchmarks.py"
3400
+ },
3401
+ "match": true,
3402
+ "notes": "MATCH"
3403
+ },
3404
+ "truthfulqa": {
3405
+ "lm_eval": {
3406
+ "output_type": "multiple_choice",
3407
+ "metric": "acc"
3408
+ },
3409
+ "wisent": {
3410
+ "evaluator": "log_likelihoods\" # Mixed, but defaulting to log_likelihoods",
3411
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py"
3412
+ },
3413
+ "match": false,
3414
+ "notes": "MISMATCH"
3415
+ },
3416
+ "truthfulqa-multi": {
3417
+ "lm_eval": {
3418
+ "output_type": "generate_until",
3419
+ "metric": "bleu"
3420
+ },
3421
+ "wisent": {
3422
+ "evaluator": "mixed\" # Special marker for mixed tasks",
3423
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py"
3424
+ },
3425
+ "match": false,
3426
+ "notes": "MISMATCH"
3427
+ },
3428
+ "unitxt": {
3429
+ "lm_eval": {
3430
+ "output_type": "generate_until",
3431
+ "metric": "exact_match"
3432
+ },
3433
+ "wisent": {
3434
+ "evaluator": "generation",
3435
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py"
3436
+ },
3437
+ "match": true,
3438
+ "notes": "MATCH"
3439
+ },
3440
+ "wmt2016": {
3441
+ "lm_eval": {
3442
+ "output_type": "generate_until",
3443
+ "metric": "bleu"
3444
+ },
3445
+ "wisent": {
3446
+ "evaluator": null,
3447
+ "file": null
3448
+ },
3449
+ "match": false,
3450
+ "notes": "MISSING"
3451
+ },
3452
+ "xnli_eu": {
3453
+ "lm_eval": {
3454
+ "output_type": "multiple_choice",
3455
+ "metric": "acc"
3456
+ },
3457
+ "wisent": {
3458
+ "evaluator": null,
3459
+ "file": null
3460
+ },
3461
+ "match": false,
3462
+ "notes": "MISSING"
3463
+ },
3464
+ "xquad": {
3465
+ "lm_eval": {
3466
+ "output_type": "generate_until",
3467
+ "metric": "exact_match"
3468
+ },
3469
+ "wisent": {
3470
+ "evaluator": "generation",
3471
+ "file": "/Users/lukaszbartoszcze/Documents/CodingProjects/Wisent/backends/wisent-open-source/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py"
3472
+ },
3473
+ "match": true,
3474
+ "notes": "MATCH"
3475
+ }
3476
+ }
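The report above maps each lm-eval task's output_type and metric to the evaluator declared in the corresponding Wisent extractor. As a minimal sketch of how the mismatched and missing entries could be summarised (the filename and the assumption that the task mapping is the top-level JSON object are hypothetical, not confirmed by this diff):

import json

# Minimal sketch - assumptions, not part of the package:
# the report is saved as "evaluator_verification.json" and the mapping of
# task names to entries shown above is the top-level JSON object (in the
# full file it may be nested one level deeper).
with open("evaluator_verification.json") as fh:
    report = json.load(fh)

# Collect every task whose lm-eval output type and Wisent evaluator disagree
# (match: false), then print a one-line summary per task.
mismatches = {
    task: entry
    for task, entry in report.items()
    if isinstance(entry, dict) and not entry.get("match")
}
for task, entry in sorted(mismatches.items()):
    lm = entry.get("lm_eval", {})
    wisent = entry.get("wisent", {})
    print(f"{task}: lm-eval {lm.get('output_type')}/{lm.get('metric')} "
          f"vs wisent evaluator {wisent.get('evaluator')} -- {entry.get('notes', '')}")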