wisent 0.7.379__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (1720)
  1. wisent/__init__.py +64 -0
  2. wisent/cli.py +114 -0
  3. wisent/core/__init__.py +40 -0
  4. wisent/core/activations/__init__.py +26 -0
  5. wisent/core/activations/activations.py +97 -0
  6. wisent/core/activations/activations_collector.py +506 -0
  7. wisent/core/activations/core/__init__.py +0 -0
  8. wisent/core/activations/core/atoms.py +219 -0
  9. wisent/core/activations/prompt_construction_strategy.py +47 -0
  10. wisent/core/adapters/__init__.py +22 -0
  11. wisent/core/adapters/audio.py +616 -0
  12. wisent/core/adapters/base.py +420 -0
  13. wisent/core/adapters/multimodal.py +738 -0
  14. wisent/core/adapters/robotics.py +643 -0
  15. wisent/core/adapters/text.py +441 -0
  16. wisent/core/adapters/video.py +555 -0
  17. wisent/core/agent/__init__.py +1 -0
  18. wisent/core/agent/budget.py +644 -0
  19. wisent/core/agent/device_benchmarks.py +691 -0
  20. wisent/core/agent/diagnose/__init__.py +1 -0
  21. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  22. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  23. wisent/core/agent/diagnose/create_classifier.py +1155 -0
  24. wisent/core/agent/diagnose/response_diagnostics.py +273 -0
  25. wisent/core/agent/diagnose/select_classifiers.py +507 -0
  26. wisent/core/agent/diagnose/synthetic_classifier_option.py +755 -0
  27. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  28. wisent/core/agent/diagnose/tasks/task_manager.py +1453 -0
  29. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  30. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  31. wisent/core/agent/diagnose.py +249 -0
  32. wisent/core/agent/steer.py +215 -0
  33. wisent/core/agent/timeout.py +134 -0
  34. wisent/core/autonomous_agent.py +1158 -0
  35. wisent/core/benchmark_extractors.py +372 -0
  36. wisent/core/benchmark_registry.py +151 -0
  37. wisent/core/bigcode_extractors.py +26 -0
  38. wisent/core/bigcode_integration.py +886 -0
  39. wisent/core/branding.py +108 -0
  40. wisent/core/classifier/__init__.py +1 -0
  41. wisent/core/classifier/models/__init__.py +1 -0
  42. wisent/core/classifiers/__init__.py +1 -0
  43. wisent/core/classifiers/classifiers/__init__.py +0 -0
  44. wisent/core/classifiers/classifiers/core/__init__.py +0 -0
  45. wisent/core/classifiers/classifiers/core/atoms.py +748 -0
  46. wisent/core/classifiers/classifiers/models/__init__.py +0 -0
  47. wisent/core/classifiers/classifiers/models/logistic.py +29 -0
  48. wisent/core/classifiers/classifiers/models/mlp.py +47 -0
  49. wisent/core/classifiers/classifiers/rotator.py +137 -0
  50. wisent/core/classifiers/core/__init__.py +1 -0
  51. wisent/core/classifiers/models/__init__.py +1 -0
  52. wisent/core/classifiers/pipeline_steps/__init__.py +1 -0
  53. wisent/core/cli/__init__.py +26 -0
  54. wisent/core/cli/agent/__init__.py +15 -0
  55. wisent/core/cli/agent/apply_steering.py +192 -0
  56. wisent/core/cli/agent/evaluate_response.py +128 -0
  57. wisent/core/cli/agent/generate_synthetic_pairs.py +123 -0
  58. wisent/core/cli/agent/main.py +139 -0
  59. wisent/core/cli/agent/train_classifier.py +173 -0
  60. wisent/core/cli/check_linearity.py +126 -0
  61. wisent/core/cli/create_steering_vector.py +304 -0
  62. wisent/core/cli/diagnose_pairs.py +153 -0
  63. wisent/core/cli/diagnose_vectors.py +404 -0
  64. wisent/core/cli/estimate_unified_goodness_time.py +428 -0
  65. wisent/core/cli/evaluate_refusal.py +241 -0
  66. wisent/core/cli/evaluate_responses.py +926 -0
  67. wisent/core/cli/generate_humanization_pairs.py +128 -0
  68. wisent/core/cli/generate_pairs.py +175 -0
  69. wisent/core/cli/generate_pairs_from_task.py +108 -0
  70. wisent/core/cli/generate_responses.py +160 -0
  71. wisent/core/cli/generate_vector_from_synthetic.py +217 -0
  72. wisent/core/cli/generate_vector_from_task.py +248 -0
  73. wisent/core/cli/get_activations.py +192 -0
  74. wisent/core/cli/inference_config.py +84 -0
  75. wisent/core/cli/inference_config_cli.py +54 -0
  76. wisent/core/cli/modify_weights.py +660 -0
  77. wisent/core/cli/multi_steer.py +112 -0
  78. wisent/core/cli/optimization_cache.py +298 -0
  79. wisent/core/cli/optimize.py +621 -0
  80. wisent/core/cli/optimize_classification.py +473 -0
  81. wisent/core/cli/optimize_sample_size.py +390 -0
  82. wisent/core/cli/optimize_steering.py +3421 -0
  83. wisent/core/cli/optimize_weights.py +1287 -0
  84. wisent/core/cli/steering_method_trainer.py +641 -0
  85. wisent/core/cli/steering_search_space.py +508 -0
  86. wisent/core/cli/tasks.py +940 -0
  87. wisent/core/cli/train_unified_goodness.py +681 -0
  88. wisent/core/cli_logger.py +22 -0
  89. wisent/core/config_manager.py +1731 -0
  90. wisent/core/contrastive_pairs/__init__.py +15 -0
  91. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  92. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  93. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  94. wisent/core/contrastive_pairs/core/pair.py +183 -0
  95. wisent/core/contrastive_pairs/core/response.py +153 -0
  96. wisent/core/contrastive_pairs/core/serialization.py +306 -0
  97. wisent/core/contrastive_pairs/core/set.py +192 -0
  98. wisent/core/contrastive_pairs/diagnostics/__init__.py +79 -0
  99. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  100. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  101. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +1655 -0
  102. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  103. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  104. wisent/core/contrastive_pairs/diagnostics/duplicates.py +118 -0
  105. wisent/core/contrastive_pairs/diagnostics/linearity.py +325 -0
  106. wisent/core/contrastive_pairs/diagnostics/vector_quality.py +620 -0
  107. wisent/core/contrastive_pairs/huggingface_pairs/__init__.py +1 -0
  108. wisent/core/contrastive_pairs/huggingface_pairs/atoms.py +255 -0
  109. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_manifest.py +470 -0
  110. wisent/core/contrastive_pairs/huggingface_pairs/hf_extractor_registry.py +136 -0
  111. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/__init__.py +44 -0
  112. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentbench.py +225 -0
  113. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentharm.py +267 -0
  114. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/agentic_search.py +444 -0
  115. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aider_polyglot.py +225 -0
  116. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime.py +118 -0
  117. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2024.py +74 -0
  118. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/aime2025.py +73 -0
  119. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/alpaca_eval.py +153 -0
  120. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/apps.py +182 -0
  121. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/arena_hard.py +179 -0
  122. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/atis.py +89 -0
  123. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/babilong.py +96 -0
  124. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bangla_mmlu.py +108 -0
  125. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/basqueglue.py +217 -0
  126. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bec2016eu.py +99 -0
  127. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bfcl.py +283 -0
  128. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/bhtc_v2.py +87 -0
  129. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/browsecomp.py +245 -0
  130. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chain_of_thought.py +89 -0
  131. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/chinese_simpleqa.py +209 -0
  132. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cluewsc.py +177 -0
  133. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/cnn_dailymail.py +92 -0
  134. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codeforces.py +378 -0
  135. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue.py +109 -0
  136. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text.py +15 -0
  137. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_go.py +64 -0
  138. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_java.py +65 -0
  139. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_javascript.py +65 -0
  140. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_php.py +65 -0
  141. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_python.py +65 -0
  142. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/codexglue_code_to_text_ruby.py +65 -0
  143. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coding_benchmarks.py +844 -0
  144. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/coedit_gec.py +79 -0
  145. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/conala.py +133 -0
  146. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/concode.py +111 -0
  147. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/dbpedia_14.py +91 -0
  148. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/doc_vqa.py +102 -0
  149. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/donotanswer.py +236 -0
  150. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds1000.py +129 -0
  151. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ds_1000.py +155 -0
  152. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/epec_koref_bin.py +85 -0
  153. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ethos_binary.py +82 -0
  154. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_mp.py +165 -0
  155. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/evalita_sp_sum_task_fp_small_p1.py +89 -0
  156. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/facts_grounding.py +181 -0
  157. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/faithbench.py +295 -0
  158. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/financial_tweets.py +100 -0
  159. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flames.py +270 -0
  160. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flan_held_in.py +98 -0
  161. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/flores.py +572 -0
  162. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/frames.py +143 -0
  163. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/freebase.py +99 -0
  164. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_negative_example_livecodebench.py +146 -0
  165. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/get_positive_example_livecodebench.py +140 -0
  166. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/gpt3_translation_benchmarks.py +98 -0
  167. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hallucinations_leaderboard.py +389 -0
  168. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/halueval.py +246 -0
  169. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/harmbench.py +250 -0
  170. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/healthbench.py +181 -0
  171. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hle.py +106 -0
  172. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/hmmt.py +117 -0
  173. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humaneval.py +119 -0
  174. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/humanevalpack.py +102 -0
  175. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instruct_humaneval.py +180 -0
  176. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/instructhumaneval.py +129 -0
  177. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_ar_en.py +98 -0
  178. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/iwslt2017_en_ar.py +98 -0
  179. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/jailbreakbench.py +258 -0
  180. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/law_stack_exchange.py +101 -0
  181. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/ledgar.py +118 -0
  182. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench.py +61 -0
  183. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_contrastive_pair_generator.py +491 -0
  184. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livecodebench_v6.py +263 -0
  185. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/livemathbench.py +230 -0
  186. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/llama.py +96 -0
  187. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/longform_writing.py +285 -0
  188. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/m_mmlu.py +96 -0
  189. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math.py +186 -0
  190. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/math500.py +146 -0
  191. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mbpp.py +142 -0
  192. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/meddialog.py +79 -0
  193. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medical_abstracts.py +101 -0
  194. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/medium_priority_benchmarks.py +787 -0
  195. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mercury.py +111 -0
  196. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlu_redux.py +194 -0
  197. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/mmlusr.py +108 -0
  198. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multimedqa.py +99 -0
  199. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multipl_e.py +109 -0
  200. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple.py +96 -0
  201. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_choice.py +87 -0
  202. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_cpp.py +128 -0
  203. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_go.py +128 -0
  204. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_java.py +128 -0
  205. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_js.py +128 -0
  206. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_py.py +15 -0
  207. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/multiple_rs.py +128 -0
  208. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/non_greedy_robustness_agieval_aqua_rat.py +92 -0
  209. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/olympiadbench.py +287 -0
  210. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/openllm.py +99 -0
  211. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/option_order_robustness_agieval_aqua_rat.py +92 -0
  212. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/or_bench.py +300 -0
  213. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/penn_treebank.py +80 -0
  214. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/planbench.py +317 -0
  215. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/polymath.py +467 -0
  216. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/prompt_robustness_agieval_aqua_rat.py +92 -0
  217. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/pythia.py +99 -0
  218. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/recode.py +131 -0
  219. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/refusalbench.py +280 -0
  220. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/scicode.py +275 -0
  221. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/self_consistency.py +90 -0
  222. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/simpleqa.py +145 -0
  223. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sorry_bench.py +211 -0
  224. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/stsb.py +79 -0
  225. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1.py +99 -0
  226. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_lm_eval_v1_seq2seq.py +98 -0
  227. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_glue_t5_prompt.py +123 -0
  228. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/super_gpqa.py +106 -0
  229. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench.py +428 -0
  230. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/swe_bench_verified.py +158 -0
  231. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/sycophancy_eval.py +205 -0
  232. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/t0_eval.py +79 -0
  233. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tag.py +98 -0
  234. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tau_bench.py +305 -0
  235. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/tmlu.py +109 -0
  236. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolbench.py +360 -0
  237. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/toolemu.py +386 -0
  238. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/travelplanner.py +286 -0
  239. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/truthfulqa_generation.py +128 -0
  240. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/unfair_tos.py +83 -0
  241. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/vaxx_stance.py +86 -0
  242. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wiceu.py +85 -0
  243. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wikitext103.py +97 -0
  244. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wildguard.py +280 -0
  245. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_en_fr.py +97 -0
  246. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt14_fr_en.py +97 -0
  247. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_de_en.py +90 -0
  248. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_de.py +90 -0
  249. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_en_ro.py +90 -0
  250. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt16_ro_en.py +90 -0
  251. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/wmt_ro_en_t5_prompt.py +90 -0
  252. wisent/core/contrastive_pairs/huggingface_pairs/hf_task_extractors/xsum.py +81 -0
  253. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  254. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +265 -0
  255. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/__init__.py +472 -0
  256. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aclue.py +24 -0
  257. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acp.py +33 -0
  258. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/acpbench.py +39 -0
  259. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/advanced_ai_risk.py +59 -0
  260. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aexams.py +14 -0
  261. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimgsm.py +10 -0
  262. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrimmlu.py +10 -0
  263. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrixnli.py +9 -0
  264. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench.py +14 -0
  265. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_adr.py +9 -0
  266. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afriqa.py +9 -0
  267. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_afrisenti.py +9 -0
  268. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_belebele.py +9 -0
  269. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_flores.py +9 -0
  270. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_injongointent.py +9 -0
  271. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_mafand.py +9 -0
  272. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhaner.py +9 -0
  273. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhanews.py +9 -0
  274. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_masakhapos.py +9 -0
  275. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_naijarc.py +9 -0
  276. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_nollysenti.py +9 -0
  277. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_ntrex.py +9 -0
  278. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_openai_mmlu.py +9 -0
  279. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_salt.py +9 -0
  280. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_sib.py +9 -0
  281. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_uhura_arc_easy.py +9 -0
  282. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/afrobench_xlsum.py +9 -0
  283. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/agieval.py +33 -0
  284. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/anli.py +9 -0
  285. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arab_culture.py +24 -0
  286. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva.py +67 -0
  287. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_acva_light.py +67 -0
  288. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_complete.py +24 -0
  289. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabic_leaderboard_light.py +81 -0
  290. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arabicmmlu.py +59 -0
  291. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/aradice.py +36 -0
  292. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arc.py +61 -0
  293. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/arithmetic.py +19 -0
  294. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/basque_bench.py +37 -0
  295. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbh.py +121 -0
  296. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bbq.py +9 -0
  297. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/belebele.py +293 -0
  298. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bertaqa.py +25 -0
  299. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/bigbench.py +300 -0
  300. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/blimp.py +76 -0
  301. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/careqa.py +9 -0
  302. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/catalan_bench.py +43 -0
  303. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ceval_valid.py +61 -0
  304. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/cmmlu.py +76 -0
  305. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/code_x_glue.py +16 -0
  306. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/copal_id.py +11 -0
  307. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/crows_pairs.py +31 -0
  308. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/csatqa.py +15 -0
  309. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darija.py +29 -0
  310. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/darijammlu.py +57 -0
  311. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/egymmlu.py +62 -0
  312. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/eus.py +76 -0
  313. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/evalita_mp.py +93 -0
  314. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/fld.py +9 -0
  315. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/flores.py +466 -0
  316. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/freebase.py +9 -0
  317. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/french_bench.py +23 -0
  318. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/galician_bench.py +41 -0
  319. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/glianorex.py +11 -0
  320. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/global_mmlu.py +115 -0
  321. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gpqa.py +27 -0
  322. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k.py +9 -0
  323. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/gsm8k_platinum.py +9 -0
  324. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/haerae.py +14 -0
  325. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/headqa.py +11 -0
  326. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hellaswag.py +39 -0
  327. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_ethics.py +14 -0
  328. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hendrycks_math.py +9 -0
  329. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/hrm8k.py +20 -0
  330. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/inverse.py +22 -0
  331. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/japanese_leaderboard.py +20 -0
  332. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/jsonschema_bench.py +9 -0
  333. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kbl.py +85 -0
  334. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kmmlu.py +281 -0
  335. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kobest.py +14 -0
  336. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/kormedmcqa.py +9 -0
  337. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lambada.py +28 -0
  338. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/leaderboard.py +52 -0
  339. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/libra.py +9 -0
  340. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/lingoly.py +11 -0
  341. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/longbench.py +9 -0
  342. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/m.py +43 -0
  343. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mastermind.py +9 -0
  344. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mathqa.py +9 -0
  345. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/med.py +24 -0
  346. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/meddialog.py +12 -0
  347. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/medqa.py +9 -0
  348. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mela.py +18 -0
  349. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/metabench.py +36 -0
  350. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mgsm.py +44 -0
  351. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/minerva_math.py +16 -0
  352. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mlqa.py +58 -0
  353. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu.py +70 -0
  354. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro.py +23 -0
  355. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_pro_plus.py +23 -0
  356. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlu_prox.py +191 -0
  357. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmlusr.py +9 -0
  358. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/mmmu.py +46 -0
  359. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/model_written_evals.py +9 -0
  360. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/multiblimp.py +111 -0
  361. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/non.py +23 -0
  362. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noreval.py +143 -0
  363. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/noridiom.py +20 -0
  364. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nortruthfulqa.py +32 -0
  365. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/nrk.py +20 -0
  366. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi.py +9 -0
  367. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_arc_multilingual.py +10 -0
  368. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_hellaswag_multilingual.py +24 -0
  369. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_mmlu_multilingual.py +24 -0
  370. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/okapi_truthfulqa_multilingual.py +34 -0
  371. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/paloma.py +25 -0
  372. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pawsx.py +9 -0
  373. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/persona.py +144 -0
  374. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/pile.py +31 -0
  375. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/polemo2.py +9 -0
  376. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/portuguese_bench.py +31 -0
  377. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/prompt.py +23 -0
  378. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qa4mre.py +12 -0
  379. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/qasper.py +11 -0
  380. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ru.py +19 -0
  381. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/ruler.py +9 -0
  382. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/score.py +20 -0
  383. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/scrolls.py +9 -0
  384. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/self_consistency.py +11 -0
  385. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/spanish_bench.py +38 -0
  386. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/storycloze.py +9 -0
  387. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/super_glue_t5_prompt.py +17 -0
  388. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tinyBenchmarks.py +9 -0
  389. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmlu.py +9 -0
  390. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/tmmluplus.py +80 -0
  391. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/translation.py +9 -0
  392. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa.py +76 -0
  393. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/truthfulqa_multi.py +24 -0
  394. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/turkishmmlu.py +30 -0
  395. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unitxt.py +23 -0
  396. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/unscramble.py +9 -0
  397. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/winogender.py +16 -0
  398. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmdp.py +12 -0
  399. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt14.py +16 -0
  400. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wmt16.py +22 -0
  401. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/wsc273.py +9 -0
  402. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xcopa.py +21 -0
  403. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli.py +28 -0
  404. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xnli_eu.py +12 -0
  405. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xquad.py +22 -0
  406. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xstorycloze.py +22 -0
  407. wisent/core/contrastive_pairs/lm_eval_pairs/group_task_manifests/xwinograd.py +15 -0
  408. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +478 -0
  409. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +140 -0
  410. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +125 -0
  411. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aclue.py +171 -0
  412. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench.py +207 -0
  413. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/acp_bench_hard.py +185 -0
  414. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/advanced.py +130 -0
  415. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aexams.py +184 -0
  416. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimgsm.py +98 -0
  417. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrimmlu.py +113 -0
  418. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrixnli.py +129 -0
  419. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_cot.py +88 -0
  420. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/afrobench_mc.py +107 -0
  421. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ag.py +134 -0
  422. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/agieval.py +155 -0
  423. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ai2_arc.py +114 -0
  424. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams1.py +81 -0
  425. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anagrams2.py +81 -0
  426. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/anli.py +140 -0
  427. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabculture.py +180 -0
  428. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic.py +98 -0
  429. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_exams.py +104 -0
  430. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_complete.py +168 -0
  431. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabic_leaderboard_light.py +168 -0
  432. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arabicmmlu.py +167 -0
  433. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/aradice.py +268 -0
  434. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc.py +133 -0
  435. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +118 -0
  436. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +118 -0
  437. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_gen.py +101 -0
  438. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_mc.py +106 -0
  439. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/argument.py +134 -0
  440. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +114 -0
  441. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +122 -0
  442. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/assin.py +103 -0
  443. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/babi.py +113 -0
  444. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench.py +155 -0
  445. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_gen.py +168 -0
  446. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/basque_bench_mc.py +139 -0
  447. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbh.py +133 -0
  448. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bbq.py +169 -0
  449. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/belebele.py +181 -0
  450. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/benchmarks.py +155 -0
  451. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bertaqa.py +165 -0
  452. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhs.py +155 -0
  453. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bhtc.py +143 -0
  454. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/bigbench.py +170 -0
  455. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp.py +171 -0
  456. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/blimp_nl.py +152 -0
  457. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +117 -0
  458. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq_seq2seq.py +117 -0
  459. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/c4.py +150 -0
  460. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabbq.py +152 -0
  461. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cabreu.py +127 -0
  462. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/careqa.py +169 -0
  463. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench.py +155 -0
  464. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_gen.py +119 -0
  465. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalan_bench_mc.py +113 -0
  466. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catalanqa.py +171 -0
  467. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/catcola.py +139 -0
  468. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +117 -0
  469. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval.py +223 -0
  470. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ceval_valid.py +163 -0
  471. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chain.py +110 -0
  472. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/chartqa.py +238 -0
  473. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/claim.py +151 -0
  474. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/click.py +152 -0
  475. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cmmlu.py +166 -0
  476. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cnn.py +144 -0
  477. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cocoteros.py +148 -0
  478. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code2text.py +161 -0
  479. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/code_x_glue.py +114 -0
  480. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/codexglue.py +107 -0
  481. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coedit.py +149 -0
  482. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cola.py +83 -0
  483. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense.py +107 -0
  484. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/commonsense_qa.py +127 -0
  485. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +124 -0
  486. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copal_id.py +169 -0
  487. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +162 -0
  488. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqcat.py +114 -0
  489. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/crows_pairs.py +158 -0
  490. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/csatqa.py +152 -0
  491. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle.py +107 -0
  492. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cycle_letters.py +81 -0
  493. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darija_bench.py +221 -0
  494. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijahellaswag.py +174 -0
  495. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/darijammlu.py +152 -0
  496. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/dbpedia.py +157 -0
  497. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/discrim_eval.py +152 -0
  498. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/doc.py +107 -0
  499. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  500. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egyhellaswag.py +125 -0
  501. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/egymmlu.py +180 -0
  502. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/epec.py +142 -0
  503. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq.py +107 -0
  504. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench.py +194 -0
  505. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_ca.py +152 -0
  506. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eq_bench_es.py +152 -0
  507. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/esbbq.py +152 -0
  508. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/escola.py +85 -0
  509. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethics.py +135 -0
  510. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ethos.py +99 -0
  511. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus.py +107 -0
  512. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_exams.py +225 -0
  513. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_proficiency.py +159 -0
  514. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_reading.py +159 -0
  515. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/eus_trivia.py +159 -0
  516. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_llm.py +166 -0
  517. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/evalita_sp.py +109 -0
  518. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fda.py +105 -0
  519. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/financial.py +107 -0
  520. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/flan.py +114 -0
  521. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/fld.py +143 -0
  522. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench.py +202 -0
  523. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_mc.py +98 -0
  524. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/french_bench_perplexity.py +86 -0
  525. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galcola.py +109 -0
  526. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench.py +155 -0
  527. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_gen.py +118 -0
  528. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/galician_bench_mc.py +112 -0
  529. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gaokao.py +141 -0
  530. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glianorex.py +118 -0
  531. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_mmlu.py +171 -0
  532. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/global_piqa.py +152 -0
  533. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/glue.py +109 -0
  534. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpqa.py +161 -0
  535. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gpt3.py +110 -0
  536. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/groundcocoa.py +184 -0
  537. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm.py +108 -0
  538. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +134 -0
  539. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/haerae.py +152 -0
  540. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  541. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +125 -0
  542. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_ethics.py +225 -0
  543. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hendrycks_math.py +191 -0
  544. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/histoires_morales.py +179 -0
  545. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hle.py +111 -0
  546. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hrm8k.py +203 -0
  547. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval.py +124 -0
  548. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/humaneval_infilling.py +152 -0
  549. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/icelandic_winogrande.py +152 -0
  550. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ifeval.py +118 -0
  551. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse.py +107 -0
  552. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/inverse_scaling.py +192 -0
  553. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/iwslt2017.py +117 -0
  554. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ja.py +107 -0
  555. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard.py +155 -0
  556. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_gen.py +224 -0
  557. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/japanese_leaderboard_mc.py +120 -0
  558. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/jsonschema_bench.py +123 -0
  559. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kbl.py +140 -0
  560. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu.py +168 -0
  561. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_cot.py +88 -0
  562. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kmmlu_mc.py +107 -0
  563. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kobest.py +165 -0
  564. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/kormedmcqa.py +160 -0
  565. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada.py +147 -0
  566. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_cloze.py +185 -0
  567. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual.py +185 -0
  568. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lambada_multilingual_stablelm.py +141 -0
  569. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/law.py +107 -0
  570. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/leaderboard.py +194 -0
  571. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/libra.py +165 -0
  572. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lingoly.py +203 -0
  573. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livemathbench.py +155 -0
  574. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/llama3.py +152 -0
  575. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/lm_syneval.py +152 -0
  576. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logieval.py +82 -0
  577. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  578. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  579. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbench.py +152 -0
  580. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/longbenchv2.py +152 -0
  581. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mastermind.py +203 -0
  582. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mathqa.py +137 -0
  583. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mbpp.py +123 -0
  584. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +115 -0
  585. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/med_concepts_qa.py +224 -0
  586. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meddialog.py +180 -0
  587. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medical.py +107 -0
  588. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mediqa_qa2019.py +123 -0
  589. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medmcqa.py +169 -0
  590. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +118 -0
  591. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medtext.py +108 -0
  592. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mela.py +96 -0
  593. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/meqsum.py +115 -0
  594. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/metabench.py +154 -0
  595. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mgsm.py +122 -0
  596. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mimic_repsum.py +140 -0
  597. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/minerva_math.py +172 -0
  598. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mlqa.py +143 -0
  599. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu.py +144 -0
  600. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_cot.py +88 -0
  601. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_mc.py +107 -0
  602. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlu_pro.py +145 -0
  603. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmlusr.py +189 -0
  604. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mmmu.py +150 -0
  605. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mnli.py +113 -0
  606. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/model_written_evals.py +115 -0
  607. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/moral_stories.py +151 -0
  608. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  609. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog.py +118 -0
  610. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mts_dialog_perplexity.py +97 -0
  611. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multiblimp.py +134 -0
  612. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multilingual.py +106 -0
  613. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  614. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  615. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/non.py +107 -0
  616. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval.py +173 -0
  617. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_exact.py +157 -0
  618. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen.py +277 -0
  619. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_gen_exact.py +165 -0
  620. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc.py +228 -0
  621. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noreval_mc_log_likelihoods.py +223 -0
  622. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/noticia.py +105 -0
  623. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/nq_open.py +135 -0
  624. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi.py +27 -0
  625. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_arc_multilingual.py +167 -0
  626. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_hellaswag_multilingual.py +174 -0
  627. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_mmlu_multilingual.py +162 -0
  628. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/okapi_truthfulqa_multilingual.py +209 -0
  629. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph.py +186 -0
  630. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/olaph_perplexity.py +97 -0
  631. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +118 -0
  632. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/option.py +107 -0
  633. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paloma.py +205 -0
  634. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafraseja.py +110 -0
  635. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/parafrases.py +110 -0
  636. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws.py +107 -0
  637. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/paws_x.py +154 -0
  638. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +115 -0
  639. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/persona.py +246 -0
  640. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases.py +144 -0
  641. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/phrases_ca_va.py +82 -0
  642. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile.py +161 -0
  643. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pile_10k.py +140 -0
  644. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +116 -0
  645. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polemo2.py +135 -0
  646. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/polymath.py +155 -0
  647. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench.py +155 -0
  648. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_gen.py +121 -0
  649. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/portuguese_bench_mc.py +103 -0
  650. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prompt.py +107 -0
  651. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +115 -0
  652. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  653. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +119 -0
  654. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +118 -0
  655. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper_bool.py +112 -0
  656. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  657. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnlieu.py +107 -0
  658. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  659. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/quac.py +111 -0
  660. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +124 -0
  661. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/random.py +107 -0
  662. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/realtoxicityprompts.py +124 -0
  663. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +125 -0
  664. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/reversed.py +110 -0
  665. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  666. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/ruler.py +170 -0
  667. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +113 -0
  668. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/score.py +177 -0
  669. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls.py +161 -0
  670. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/scrolls_mc.py +157 -0
  671. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/self.py +110 -0
  672. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue.py +131 -0
  673. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sglue_rte.py +119 -0
  674. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/simple_cooccurrence_bias.py +121 -0
  675. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/siqa.py +209 -0
  676. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  677. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench.py +155 -0
  678. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_gen.py +117 -0
  679. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/spanish_bench_mc.py +110 -0
  680. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +129 -0
  681. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad_completion.py +121 -0
  682. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  683. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/storycloze.py +250 -0
  684. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/summarization.py +107 -0
  685. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super.py +107 -0
  686. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/super_glue.py +154 -0
  687. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/superglue.py +111 -0
  688. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/supergpqa.py +111 -0
  689. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +115 -0
  690. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swde.py +179 -0
  691. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sycophancy.py +117 -0
  692. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/t0.py +110 -0
  693. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/teca.py +110 -0
  694. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyarc.py +110 -0
  695. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinybenchmarks.py +155 -0
  696. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinygsm8k.py +110 -0
  697. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinyhellaswag.py +110 -0
  698. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinymmlu.py +110 -0
  699. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinytruthfulqa.py +113 -0
  700. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tinywinogrande.py +110 -0
  701. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/tmmluplus.py +181 -0
  702. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/toxigen.py +91 -0
  703. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/translation.py +149 -0
  704. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +130 -0
  705. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa.py +112 -0
  706. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +120 -0
  707. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +140 -0
  708. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_multi.py +142 -0
  709. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turblimp_core.py +152 -0
  710. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu.py +161 -0
  711. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_cot.py +104 -0
  712. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/turkishmmlu_mc.py +102 -0
  713. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/twenty_newsgroups.py +111 -0
  714. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unitxt.py +131 -0
  715. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/unscramble.py +155 -0
  716. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/vaxx.py +95 -0
  717. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +130 -0
  718. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +122 -0
  719. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wikitext.py +146 -0
  720. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogender.py +139 -0
  721. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +118 -0
  722. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmdp.py +155 -0
  723. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt14.py +110 -0
  724. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wmt16.py +118 -0
  725. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +114 -0
  726. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +117 -0
  727. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc273.py +180 -0
  728. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xcopa.py +197 -0
  729. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xlsum.py +147 -0
  730. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +131 -0
  731. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xquad.py +203 -0
  732. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +129 -0
  733. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +124 -0
  734. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/yahoo.py +108 -0
  735. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/zhoblimp.py +155 -0
  736. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +56 -0
  737. wisent/core/data_loaders/__init__.py +235 -0
  738. wisent/core/data_loaders/core/__init__.py +0 -0
  739. wisent/core/data_loaders/core/atoms.py +99 -0
  740. wisent/core/data_loaders/loaders/__init__.py +0 -0
  741. wisent/core/data_loaders/loaders/custom.py +120 -0
  742. wisent/core/data_loaders/loaders/huggingface_loader.py +153 -0
  743. wisent/core/data_loaders/loaders/lm_loader.py +494 -0
  744. wisent/core/data_loaders/loaders/lm_loader_special_cases.py +496 -0
  745. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  746. wisent/core/data_loaders/rotator.py +118 -0
  747. wisent/core/detection_handling.py +259 -0
  748. wisent/core/diversity_processors.py +193 -0
  749. wisent/core/download_full_benchmarks.py +1512 -0
  750. wisent/core/errors/__init__.py +203 -0
  751. wisent/core/errors/error_codes.py +763 -0
  752. wisent/core/errors/error_handler.py +134 -0
  753. wisent/core/evaluators/__init__.py +0 -0
  754. wisent/core/evaluators/benchmark_specific/__init__.py +42 -0
  755. wisent/core/evaluators/benchmark_specific/aime_evaluator.py +90 -0
  756. wisent/core/evaluators/benchmark_specific/coding/__init__.py +0 -0
  757. wisent/core/evaluators/benchmark_specific/coding/metrics/__init__.py +0 -0
  758. wisent/core/evaluators/benchmark_specific/coding/metrics/core/__init__.py +0 -0
  759. wisent/core/evaluators/benchmark_specific/coding/metrics/core/atoms.py +36 -0
  760. wisent/core/evaluators/benchmark_specific/coding/metrics/evaluator.py +363 -0
  761. wisent/core/evaluators/benchmark_specific/coding/metrics/passk.py +67 -0
  762. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/__init__.py +0 -0
  763. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/__init__.py +0 -0
  764. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/core/atoms.py +27 -0
  765. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  766. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/java_sanitizer.py +78 -0
  767. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/python_sanitizer.py +94 -0
  768. wisent/core/evaluators/benchmark_specific/coding/output_sanitizer/utils.py +126 -0
  769. wisent/core/evaluators/benchmark_specific/coding/providers/__init__.py +18 -0
  770. wisent/core/evaluators/benchmark_specific/coding/providers/core/__init__.py +0 -0
  771. wisent/core/evaluators/benchmark_specific/coding/providers/core/atoms.py +31 -0
  772. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  773. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  774. wisent/core/evaluators/benchmark_specific/coding/safe_docker/Dockerfile +31 -0
  775. wisent/core/evaluators/benchmark_specific/coding/safe_docker/__init__.py +0 -0
  776. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/__init__.py +0 -0
  777. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/atoms.py +105 -0
  778. wisent/core/evaluators/benchmark_specific/coding/safe_docker/core/runtime.py +143 -0
  779. wisent/core/evaluators/benchmark_specific/coding/safe_docker/entrypoint.py +121 -0
  780. wisent/core/evaluators/benchmark_specific/coding/safe_docker/recipes.py +60 -0
  781. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  782. wisent/core/evaluators/benchmark_specific/conala_evaluator.py +332 -0
  783. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +81 -0
  784. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +173 -0
  785. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +488 -0
  786. wisent/core/evaluators/benchmark_specific/livemathbench_evaluator.py +393 -0
  787. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +202 -0
  788. wisent/core/evaluators/benchmark_specific/math_evaluator.py +119 -0
  789. wisent/core/evaluators/benchmark_specific/math_parsing/__init__.py +1 -0
  790. wisent/core/evaluators/benchmark_specific/math_parsing/core.py +1640 -0
  791. wisent/core/evaluators/benchmark_specific/math_parsing/extract_boxed.py +48 -0
  792. wisent/core/evaluators/benchmark_specific/math_parsing/is_equiv.py +159 -0
  793. wisent/core/evaluators/benchmark_specific/math_parsing/scripts.py +919 -0
  794. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +175 -0
  795. wisent/core/evaluators/benchmark_specific/polymath_evaluator.py +114 -0
  796. wisent/core/evaluators/core/__init__.py +5 -0
  797. wisent/core/evaluators/core/atoms.py +166 -0
  798. wisent/core/evaluators/custom/__init__.py +20 -0
  799. wisent/core/evaluators/custom/custom_evaluator.py +382 -0
  800. wisent/core/evaluators/custom/examples/__init__.py +37 -0
  801. wisent/core/evaluators/custom/examples/desklib_detector.py +166 -0
  802. wisent/core/evaluators/custom/examples/gptzero.py +185 -0
  803. wisent/core/evaluators/custom/examples/humanization.py +79 -0
  804. wisent/core/evaluators/custom/examples/humanization_coherent.py +127 -0
  805. wisent/core/evaluators/custom/examples/roberta_detector.py +173 -0
  806. wisent/core/evaluators/oracles/__init__.py +0 -0
  807. wisent/core/evaluators/oracles/interactive.py +73 -0
  808. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  809. wisent/core/evaluators/oracles/truthfulqa_gen_evaluator.py +168 -0
  810. wisent/core/evaluators/oracles/user_specified.py +67 -0
  811. wisent/core/evaluators/personalization/__init__.py +12 -0
  812. wisent/core/evaluators/personalization/alignment.py +166 -0
  813. wisent/core/evaluators/personalization/coherence.py +325 -0
  814. wisent/core/evaluators/personalization/difference.py +73 -0
  815. wisent/core/evaluators/rotator.py +217 -0
  816. wisent/core/evaluators/steering_evaluators.py +386 -0
  817. wisent/core/evaluators/synthetic_evaluator.py +377 -0
  818. wisent/core/hyperparameter_optimizer.py +547 -0
  819. wisent/core/layer.py +17 -0
  820. wisent/core/lm_eval_harness_ground_truth.py +1431 -0
  821. wisent/core/main.py +101 -0
  822. wisent/core/managed_cached_benchmarks.py +609 -0
  823. wisent/core/mixed_benchmark_sampler.py +366 -0
  824. wisent/core/modalities/__init__.py +545 -0
  825. wisent/core/model_persistence.py +302 -0
  826. wisent/core/models/__init__.py +23 -0
  827. wisent/core/models/core/__init__.py +0 -0
  828. wisent/core/models/core/atoms.py +465 -0
  829. wisent/core/models/inference_config.py +127 -0
  830. wisent/core/models/wisent_model.py +893 -0
  831. wisent/core/multi_steering.py +397 -0
  832. wisent/core/opti/__init__.py +0 -0
  833. wisent/core/opti/core/__init__.py +0 -0
  834. wisent/core/opti/core/atoms.py +177 -0
  835. wisent/core/opti/methods/__init__.py +10 -0
  836. wisent/core/opti/methods/opti_classificator.py +172 -0
  837. wisent/core/opti/methods/opti_steering.py +139 -0
  838. wisent/core/opti/methods/opti_weights.py +523 -0
  839. wisent/core/optuna/__init__.py +54 -0
  840. wisent/core/optuna/classifier/__init__.py +25 -0
  841. wisent/core/optuna/classifier/activation_generator.py +351 -0
  842. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  843. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +685 -0
  844. wisent/core/optuna/steering/__init__.py +20 -0
  845. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +200 -0
  846. wisent/core/optuna/steering/data_utils.py +342 -0
  847. wisent/core/optuna/steering/metrics.py +412 -0
  848. wisent/core/optuna/steering/steering_optimization.py +1096 -0
  849. wisent/core/parser.py +1662 -0
  850. wisent/core/parser_arguments/__init__.py +10 -0
  851. wisent/core/parser_arguments/agent_parser.py +122 -0
  852. wisent/core/parser_arguments/check_linearity_parser.py +82 -0
  853. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  854. wisent/core/parser_arguments/create_steering_vector_parser.py +67 -0
  855. wisent/core/parser_arguments/diagnose_pairs_parser.py +25 -0
  856. wisent/core/parser_arguments/diagnose_vectors_parser.py +72 -0
  857. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  858. wisent/core/parser_arguments/evaluate_refusal_parser.py +32 -0
  859. wisent/core/parser_arguments/evaluate_responses_parser.py +12 -0
  860. wisent/core/parser_arguments/full_optimize_parser.py +194 -0
  861. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  862. wisent/core/parser_arguments/generate_pairs_parser.py +43 -0
  863. wisent/core/parser_arguments/generate_responses_parser.py +16 -0
  864. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +148 -0
  865. wisent/core/parser_arguments/generate_vector_from_task_parser.py +149 -0
  866. wisent/core/parser_arguments/generate_vector_parser.py +89 -0
  867. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  868. wisent/core/parser_arguments/inference_config_parser.py +65 -0
  869. wisent/core/parser_arguments/main_parser.py +220 -0
  870. wisent/core/parser_arguments/model_config_parser.py +59 -0
  871. wisent/core/parser_arguments/modify_weights_parser.py +309 -0
  872. wisent/core/parser_arguments/monitor_parser.py +17 -0
  873. wisent/core/parser_arguments/multi_steer_parser.py +48 -0
  874. wisent/core/parser_arguments/nonsense_parser.py +26 -0
  875. wisent/core/parser_arguments/optimization_cache_parser.py +64 -0
  876. wisent/core/parser_arguments/optimize_classification_parser.py +108 -0
  877. wisent/core/parser_arguments/optimize_parser.py +142 -0
  878. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  879. wisent/core/parser_arguments/optimize_steering_parser.py +617 -0
  880. wisent/core/parser_arguments/optimize_weights_parser.py +403 -0
  881. wisent/core/parser_arguments/synthetic_parser.py +117 -0
  882. wisent/core/parser_arguments/tasks_parser.py +591 -0
  883. wisent/core/parser_arguments/train_unified_goodness_parser.py +172 -0
  884. wisent/core/parser_arguments/utils.py +107 -0
  885. wisent/core/prompts/__init__.py +0 -0
  886. wisent/core/prompts/core/__init__.py +0 -0
  887. wisent/core/prompts/core/atom.py +57 -0
  888. wisent/core/prompts/core/prompt_formater.py +148 -0
  889. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  890. wisent/core/prompts/prompt_stratiegies/direct_completion.py +26 -0
  891. wisent/core/prompts/prompt_stratiegies/instruction_following.py +26 -0
  892. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +31 -0
  893. wisent/core/prompts/prompt_stratiegies/role_playing.py +33 -0
  894. wisent/core/representation.py +5 -0
  895. wisent/core/save_results.py +277 -0
  896. wisent/core/steering.py +660 -0
  897. wisent/core/steering_method.py +20 -0
  898. wisent/core/steering_methods/__init__.py +54 -0
  899. wisent/core/steering_methods/core/__init__.py +0 -0
  900. wisent/core/steering_methods/core/atoms.py +154 -0
  901. wisent/core/steering_methods/methods/__init__.py +0 -0
  902. wisent/core/steering_methods/methods/caa.py +45 -0
  903. wisent/core/steering_methods/methods/prism.py +588 -0
  904. wisent/core/steering_methods/methods/pulse.py +641 -0
  905. wisent/core/steering_methods/methods/titan.py +1005 -0
  906. wisent/core/steering_methods/preflight.py +322 -0
  907. wisent/core/steering_methods/registry.py +649 -0
  908. wisent/core/steering_methods/rotator.py +121 -0
  909. wisent/core/steering_optimizer.py +1503 -0
  910. wisent/core/synthetic/__init__.py +0 -0
  911. wisent/core/synthetic/cleaners/__init__.py +0 -0
  912. wisent/core/synthetic/cleaners/core/__init__.py +0 -0
  913. wisent/core/synthetic/cleaners/core/atoms.py +58 -0
  914. wisent/core/synthetic/cleaners/deduper_cleaner.py +53 -0
  915. wisent/core/synthetic/cleaners/methods/__init__.py +0 -0
  916. wisent/core/synthetic/cleaners/methods/base_dedupers.py +321 -0
  917. wisent/core/synthetic/cleaners/methods/base_refusalers.py +286 -0
  918. wisent/core/synthetic/cleaners/methods/core/__init__.py +0 -0
  919. wisent/core/synthetic/cleaners/methods/core/atoms.py +47 -0
  920. wisent/core/synthetic/cleaners/pairs_cleaner.py +90 -0
  921. wisent/core/synthetic/cleaners/refusaler_cleaner.py +133 -0
  922. wisent/core/synthetic/db_instructions/__init__.py +0 -0
  923. wisent/core/synthetic/db_instructions/core/__init__.py +0 -0
  924. wisent/core/synthetic/db_instructions/core/atoms.py +25 -0
  925. wisent/core/synthetic/db_instructions/mini_dp.py +115 -0
  926. wisent/core/synthetic/generators/__init__.py +0 -0
  927. wisent/core/synthetic/generators/core/__init__.py +0 -0
  928. wisent/core/synthetic/generators/core/atoms.py +73 -0
  929. wisent/core/synthetic/generators/diversities/__init__.py +0 -0
  930. wisent/core/synthetic/generators/diversities/core/__init__.py +0 -0
  931. wisent/core/synthetic/generators/diversities/core/core.py +68 -0
  932. wisent/core/synthetic/generators/diversities/methods/__init__.py +0 -0
  933. wisent/core/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  934. wisent/core/synthetic/generators/nonsense_generator.py +150 -0
  935. wisent/core/synthetic/generators/pairs_generator.py +313 -0
  936. wisent/core/task_interface.py +143 -0
  937. wisent/core/task_selector.py +232 -0
  938. wisent/core/tasks/__init__.py +218 -0
  939. wisent/core/tasks/aime_task.py +142 -0
  940. wisent/core/tasks/file_task.py +212 -0
  941. wisent/core/tasks/hle_task.py +180 -0
  942. wisent/core/tasks/hmmt_task.py +120 -0
  943. wisent/core/tasks/livecodebench_task.py +94 -0
  944. wisent/core/tasks/livemathbench_task.py +159 -0
  945. wisent/core/tasks/lm_eval_task.py +611 -0
  946. wisent/core/tasks/math500_task.py +84 -0
  947. wisent/core/tasks/polymath_task.py +147 -0
  948. wisent/core/tasks/supergpqa_task.py +220 -0
  949. wisent/core/time_estimator.py +155 -0
  950. wisent/core/timing_calibration.py +176 -0
  951. wisent/core/tracking/__init__.py +54 -0
  952. wisent/core/tracking/latency.py +620 -0
  953. wisent/core/tracking/memory.py +360 -0
  954. wisent/core/trainers/__init__.py +0 -0
  955. wisent/core/trainers/core/__init__.py +11 -0
  956. wisent/core/trainers/core/atoms.py +45 -0
  957. wisent/core/trainers/steering_trainer.py +365 -0
  958. wisent/core/universal_subspace.py +918 -0
  959. wisent/core/user_model_config.py +158 -0
  960. wisent/core/utils/__init__.py +64 -0
  961. wisent/core/utils/base_rotator.py +292 -0
  962. wisent/core/utils/dataset_splits.py +197 -0
  963. wisent/core/utils/device.py +279 -0
  964. wisent/core/weight_modification/__init__.py +134 -0
  965. wisent/core/weight_modification/additive.py +340 -0
  966. wisent/core/weight_modification/directional.py +1357 -0
  967. wisent/core/weight_modification/export.py +359 -0
  968. wisent/core/weight_modification/multi_direction.py +410 -0
  969. wisent/core/weight_modification/utils.py +236 -0
  970. wisent/core/wisent.py +660 -0
  971. wisent/examples/contrastive_pairs/humanization_human_vs_ai.json +2112 -0
  972. wisent/examples/scripts/1/test_basqueglue_evaluation.json +51 -0
  973. wisent/examples/scripts/1/test_basqueglue_pairs.json +14 -0
  974. wisent/examples/scripts/1/test_bec2016eu_evaluation.json +51 -0
  975. wisent/examples/scripts/1/test_bec2016eu_pairs.json +14 -0
  976. wisent/examples/scripts/1/test_belebele_evaluation.json +51 -0
  977. wisent/examples/scripts/1/test_belebele_pairs.json +14 -0
  978. wisent/examples/scripts/1/test_benchmarks_evaluation.json +51 -0
  979. wisent/examples/scripts/1/test_benchmarks_pairs.json +14 -0
  980. wisent/examples/scripts/1/test_bertaqa_evaluation.json +51 -0
  981. wisent/examples/scripts/1/test_bertaqa_pairs.json +14 -0
  982. wisent/examples/scripts/1/test_bhtc_v2_evaluation.json +30 -0
  983. wisent/examples/scripts/1/test_bhtc_v2_pairs.json +8 -0
  984. wisent/examples/scripts/1/test_boolq-seq2seq_evaluation.json +30 -0
  985. wisent/examples/scripts/1/test_boolq-seq2seq_pairs.json +8 -0
  986. wisent/examples/scripts/1/test_cabreu_evaluation.json +30 -0
  987. wisent/examples/scripts/1/test_cabreu_pairs.json +8 -0
  988. wisent/examples/scripts/1/test_careqa_en_evaluation.json +30 -0
  989. wisent/examples/scripts/1/test_careqa_en_pairs.json +8 -0
  990. wisent/examples/scripts/1/test_careqa_evaluation.json +30 -0
  991. wisent/examples/scripts/1/test_careqa_pairs.json +8 -0
  992. wisent/examples/scripts/1/test_catalanqa_evaluation.json +30 -0
  993. wisent/examples/scripts/1/test_catalanqa_pairs.json +8 -0
  994. wisent/examples/scripts/1/test_catcola_evaluation.json +30 -0
  995. wisent/examples/scripts/1/test_catcola_pairs.json +8 -0
  996. wisent/examples/scripts/1/test_chartqa_evaluation.json +30 -0
  997. wisent/examples/scripts/1/test_chartqa_pairs.json +8 -0
  998. wisent/examples/scripts/1/test_claim_stance_topic_evaluation.json +30 -0
  999. wisent/examples/scripts/1/test_claim_stance_topic_pairs.json +8 -0
  1000. wisent/examples/scripts/1/test_cnn_dailymail_evaluation.json +30 -0
  1001. wisent/examples/scripts/1/test_cnn_dailymail_pairs.json +8 -0
  1002. wisent/examples/scripts/1/test_cocoteros_es_evaluation.json +30 -0
  1003. wisent/examples/scripts/1/test_cocoteros_es_pairs.json +8 -0
  1004. wisent/examples/scripts/1/test_coedit_gec_evaluation.json +30 -0
  1005. wisent/examples/scripts/1/test_coedit_gec_pairs.json +8 -0
  1006. wisent/examples/scripts/1/test_cola_evaluation.json +30 -0
  1007. wisent/examples/scripts/1/test_cola_pairs.json +8 -0
  1008. wisent/examples/scripts/1/test_coqcat_evaluation.json +30 -0
  1009. wisent/examples/scripts/1/test_coqcat_pairs.json +8 -0
  1010. wisent/examples/scripts/1/test_dbpedia_14_evaluation.json +30 -0
  1011. wisent/examples/scripts/1/test_dbpedia_14_pairs.json +8 -0
  1012. wisent/examples/scripts/1/test_epec_koref_bin_evaluation.json +30 -0
  1013. wisent/examples/scripts/1/test_epec_koref_bin_pairs.json +8 -0
  1014. wisent/examples/scripts/1/test_ethos_binary_evaluation.json +30 -0
  1015. wisent/examples/scripts/1/test_ethos_binary_pairs.json +8 -0
  1016. wisent/examples/scripts/2/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1017. wisent/examples/scripts/2/test_afrimgsm_direct_amh_pairs.json +8 -0
  1018. wisent/examples/scripts/2/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1019. wisent/examples/scripts/2/test_afrimmlu_direct_amh_pairs.json +8 -0
  1020. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1021. wisent/examples/scripts/2/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1022. wisent/examples/scripts/2/test_arc_ar_evaluation.json +30 -0
  1023. wisent/examples/scripts/2/test_arc_ar_pairs.json +8 -0
  1024. wisent/examples/scripts/2/test_atis_evaluation.json +30 -0
  1025. wisent/examples/scripts/2/test_atis_pairs.json +8 -0
  1026. wisent/examples/scripts/2/test_babi_evaluation.json +30 -0
  1027. wisent/examples/scripts/2/test_babi_pairs.json +8 -0
  1028. wisent/examples/scripts/2/test_babilong_evaluation.json +30 -0
  1029. wisent/examples/scripts/2/test_babilong_pairs.json +8 -0
  1030. wisent/examples/scripts/2/test_bangla_mmlu_evaluation.json +30 -0
  1031. wisent/examples/scripts/2/test_bangla_mmlu_pairs.json +8 -0
  1032. wisent/examples/scripts/2/test_basque-glue_pairs.json +14 -0
  1033. wisent/examples/scripts/benchmark_tags.json +2140 -0
  1034. wisent/examples/scripts/lm_eval_readme.json +4 -0
  1035. wisent/examples/scripts/results/benchmark_descriptions.json +1244 -0
  1036. wisent/examples/scripts/results/benchmark_evaluation_methods.json +66 -0
  1037. wisent/examples/scripts/results/benchmark_evaluator_mapping.json +2781 -0
  1038. wisent/examples/scripts/results/benchmark_evaluator_mapping_updated.json +30536 -0
  1039. wisent/examples/scripts/results/benchmark_evaluators_clean.json +469 -0
  1040. wisent/examples/scripts/results/benchmark_methods_summary.json +260 -0
  1041. wisent/examples/scripts/results/benchmark_pair_creation_methods.json +66 -0
  1042. wisent/examples/scripts/results/benchmark_pair_totals.json +269 -0
  1043. wisent/examples/scripts/results/benchmark_tags.json +917 -0
  1044. wisent/examples/scripts/results/benchmark_test_summary_nov4.json +71 -0
  1045. wisent/examples/scripts/results/coding_benchmarks_test_code_status.json +150 -0
  1046. wisent/examples/scripts/results/failing_benchmarks.json +946 -0
  1047. wisent/examples/scripts/results/failing_benchmarks_list.json +41 -0
  1048. wisent/examples/scripts/results/failing_benchmarks_test_results.json +945 -0
  1049. wisent/examples/scripts/results/missing_benchmark_tags.json +341 -0
  1050. wisent/examples/scripts/results/test_20_newsgroups_evaluation.json +30 -0
  1051. wisent/examples/scripts/results/test_20_newsgroups_pairs.json +8 -0
  1052. wisent/examples/scripts/results/test_AraDICE_evaluation.json +51 -0
  1053. wisent/examples/scripts/results/test_AraDICE_pairs.json +14 -0
  1054. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_evaluation.json +30 -0
  1055. wisent/examples/scripts/results/test_AraDiCE_boolq_egy/test_AraDiCE_boolq_egy_pairs.json +8 -0
  1056. wisent/examples/scripts/results/test_ArabCulture_evaluation.json +51 -0
  1057. wisent/examples/scripts/results/test_ArabCulture_pairs.json +14 -0
  1058. wisent/examples/scripts/results/test_Tag_evaluation.json +30 -0
  1059. wisent/examples/scripts/results/test_Tag_pairs.json +8 -0
  1060. wisent/examples/scripts/results/test_aclue_evaluation.json +51 -0
  1061. wisent/examples/scripts/results/test_aclue_pairs.json +14 -0
  1062. wisent/examples/scripts/results/test_acp_bench_evaluation.json +51 -0
  1063. wisent/examples/scripts/results/test_acp_bench_hard_evaluation.json +51 -0
  1064. wisent/examples/scripts/results/test_acp_bench_hard_pairs.json +14 -0
  1065. wisent/examples/scripts/results/test_acp_bench_pairs.json +14 -0
  1066. wisent/examples/scripts/results/test_advanced_ai_risk_evaluation.json +51 -0
  1067. wisent/examples/scripts/results/test_advanced_ai_risk_pairs.json +14 -0
  1068. wisent/examples/scripts/results/test_aexams_evaluation.json +51 -0
  1069. wisent/examples/scripts/results/test_aexams_pairs.json +14 -0
  1070. wisent/examples/scripts/results/test_afrimgsm_direct_amh_evaluation.json +30 -0
  1071. wisent/examples/scripts/results/test_afrimgsm_direct_amh_pairs.json +8 -0
  1072. wisent/examples/scripts/results/test_afrimmlu_direct_amh_evaluation.json +30 -0
  1073. wisent/examples/scripts/results/test_afrimmlu_direct_amh_pairs.json +8 -0
  1074. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_evaluation.json +30 -0
  1075. wisent/examples/scripts/results/test_afrixnli_en_direct_amh_pairs.json +8 -0
  1076. wisent/examples/scripts/results/test_ag_news_evaluation.json +30 -0
  1077. wisent/examples/scripts/results/test_ag_news_pairs.json +8 -0
  1078. wisent/examples/scripts/results/test_agieval_evaluation.json +51 -0
  1079. wisent/examples/scripts/results/test_agieval_pairs.json +14 -0
  1080. wisent/examples/scripts/results/test_aime2024_evaluation.json +30 -0
  1081. wisent/examples/scripts/results/test_aime2024_pairs.json +8 -0
  1082. wisent/examples/scripts/results/test_aime2025_evaluation.json +30 -0
  1083. wisent/examples/scripts/results/test_aime2025_pairs.json +8 -0
  1084. wisent/examples/scripts/results/test_aime_evaluation.json +30 -0
  1085. wisent/examples/scripts/results/test_aime_pairs.json +8 -0
  1086. wisent/examples/scripts/results/test_anagrams1_evaluation.json +30 -0
  1087. wisent/examples/scripts/results/test_anagrams1_pairs.json +8 -0
  1088. wisent/examples/scripts/results/test_anagrams2_evaluation.json +30 -0
  1089. wisent/examples/scripts/results/test_anagrams2_pairs.json +8 -0
  1090. wisent/examples/scripts/results/test_anli_evaluation.json +30 -0
  1091. wisent/examples/scripts/results/test_anli_pairs.json +8 -0
  1092. wisent/examples/scripts/results/test_apps_evaluation.json +30 -0
  1093. wisent/examples/scripts/results/test_apps_pairs.json +8 -0
  1094. wisent/examples/scripts/results/test_arabic_exams_evaluation.json +30 -0
  1095. wisent/examples/scripts/results/test_arabic_exams_pairs.json +8 -0
  1096. wisent/examples/scripts/results/test_arabic_leaderboard_complete_evaluation.json +51 -0
  1097. wisent/examples/scripts/results/test_arabic_leaderboard_complete_pairs.json +14 -0
  1098. wisent/examples/scripts/results/test_arabic_leaderboard_light_evaluation.json +51 -0
  1099. wisent/examples/scripts/results/test_arabic_leaderboard_light_pairs.json +14 -0
  1100. wisent/examples/scripts/results/test_arabicmmlu_evaluation.json +51 -0
  1101. wisent/examples/scripts/results/test_arabicmmlu_pairs.json +14 -0
  1102. wisent/examples/scripts/results/test_aradice/test_aradice_evaluation.json +51 -0
  1103. wisent/examples/scripts/results/test_aradice/test_aradice_pairs.json +14 -0
  1104. wisent/examples/scripts/results/test_aradice3/test_aradice_evaluation.json +51 -0
  1105. wisent/examples/scripts/results/test_aradice3/test_aradice_pairs.json +14 -0
  1106. wisent/examples/scripts/results/test_arc_ar_evaluation.json +30 -0
  1107. wisent/examples/scripts/results/test_arc_ar_pairs.json +8 -0
  1108. wisent/examples/scripts/results/test_arc_challenge_evaluation.json +30 -0
  1109. wisent/examples/scripts/results/test_arc_challenge_pairs.json +8 -0
  1110. wisent/examples/scripts/results/test_arc_easy_evaluation.json +30 -0
  1111. wisent/examples/scripts/results/test_arc_easy_pairs.json +8 -0
  1112. wisent/examples/scripts/results/test_argument_topic_evaluation.json +30 -0
  1113. wisent/examples/scripts/results/test_argument_topic_pairs.json +8 -0
  1114. wisent/examples/scripts/results/test_arithmetic_evaluation.json +51 -0
  1115. wisent/examples/scripts/results/test_arithmetic_pairs.json +14 -0
  1116. wisent/examples/scripts/results/test_asdiv_evaluation.json +30 -0
  1117. wisent/examples/scripts/results/test_asdiv_pairs.json +8 -0
  1118. wisent/examples/scripts/results/test_assin_entailment_evaluation.json +30 -0
  1119. wisent/examples/scripts/results/test_assin_entailment_pairs.json +8 -0
  1120. wisent/examples/scripts/results/test_atis_evaluation.json +30 -0
  1121. wisent/examples/scripts/results/test_atis_pairs.json +8 -0
  1122. wisent/examples/scripts/results/test_babi_evaluation.json +30 -0
  1123. wisent/examples/scripts/results/test_babi_pairs.json +8 -0
  1124. wisent/examples/scripts/results/test_babilong_evaluation.json +30 -0
  1125. wisent/examples/scripts/results/test_babilong_pairs.json +8 -0
  1126. wisent/examples/scripts/results/test_bangla_mmlu_evaluation.json +30 -0
  1127. wisent/examples/scripts/results/test_bangla_mmlu_pairs.json +8 -0
  1128. wisent/examples/scripts/results/test_banking77_evaluation.json +30 -0
  1129. wisent/examples/scripts/results/test_banking77_pairs.json +8 -0
  1130. wisent/examples/scripts/results/test_basque/test_basque-glue_pairs.json +14 -0
  1131. wisent/examples/scripts/results/test_basque-glue_evaluation.json +51 -0
  1132. wisent/examples/scripts/results/test_basque-glue_pairs.json +14 -0
  1133. wisent/examples/scripts/results/test_basque2/test_basque-glue_evaluation.json +51 -0
  1134. wisent/examples/scripts/results/test_basque2/test_basque-glue_pairs.json +14 -0
  1135. wisent/examples/scripts/results/test_basque_bench_evaluation.json +51 -0
  1136. wisent/examples/scripts/results/test_basque_bench_pairs.json +14 -0
  1137. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_evaluation.json +51 -0
  1138. wisent/examples/scripts/results/test_basque_glue/test_basque-glue_pairs.json +14 -0
  1139. wisent/examples/scripts/results/test_basqueglue_evaluation.json +51 -0
  1140. wisent/examples/scripts/results/test_basqueglue_pairs.json +14 -0
  1141. wisent/examples/scripts/results/test_bbh_evaluation.json +51 -0
  1142. wisent/examples/scripts/results/test_bbh_pairs.json +14 -0
  1143. wisent/examples/scripts/results/test_bbq_evaluation.json +30 -0
  1144. wisent/examples/scripts/results/test_bbq_pairs.json +8 -0
  1145. wisent/examples/scripts/results/test_bec2016eu_evaluation.json +51 -0
  1146. wisent/examples/scripts/results/test_bec2016eu_pairs.json +14 -0
  1147. wisent/examples/scripts/results/test_belebele_evaluation.json +51 -0
  1148. wisent/examples/scripts/results/test_belebele_pairs.json +14 -0
  1149. wisent/examples/scripts/results/test_benchmarks_evaluation.json +51 -0
  1150. wisent/examples/scripts/results/test_benchmarks_pairs.json +14 -0
  1151. wisent/examples/scripts/results/test_bertaqa_evaluation.json +51 -0
  1152. wisent/examples/scripts/results/test_bertaqa_pairs.json +14 -0
  1153. wisent/examples/scripts/results/test_bhtc_v2_evaluation.json +30 -0
  1154. wisent/examples/scripts/results/test_bhtc_v2_pairs.json +8 -0
  1155. wisent/examples/scripts/results/test_bigbench_evaluation.json +51 -0
  1156. wisent/examples/scripts/results/test_bigbench_pairs.json +14 -0
  1157. wisent/examples/scripts/results/test_blimp_evaluation.json +51 -0
  1158. wisent/examples/scripts/results/test_blimp_pairs.json +14 -0
  1159. wisent/examples/scripts/results/test_boolq/test_boolq_evaluation.json +30 -0
  1160. wisent/examples/scripts/results/test_boolq/test_boolq_pairs.json +8 -0
  1161. wisent/examples/scripts/results/test_boolq-seq2seq_evaluation.json +30 -0
  1162. wisent/examples/scripts/results/test_boolq-seq2seq_pairs.json +8 -0
  1163. wisent/examples/scripts/results/test_boolq_evaluation.json +30 -0
  1164. wisent/examples/scripts/results/test_boolq_pairs.json +8 -0
  1165. wisent/examples/scripts/results/test_c4_evaluation.json +30 -0
  1166. wisent/examples/scripts/results/test_c4_pairs.json +8 -0
  1167. wisent/examples/scripts/results/test_cabreu_evaluation.json +30 -0
  1168. wisent/examples/scripts/results/test_cabreu_pairs.json +8 -0
  1169. wisent/examples/scripts/results/test_careqa_evaluation.json +30 -0
  1170. wisent/examples/scripts/results/test_careqa_pairs.json +8 -0
  1171. wisent/examples/scripts/results/test_catalan_bench_evaluation.json +51 -0
  1172. wisent/examples/scripts/results/test_catalan_bench_pairs.json +14 -0
  1173. wisent/examples/scripts/results/test_catalanqa_evaluation.json +30 -0
  1174. wisent/examples/scripts/results/test_catalanqa_pairs.json +8 -0
  1175. wisent/examples/scripts/results/test_catcola_evaluation.json +30 -0
  1176. wisent/examples/scripts/results/test_catcola_pairs.json +8 -0
  1177. wisent/examples/scripts/results/test_cb_evaluation.json +30 -0
  1178. wisent/examples/scripts/results/test_cb_pairs.json +8 -0
  1179. wisent/examples/scripts/results/test_ceval/test_ceval_evaluation.json +51 -0
  1180. wisent/examples/scripts/results/test_ceval/test_ceval_pairs.json +14 -0
  1181. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_evaluation.json +30 -0
  1182. wisent/examples/scripts/results/test_ceval_accountant/test_ceval-valid_accountant_pairs.json +8 -0
  1183. wisent/examples/scripts/results/test_ceval_evaluation.json +51 -0
  1184. wisent/examples/scripts/results/test_ceval_pairs.json +14 -0
  1185. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_evaluation.json +51 -0
  1186. wisent/examples/scripts/results/test_ceval_valid/test_ceval_valid_pairs.json +14 -0
  1187. wisent/examples/scripts/results/test_chain_of_thought_evaluation.json +51 -0
  1188. wisent/examples/scripts/results/test_chain_of_thought_pairs.json +14 -0
  1189. wisent/examples/scripts/results/test_chartqa_evaluation.json +30 -0
  1190. wisent/examples/scripts/results/test_chartqa_pairs.json +8 -0
  1191. wisent/examples/scripts/results/test_claim_stance_topic_evaluation.json +30 -0
  1192. wisent/examples/scripts/results/test_claim_stance_topic_pairs.json +8 -0
  1193. wisent/examples/scripts/results/test_cmmlu_evaluation.json +51 -0
  1194. wisent/examples/scripts/results/test_cmmlu_pairs.json +14 -0
  1195. wisent/examples/scripts/results/test_cnn_dailymail_evaluation.json +30 -0
  1196. wisent/examples/scripts/results/test_cnn_dailymail_pairs.json +8 -0
  1197. wisent/examples/scripts/results/test_cocoteros_es_evaluation.json +30 -0
  1198. wisent/examples/scripts/results/test_cocoteros_es_pairs.json +8 -0
  1199. wisent/examples/scripts/results/test_codexglue_code_to_text_go_evaluation.json +30 -0
  1200. wisent/examples/scripts/results/test_codexglue_code_to_text_go_pairs.json +8 -0
  1201. wisent/examples/scripts/results/test_codexglue_code_to_text_java_evaluation.json +30 -0
  1202. wisent/examples/scripts/results/test_codexglue_code_to_text_java_pairs.json +8 -0
  1203. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_evaluation.json +30 -0
  1204. wisent/examples/scripts/results/test_codexglue_code_to_text_javascript_pairs.json +8 -0
  1205. wisent/examples/scripts/results/test_codexglue_code_to_text_php_evaluation.json +30 -0
  1206. wisent/examples/scripts/results/test_codexglue_code_to_text_php_pairs.json +8 -0
  1207. wisent/examples/scripts/results/test_codexglue_code_to_text_python_evaluation.json +30 -0
  1208. wisent/examples/scripts/results/test_codexglue_code_to_text_python_pairs.json +8 -0
  1209. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_evaluation.json +30 -0
  1210. wisent/examples/scripts/results/test_codexglue_code_to_text_ruby_pairs.json +8 -0
  1211. wisent/examples/scripts/results/test_coedit_gec_evaluation.json +30 -0
  1212. wisent/examples/scripts/results/test_coedit_gec_pairs.json +8 -0
  1213. wisent/examples/scripts/results/test_cola_evaluation.json +30 -0
  1214. wisent/examples/scripts/results/test_cola_pairs.json +8 -0
  1215. wisent/examples/scripts/results/test_commonsense_qa_evaluation.json +30 -0
  1216. wisent/examples/scripts/results/test_commonsense_qa_pairs.json +8 -0
  1217. wisent/examples/scripts/results/test_conala_evaluation.json +30 -0
  1218. wisent/examples/scripts/results/test_conala_pairs.json +8 -0
  1219. wisent/examples/scripts/results/test_concode_evaluation.json +30 -0
  1220. wisent/examples/scripts/results/test_concode_pairs.json +8 -0
  1221. wisent/examples/scripts/results/test_copa_evaluation.json +30 -0
  1222. wisent/examples/scripts/results/test_copa_pairs.json +8 -0
  1223. wisent/examples/scripts/results/test_copal_id_evaluation.json +30 -0
  1224. wisent/examples/scripts/results/test_copal_id_pairs.json +8 -0
  1225. wisent/examples/scripts/results/test_coqa_evaluation.json +30 -0
  1226. wisent/examples/scripts/results/test_coqa_pairs.json +8 -0
  1227. wisent/examples/scripts/results/test_coqcat_evaluation.json +30 -0
  1228. wisent/examples/scripts/results/test_coqcat_pairs.json +8 -0
  1229. wisent/examples/scripts/results/test_crows_pairs_evaluation.json +51 -0
  1230. wisent/examples/scripts/results/test_crows_pairs_pairs.json +14 -0
  1231. wisent/examples/scripts/results/test_csatqa_evaluation.json +51 -0
  1232. wisent/examples/scripts/results/test_csatqa_pairs.json +14 -0
  1233. wisent/examples/scripts/results/test_cycle_letters_evaluation.json +30 -0
  1234. wisent/examples/scripts/results/test_cycle_letters_pairs.json +8 -0
  1235. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_evaluation.json +51 -0
  1236. wisent/examples/scripts/results/test_darija_bench/test_darija_bench_pairs.json +14 -0
  1237. wisent/examples/scripts/results/test_darija_bench_evaluation.json +51 -0
  1238. wisent/examples/scripts/results/test_darija_bench_pairs.json +14 -0
  1239. wisent/examples/scripts/results/test_darijahellaswag_evaluation.json +30 -0
  1240. wisent/examples/scripts/results/test_darijahellaswag_pairs.json +8 -0
  1241. wisent/examples/scripts/results/test_darijammlu_evaluation.json +51 -0
  1242. wisent/examples/scripts/results/test_darijammlu_pairs.json +14 -0
  1243. wisent/examples/scripts/results/test_dbpedia_14_evaluation.json +30 -0
  1244. wisent/examples/scripts/results/test_dbpedia_14_pairs.json +8 -0
  1245. wisent/examples/scripts/results/test_drop_evaluation.json +30 -0
  1246. wisent/examples/scripts/results/test_drop_pairs.json +8 -0
  1247. wisent/examples/scripts/results/test_ds1000_evaluation.json +30 -0
  1248. wisent/examples/scripts/results/test_ds1000_pairs.json +8 -0
  1249. wisent/examples/scripts/results/test_egyhellaswag_evaluation.json +30 -0
  1250. wisent/examples/scripts/results/test_egyhellaswag_pairs.json +8 -0
  1251. wisent/examples/scripts/results/test_egymmlu_evaluation.json +51 -0
  1252. wisent/examples/scripts/results/test_egymmlu_pairs.json +14 -0
  1253. wisent/examples/scripts/results/test_epec_koref_bin_evaluation.json +30 -0
  1254. wisent/examples/scripts/results/test_epec_koref_bin_pairs.json +8 -0
  1255. wisent/examples/scripts/results/test_eq_bench_evaluation.json +30 -0
  1256. wisent/examples/scripts/results/test_eq_bench_pairs.json +8 -0
  1257. wisent/examples/scripts/results/test_escola_evaluation.json +30 -0
  1258. wisent/examples/scripts/results/test_escola_pairs.json +8 -0
  1259. wisent/examples/scripts/results/test_ethics_cm_evaluation.json +30 -0
  1260. wisent/examples/scripts/results/test_ethics_cm_pairs.json +8 -0
  1261. wisent/examples/scripts/results/test_ethos_binary_evaluation.json +30 -0
  1262. wisent/examples/scripts/results/test_ethos_binary_pairs.json +8 -0
  1263. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_evaluation.json +51 -0
  1264. wisent/examples/scripts/results/test_eus_exams/test_eus_exams_pairs.json +14 -0
  1265. wisent/examples/scripts/results/test_eus_exams_es_evaluation.json +51 -0
  1266. wisent/examples/scripts/results/test_eus_exams_es_pairs.json +14 -0
  1267. wisent/examples/scripts/results/test_eus_exams_evaluation.json +51 -0
  1268. wisent/examples/scripts/results/test_eus_exams_pairs.json +14 -0
  1269. wisent/examples/scripts/results/test_eus_proficiency_evaluation.json +30 -0
  1270. wisent/examples/scripts/results/test_eus_proficiency_pairs.json +8 -0
  1271. wisent/examples/scripts/results/test_eus_reading_evaluation.json +30 -0
  1272. wisent/examples/scripts/results/test_eus_reading_pairs.json +8 -0
  1273. wisent/examples/scripts/results/test_eus_trivia_evaluation.json +30 -0
  1274. wisent/examples/scripts/results/test_eus_trivia_pairs.json +8 -0
  1275. wisent/examples/scripts/results/test_evalita-mp_evaluation.json +51 -0
  1276. wisent/examples/scripts/results/test_evalita-mp_pairs.json +14 -0
  1277. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1278. wisent/examples/scripts/results/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1279. wisent/examples/scripts/results/test_evalita_LLM_evaluation.json +51 -0
  1280. wisent/examples/scripts/results/test_evalita_LLM_pairs.json +14 -0
  1281. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_evaluation.json +51 -0
  1282. wisent/examples/scripts/results/test_evalita_llm/test_evalita_llm_pairs.json +14 -0
  1283. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_evaluation.json +30 -0
  1284. wisent/examples/scripts/results/test_evalita_mp/test_evalita-mp_te_prompt-1_pairs.json +8 -0
  1285. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_evaluation.json +51 -0
  1286. wisent/examples/scripts/results/test_evalita_mp2/test_evalita_mp_pairs.json +14 -0
  1287. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_evaluation.json +30 -0
  1288. wisent/examples/scripts/results/test_evalita_sp2/test_evalita-sp_sum_task_fp-small_p1_pairs.json +8 -0
  1289. wisent/examples/scripts/results/test_fda_evaluation.json +30 -0
  1290. wisent/examples/scripts/results/test_fda_pairs.json +8 -0
  1291. wisent/examples/scripts/results/test_financial_tweets_evaluation.json +30 -0
  1292. wisent/examples/scripts/results/test_financial_tweets_pairs.json +8 -0
  1293. wisent/examples/scripts/results/test_fld/test_fld_evaluation.json +30 -0
  1294. wisent/examples/scripts/results/test_fld/test_fld_pairs.json +8 -0
  1295. wisent/examples/scripts/results/test_fld_evaluation.json +30 -0
  1296. wisent/examples/scripts/results/test_fld_fixed/test_fld_evaluation.json +30 -0
  1297. wisent/examples/scripts/results/test_fld_fixed/test_fld_pairs.json +8 -0
  1298. wisent/examples/scripts/results/test_fld_pairs.json +8 -0
  1299. wisent/examples/scripts/results/test_flores_evaluation.json +51 -0
  1300. wisent/examples/scripts/results/test_flores_pairs.json +14 -0
  1301. wisent/examples/scripts/results/test_freebase_evaluation.json +30 -0
  1302. wisent/examples/scripts/results/test_freebase_pairs.json +8 -0
  1303. wisent/examples/scripts/results/test_french_bench_evaluation.json +51 -0
  1304. wisent/examples/scripts/results/test_french_bench_pairs.json +14 -0
  1305. wisent/examples/scripts/results/test_galcola_evaluation.json +30 -0
  1306. wisent/examples/scripts/results/test_galcola_pairs.json +8 -0
  1307. wisent/examples/scripts/results/test_galician_bench_evaluation.json +51 -0
  1308. wisent/examples/scripts/results/test_galician_bench_pairs.json +14 -0
  1309. wisent/examples/scripts/results/test_glianorex_evaluation.json +30 -0
  1310. wisent/examples/scripts/results/test_glianorex_pairs.json +8 -0
  1311. wisent/examples/scripts/results/test_global_mmlu_evaluation.json +51 -0
  1312. wisent/examples/scripts/results/test_global_mmlu_pairs.json +14 -0
  1313. wisent/examples/scripts/results/test_glue_evaluation.json +51 -0
  1314. wisent/examples/scripts/results/test_glue_pairs.json +14 -0
  1315. wisent/examples/scripts/results/test_gpqa_evaluation.json +51 -0
  1316. wisent/examples/scripts/results/test_gpqa_pairs.json +14 -0
  1317. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_evaluation.json +51 -0
  1318. wisent/examples/scripts/results/test_gpt3_translation_benchmarks_pairs.json +14 -0
  1319. wisent/examples/scripts/results/test_groundcocoa_evaluation.json +30 -0
  1320. wisent/examples/scripts/results/test_groundcocoa_pairs.json +8 -0
  1321. wisent/examples/scripts/results/test_gsm8k_evaluation.json +30 -0
  1322. wisent/examples/scripts/results/test_gsm8k_pairs.json +8 -0
  1323. wisent/examples/scripts/results/test_haerae_evaluation.json +51 -0
  1324. wisent/examples/scripts/results/test_haerae_pairs.json +14 -0
  1325. wisent/examples/scripts/results/test_headqa_evaluation.json +30 -0
  1326. wisent/examples/scripts/results/test_headqa_pairs.json +8 -0
  1327. wisent/examples/scripts/results/test_hellaswag_evaluation.json +30 -0
  1328. wisent/examples/scripts/results/test_hellaswag_pairs.json +8 -0
  1329. wisent/examples/scripts/results/test_hendrycks_ethics_evaluation.json +51 -0
  1330. wisent/examples/scripts/results/test_hendrycks_ethics_pairs.json +14 -0
  1331. wisent/examples/scripts/results/test_hendrycks_math_evaluation.json +51 -0
  1332. wisent/examples/scripts/results/test_hendrycks_math_pairs.json +14 -0
  1333. wisent/examples/scripts/results/test_histoires_morales_evaluation.json +30 -0
  1334. wisent/examples/scripts/results/test_histoires_morales_pairs.json +8 -0
  1335. wisent/examples/scripts/results/test_hmmt_evaluation.json +30 -0
  1336. wisent/examples/scripts/results/test_hmmt_feb_2025_evaluation.json +30 -0
  1337. wisent/examples/scripts/results/test_hmmt_feb_2025_pairs.json +8 -0
  1338. wisent/examples/scripts/results/test_hmmt_pairs.json +8 -0
  1339. wisent/examples/scripts/results/test_hrm8k_evaluation.json +51 -0
  1340. wisent/examples/scripts/results/test_hrm8k_pairs.json +14 -0
  1341. wisent/examples/scripts/results/test_humaneval_evaluation.json +30 -0
  1342. wisent/examples/scripts/results/test_humaneval_pairs.json +8 -0
  1343. wisent/examples/scripts/results/test_humaneval_plus_evaluation.json +30 -0
  1344. wisent/examples/scripts/results/test_humaneval_plus_pairs.json +8 -0
  1345. wisent/examples/scripts/results/test_ifeval_evaluation.json +30 -0
  1346. wisent/examples/scripts/results/test_ifeval_pairs.json +8 -0
  1347. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_evaluation.json +30 -0
  1348. wisent/examples/scripts/results/test_instruct_humaneval/test_instruct_humaneval_pairs.json +8 -0
  1349. wisent/examples/scripts/results/test_instruct_humaneval_evaluation.json +30 -0
  1350. wisent/examples/scripts/results/test_instruct_humaneval_pairs.json +8 -0
  1351. wisent/examples/scripts/results/test_inverse_scaling_evaluation.json +51 -0
  1352. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_evaluation.json +30 -0
  1353. wisent/examples/scripts/results/test_inverse_scaling_hindsight_neglect_10shot_pairs.json +8 -0
  1354. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_evaluation.json +51 -0
  1355. wisent/examples/scripts/results/test_inverse_scaling_mc/test_inverse_scaling_mc_pairs.json +14 -0
  1356. wisent/examples/scripts/results/test_inverse_scaling_pairs.json +14 -0
  1357. wisent/examples/scripts/results/test_iwslt2017-ar-en_evaluation.json +30 -0
  1358. wisent/examples/scripts/results/test_iwslt2017-ar-en_pairs.json +8 -0
  1359. wisent/examples/scripts/results/test_iwslt2017-en-ar_evaluation.json +30 -0
  1360. wisent/examples/scripts/results/test_iwslt2017-en-ar_pairs.json +8 -0
  1361. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_evaluation.json +30 -0
  1362. wisent/examples/scripts/results/test_iwslt2017_ar_en/test_iwslt2017-ar-en_pairs.json +8 -0
  1363. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_evaluation.json +30 -0
  1364. wisent/examples/scripts/results/test_iwslt2017_en_ar/test_iwslt2017-en-ar_pairs.json +8 -0
  1365. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_evaluation.json +30 -0
  1366. wisent/examples/scripts/results/test_iwslt2017_group/test_iwslt2017_pairs.json +8 -0
  1367. wisent/examples/scripts/results/test_japanese_leaderboard_evaluation.json +51 -0
  1368. wisent/examples/scripts/results/test_japanese_leaderboard_pairs.json +14 -0
  1369. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_evaluation.json +30 -0
  1370. wisent/examples/scripts/results/test_jsonschema_bench/test_jsonschema_bench_pairs.json +8 -0
  1371. wisent/examples/scripts/results/test_jsonschema_bench_evaluation.json +30 -0
  1372. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_evaluation.json +30 -0
  1373. wisent/examples/scripts/results/test_jsonschema_bench_final/test_jsonschema_bench_pairs.json +8 -0
  1374. wisent/examples/scripts/results/test_jsonschema_bench_pairs.json +8 -0
  1375. wisent/examples/scripts/results/test_kbl_evaluation.json +51 -0
  1376. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_evaluation.json +51 -0
  1377. wisent/examples/scripts/results/test_kbl_fixed/test_kbl_pairs.json +14 -0
  1378. wisent/examples/scripts/results/test_kbl_pairs.json +14 -0
  1379. wisent/examples/scripts/results/test_kmmlu_evaluation.json +51 -0
  1380. wisent/examples/scripts/results/test_kmmlu_pairs.json +14 -0
  1381. wisent/examples/scripts/results/test_kobest_evaluation.json +51 -0
  1382. wisent/examples/scripts/results/test_kobest_pairs.json +14 -0
  1383. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_evaluation.json +30 -0
  1384. wisent/examples/scripts/results/test_kormedmcqa/test_kormedmcqa_pairs.json +8 -0
  1385. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_evaluation.json +30 -0
  1386. wisent/examples/scripts/results/test_kormedmcqa_dentist/test_kormedmcqa_dentist_pairs.json +8 -0
  1387. wisent/examples/scripts/results/test_kormedmcqa_evaluation.json +30 -0
  1388. wisent/examples/scripts/results/test_kormedmcqa_pairs.json +8 -0
  1389. wisent/examples/scripts/results/test_lambada_cloze_evaluation.json +30 -0
  1390. wisent/examples/scripts/results/test_lambada_cloze_pairs.json +8 -0
  1391. wisent/examples/scripts/results/test_lambada_evaluation.json +30 -0
  1392. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1393. wisent/examples/scripts/results/test_lambada_final/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1394. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_evaluation.json +51 -0
  1395. wisent/examples/scripts/results/test_lambada_multilingual/test_lambada_multilingual_pairs.json +14 -0
  1396. wisent/examples/scripts/results/test_lambada_multilingual_evaluation.json +51 -0
  1397. wisent/examples/scripts/results/test_lambada_multilingual_pairs.json +14 -0
  1398. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_evaluation.json +51 -0
  1399. wisent/examples/scripts/results/test_lambada_multilingual_stablelm_pairs.json +14 -0
  1400. wisent/examples/scripts/results/test_lambada_openai_evaluation.json +30 -0
  1401. wisent/examples/scripts/results/test_lambada_openai_pairs.json +8 -0
  1402. wisent/examples/scripts/results/test_lambada_pairs.json +8 -0
  1403. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1404. wisent/examples/scripts/results/test_lambada_stablelm_en_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1405. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_evaluation.json +30 -0
  1406. wisent/examples/scripts/results/test_lambada_stablelm_fixed/test_lambada_openai_mt_stablelm_en_pairs.json +8 -0
  1407. wisent/examples/scripts/results/test_lambada_standard_evaluation.json +30 -0
  1408. wisent/examples/scripts/results/test_lambada_standard_pairs.json +8 -0
  1409. wisent/examples/scripts/results/test_leaderboard_evaluation.json +51 -0
  1410. wisent/examples/scripts/results/test_leaderboard_pairs.json +14 -0
  1411. wisent/examples/scripts/results/test_libra/test_libra_evaluation.json +51 -0
  1412. wisent/examples/scripts/results/test_libra/test_libra_pairs.json +14 -0
  1413. wisent/examples/scripts/results/test_libra_evaluation.json +51 -0
  1414. wisent/examples/scripts/results/test_libra_pairs.json +14 -0
  1415. wisent/examples/scripts/results/test_lingoly_evaluation.json +30 -0
  1416. wisent/examples/scripts/results/test_lingoly_pairs.json +8 -0
  1417. wisent/examples/scripts/results/test_livecodebench_evaluation.json +30 -0
  1418. wisent/examples/scripts/results/test_livecodebench_pairs.json +8 -0
  1419. wisent/examples/scripts/results/test_livemathbench_cnmo_en_evaluation.json +30 -0
  1420. wisent/examples/scripts/results/test_livemathbench_cnmo_en_pairs.json +8 -0
  1421. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_evaluation.json +30 -0
  1422. wisent/examples/scripts/results/test_livemathbench_cnmo_zh_pairs.json +8 -0
  1423. wisent/examples/scripts/results/test_llama_evaluation.json +30 -0
  1424. wisent/examples/scripts/results/test_llama_pairs.json +8 -0
  1425. wisent/examples/scripts/results/test_logiqa2_evaluation.json +30 -0
  1426. wisent/examples/scripts/results/test_logiqa2_pairs.json +8 -0
  1427. wisent/examples/scripts/results/test_logiqa_evaluation.json +30 -0
  1428. wisent/examples/scripts/results/test_logiqa_pairs.json +8 -0
  1429. wisent/examples/scripts/results/test_m_mmlu_evaluation.json +51 -0
  1430. wisent/examples/scripts/results/test_m_mmlu_pairs.json +14 -0
  1431. wisent/examples/scripts/results/test_mastermind/test_mastermind_evaluation.json +51 -0
  1432. wisent/examples/scripts/results/test_mastermind/test_mastermind_pairs.json +14 -0
  1433. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_evaluation.json +30 -0
  1434. wisent/examples/scripts/results/test_mastermind_24_easy/test_mastermind_24_easy_pairs.json +8 -0
  1435. wisent/examples/scripts/results/test_mastermind_evaluation.json +51 -0
  1436. wisent/examples/scripts/results/test_mastermind_pairs.json +14 -0
  1437. wisent/examples/scripts/results/test_math500_evaluation.json +30 -0
  1438. wisent/examples/scripts/results/test_math500_pairs.json +8 -0
  1439. wisent/examples/scripts/results/test_math_evaluation.json +30 -0
  1440. wisent/examples/scripts/results/test_math_pairs.json +8 -0
  1441. wisent/examples/scripts/results/test_mathqa_evaluation.json +30 -0
  1442. wisent/examples/scripts/results/test_mathqa_pairs.json +8 -0
  1443. wisent/examples/scripts/results/test_mbpp_evaluation.json +30 -0
  1444. wisent/examples/scripts/results/test_mbpp_pairs.json +8 -0
  1445. wisent/examples/scripts/results/test_mbpp_plus_evaluation.json +30 -0
  1446. wisent/examples/scripts/results/test_mbpp_plus_pairs.json +8 -0
  1447. wisent/examples/scripts/results/test_mc_taco_evaluation.json +30 -0
  1448. wisent/examples/scripts/results/test_mc_taco_pairs.json +8 -0
  1449. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_evaluation.json +51 -0
  1450. wisent/examples/scripts/results/test_med_concepts_qa/test_med_concepts_qa_pairs.json +14 -0
  1451. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_evaluation.json +30 -0
  1452. wisent/examples/scripts/results/test_med_concepts_qa_atc_easy/test_med_concepts_qa_atc_easy_pairs.json +8 -0
  1453. wisent/examples/scripts/results/test_med_concepts_qa_evaluation.json +51 -0
  1454. wisent/examples/scripts/results/test_med_concepts_qa_pairs.json +14 -0
  1455. wisent/examples/scripts/results/test_meddialog_evaluation.json +30 -0
  1456. wisent/examples/scripts/results/test_meddialog_pairs.json +8 -0
  1457. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_evaluation.json +30 -0
  1458. wisent/examples/scripts/results/test_meddialog_raw_perplexity/test_meddialog_raw_perplexity_pairs.json +8 -0
  1459. wisent/examples/scripts/results/test_mediqa_qa2019_evaluation.json +30 -0
  1460. wisent/examples/scripts/results/test_mediqa_qa2019_pairs.json +8 -0
  1461. wisent/examples/scripts/results/test_medmcqa_evaluation.json +30 -0
  1462. wisent/examples/scripts/results/test_medmcqa_pairs.json +8 -0
  1463. wisent/examples/scripts/results/test_medqa_evaluation.json +30 -0
  1464. wisent/examples/scripts/results/test_medqa_pairs.json +8 -0
  1465. wisent/examples/scripts/results/test_medtext_evaluation.json +30 -0
  1466. wisent/examples/scripts/results/test_medtext_pairs.json +8 -0
  1467. wisent/examples/scripts/results/test_mela_evaluation.json +51 -0
  1468. wisent/examples/scripts/results/test_mela_pairs.json +14 -0
  1469. wisent/examples/scripts/results/test_meqsum_evaluation.json +30 -0
  1470. wisent/examples/scripts/results/test_meqsum_pairs.json +8 -0
  1471. wisent/examples/scripts/results/test_mercury_evaluation.json +30 -0
  1472. wisent/examples/scripts/results/test_mercury_pairs.json +8 -0
  1473. wisent/examples/scripts/results/test_metabench_evaluation.json +51 -0
  1474. wisent/examples/scripts/results/test_metabench_pairs.json +14 -0
  1475. wisent/examples/scripts/results/test_mgsm_evaluation.json +51 -0
  1476. wisent/examples/scripts/results/test_mgsm_pairs.json +14 -0
  1477. wisent/examples/scripts/results/test_mimic_repsum_evaluation.json +30 -0
  1478. wisent/examples/scripts/results/test_mimic_repsum_pairs.json +8 -0
  1479. wisent/examples/scripts/results/test_minerva_math_evaluation.json +51 -0
  1480. wisent/examples/scripts/results/test_minerva_math_pairs.json +14 -0
  1481. wisent/examples/scripts/results/test_mlqa_evaluation.json +51 -0
  1482. wisent/examples/scripts/results/test_mlqa_pairs.json +14 -0
  1483. wisent/examples/scripts/results/test_mmlu-pro-plus_evaluation.json +51 -0
  1484. wisent/examples/scripts/results/test_mmlu-pro-plus_pairs.json +14 -0
  1485. wisent/examples/scripts/results/test_mmlu_evaluation.json +51 -0
  1486. wisent/examples/scripts/results/test_mmlu_pairs.json +14 -0
  1487. wisent/examples/scripts/results/test_mmlu_pro_evaluation.json +51 -0
  1488. wisent/examples/scripts/results/test_mmlu_pro_pairs.json +14 -0
  1489. wisent/examples/scripts/results/test_mmlu_prox_evaluation.json +51 -0
  1490. wisent/examples/scripts/results/test_mmlu_prox_pairs.json +14 -0
  1491. wisent/examples/scripts/results/test_mmlusr_evaluation.json +30 -0
  1492. wisent/examples/scripts/results/test_mmlusr_pairs.json +8 -0
  1493. wisent/examples/scripts/results/test_mmmu_evaluation.json +51 -0
  1494. wisent/examples/scripts/results/test_mmmu_pairs.json +14 -0
  1495. wisent/examples/scripts/results/test_mnli_evaluation.json +30 -0
  1496. wisent/examples/scripts/results/test_mnli_pairs.json +8 -0
  1497. wisent/examples/scripts/results/test_model_written_evals_evaluation.json +51 -0
  1498. wisent/examples/scripts/results/test_model_written_evals_pairs.json +14 -0
  1499. wisent/examples/scripts/results/test_moral_stories_evaluation.json +30 -0
  1500. wisent/examples/scripts/results/test_moral_stories_pairs.json +8 -0
  1501. wisent/examples/scripts/results/test_mts_dialog_evaluation.json +30 -0
  1502. wisent/examples/scripts/results/test_mts_dialog_pairs.json +8 -0
  1503. wisent/examples/scripts/results/test_multiblimp_evaluation.json +51 -0
  1504. wisent/examples/scripts/results/test_multiblimp_pairs.json +14 -0
  1505. wisent/examples/scripts/results/test_multimedqa_evaluation.json +51 -0
  1506. wisent/examples/scripts/results/test_multimedqa_pairs.json +14 -0
  1507. wisent/examples/scripts/results/test_multipl_e_evaluation.json +30 -0
  1508. wisent/examples/scripts/results/test_multipl_e_pairs.json +8 -0
  1509. wisent/examples/scripts/results/test_mutual_evaluation.json +30 -0
  1510. wisent/examples/scripts/results/test_mutual_pairs.json +8 -0
  1511. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1512. wisent/examples/scripts/results/test_non_greedy_robustness_agieval_aqua_rat_pairs.json +8 -0
  1513. wisent/examples/scripts/results/test_noreval_evaluation.json +51 -0
  1514. wisent/examples/scripts/results/test_noreval_pairs.json +14 -0
  1515. wisent/examples/scripts/results/test_noticia_evaluation.json +30 -0
  1516. wisent/examples/scripts/results/test_noticia_pairs.json +8 -0
  1517. wisent/examples/scripts/results/test_nq_open_evaluation.json +30 -0
  1518. wisent/examples/scripts/results/test_nq_open_pairs.json +8 -0
  1519. wisent/examples/scripts/results/test_olaph_evaluation.json +30 -0
  1520. wisent/examples/scripts/results/test_olaph_pairs.json +8 -0
  1521. wisent/examples/scripts/results/test_openbookqa_evaluation.json +30 -0
  1522. wisent/examples/scripts/results/test_openbookqa_pairs.json +8 -0
  1523. wisent/examples/scripts/results/test_openllm_evaluation.json +51 -0
  1524. wisent/examples/scripts/results/test_openllm_pairs.json +14 -0
  1525. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1526. wisent/examples/scripts/results/test_option_order_robustness_agieval_aqua_rat_pairs.json +8 -0
  1527. wisent/examples/scripts/results/test_paloma_evaluation.json +51 -0
  1528. wisent/examples/scripts/results/test_paloma_pairs.json +14 -0
  1529. wisent/examples/scripts/results/test_passkey/test_passkey_evaluation.json +30 -0
  1530. wisent/examples/scripts/results/test_passkey/test_passkey_pairs.json +8 -0
  1531. wisent/examples/scripts/results/test_paws-x_evaluation.json +51 -0
  1532. wisent/examples/scripts/results/test_paws-x_pairs.json +14 -0
  1533. wisent/examples/scripts/results/test_paws_en/test_paws_en_evaluation.json +30 -0
  1534. wisent/examples/scripts/results/test_paws_en/test_paws_en_pairs.json +8 -0
  1535. wisent/examples/scripts/results/test_penn_treebank_evaluation.json +30 -0
  1536. wisent/examples/scripts/results/test_penn_treebank_pairs.json +8 -0
  1537. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_evaluation.json +30 -0
  1538. wisent/examples/scripts/results/test_pile_10k/test_pile_10k_pairs.json +8 -0
  1539. wisent/examples/scripts/results/test_piqa_evaluation.json +30 -0
  1540. wisent/examples/scripts/results/test_piqa_pairs.json +8 -0
  1541. wisent/examples/scripts/results/test_polemo2_evaluation.json +30 -0
  1542. wisent/examples/scripts/results/test_polemo2_pairs.json +8 -0
  1543. wisent/examples/scripts/results/test_polymath_en_high_evaluation.json +30 -0
  1544. wisent/examples/scripts/results/test_polymath_en_high_pairs.json +8 -0
  1545. wisent/examples/scripts/results/test_polymath_en_medium_evaluation.json +30 -0
  1546. wisent/examples/scripts/results/test_polymath_en_medium_pairs.json +8 -0
  1547. wisent/examples/scripts/results/test_polymath_zh_high_evaluation.json +30 -0
  1548. wisent/examples/scripts/results/test_polymath_zh_high_pairs.json +8 -0
  1549. wisent/examples/scripts/results/test_polymath_zh_medium_evaluation.json +30 -0
  1550. wisent/examples/scripts/results/test_polymath_zh_medium_pairs.json +8 -0
  1551. wisent/examples/scripts/results/test_portuguese_bench_evaluation.json +51 -0
  1552. wisent/examples/scripts/results/test_portuguese_bench_pairs.json +14 -0
  1553. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1554. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1555. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_evaluation.json +30 -0
  1556. wisent/examples/scripts/results/test_prompt_robustness_agieval_aqua_rat_pairs.json +8 -0
  1557. wisent/examples/scripts/results/test_prost_evaluation.json +30 -0
  1558. wisent/examples/scripts/results/test_prost_pairs.json +8 -0
  1559. wisent/examples/scripts/results/test_ptb_evaluation.json +30 -0
  1560. wisent/examples/scripts/results/test_ptb_pairs.json +8 -0
  1561. wisent/examples/scripts/results/test_pubmedqa_evaluation.json +30 -0
  1562. wisent/examples/scripts/results/test_pubmedqa_pairs.json +8 -0
  1563. wisent/examples/scripts/results/test_pythia_evaluation.json +51 -0
  1564. wisent/examples/scripts/results/test_pythia_pairs.json +14 -0
  1565. wisent/examples/scripts/results/test_qa4mre_evaluation.json +30 -0
  1566. wisent/examples/scripts/results/test_qa4mre_pairs.json +8 -0
  1567. wisent/examples/scripts/results/test_qasper_evaluation.json +30 -0
  1568. wisent/examples/scripts/results/test_qasper_pairs.json +8 -0
  1569. wisent/examples/scripts/results/test_race_evaluation.json +30 -0
  1570. wisent/examples/scripts/results/test_race_pairs.json +8 -0
  1571. wisent/examples/scripts/results/test_realtoxicityprompts_evaluation.json +30 -0
  1572. wisent/examples/scripts/results/test_realtoxicityprompts_pairs.json +8 -0
  1573. wisent/examples/scripts/results/test_recode_evaluation.json +30 -0
  1574. wisent/examples/scripts/results/test_recode_pairs.json +8 -0
  1575. wisent/examples/scripts/results/test_record_evaluation.json +30 -0
  1576. wisent/examples/scripts/results/test_record_pairs.json +8 -0
  1577. wisent/examples/scripts/results/test_ruler_evaluation.json +51 -0
  1578. wisent/examples/scripts/results/test_ruler_pairs.json +14 -0
  1579. wisent/examples/scripts/results/test_sciq_evaluation.json +30 -0
  1580. wisent/examples/scripts/results/test_sciq_pairs.json +8 -0
  1581. wisent/examples/scripts/results/test_score_evaluation.json +51 -0
  1582. wisent/examples/scripts/results/test_score_pairs.json +14 -0
  1583. wisent/examples/scripts/results/test_self_consistency_evaluation.json +30 -0
  1584. wisent/examples/scripts/results/test_self_consistency_pairs.json +8 -0
  1585. wisent/examples/scripts/results/test_siqa/test_siqa_evaluation.json +30 -0
  1586. wisent/examples/scripts/results/test_siqa/test_siqa_pairs.json +8 -0
  1587. wisent/examples/scripts/results/test_siqa_evaluation.json +30 -0
  1588. wisent/examples/scripts/results/test_siqa_pairs.json +8 -0
  1589. wisent/examples/scripts/results/test_spanish_bench_evaluation.json +51 -0
  1590. wisent/examples/scripts/results/test_spanish_bench_pairs.json +14 -0
  1591. wisent/examples/scripts/results/test_squad2_evaluation.json +30 -0
  1592. wisent/examples/scripts/results/test_squad2_pairs.json +8 -0
  1593. wisent/examples/scripts/results/test_squadv2_evaluation.json +30 -0
  1594. wisent/examples/scripts/results/test_squadv2_pairs.json +8 -0
  1595. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_evaluation.json +30 -0
  1596. wisent/examples/scripts/results/test_super-glue-lm-eval-v1-seq2seq_pairs.json +8 -0
  1597. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_evaluation.json +51 -0
  1598. wisent/examples/scripts/results/test_super-glue-lm-eval-v1_pairs.json +14 -0
  1599. wisent/examples/scripts/results/test_swag_evaluation.json +30 -0
  1600. wisent/examples/scripts/results/test_swag_pairs.json +8 -0
  1601. wisent/examples/scripts/results/test_tinyBenchmarks_evaluation.json +51 -0
  1602. wisent/examples/scripts/results/test_tinyBenchmarks_pairs.json +14 -0
  1603. wisent/examples/scripts/results/test_tmmluplus_evaluation.json +51 -0
  1604. wisent/examples/scripts/results/test_tmmluplus_pairs.json +14 -0
  1605. wisent/examples/scripts/results/test_translation_evaluation.json +51 -0
  1606. wisent/examples/scripts/results/test_translation_pairs.json +14 -0
  1607. wisent/examples/scripts/results/test_triviaqa_evaluation.json +30 -0
  1608. wisent/examples/scripts/results/test_triviaqa_pairs.json +8 -0
  1609. wisent/examples/scripts/results/test_truthfulqa-multi_evaluation.json +51 -0
  1610. wisent/examples/scripts/results/test_truthfulqa-multi_pairs.json +14 -0
  1611. wisent/examples/scripts/results/test_truthfulqa_evaluation.json +30 -0
  1612. wisent/examples/scripts/results/test_truthfulqa_mc1_evaluation.json +30 -0
  1613. wisent/examples/scripts/results/test_truthfulqa_mc1_pairs.json +8 -0
  1614. wisent/examples/scripts/results/test_truthfulqa_mc2_evaluation.json +30 -0
  1615. wisent/examples/scripts/results/test_truthfulqa_mc2_pairs.json +8 -0
  1616. wisent/examples/scripts/results/test_truthfulqa_pairs.json +8 -0
  1617. wisent/examples/scripts/results/test_turkishmmlu_evaluation.json +51 -0
  1618. wisent/examples/scripts/results/test_turkishmmlu_pairs.json +14 -0
  1619. wisent/examples/scripts/results/test_unfair_tos_evaluation.json +30 -0
  1620. wisent/examples/scripts/results/test_unfair_tos_pairs.json +8 -0
  1621. wisent/examples/scripts/results/test_unscramble_evaluation.json +51 -0
  1622. wisent/examples/scripts/results/test_unscramble_pairs.json +14 -0
  1623. wisent/examples/scripts/results/test_webqs_evaluation.json +30 -0
  1624. wisent/examples/scripts/results/test_webqs_pairs.json +8 -0
  1625. wisent/examples/scripts/results/test_wikitext103_evaluation.json +30 -0
  1626. wisent/examples/scripts/results/test_wikitext103_pairs.json +8 -0
  1627. wisent/examples/scripts/results/test_wikitext_evaluation.json +30 -0
  1628. wisent/examples/scripts/results/test_wikitext_pairs.json +8 -0
  1629. wisent/examples/scripts/results/test_winogender_evaluation.json +51 -0
  1630. wisent/examples/scripts/results/test_winogender_pairs.json +14 -0
  1631. wisent/examples/scripts/results/test_winogrande_evaluation.json +30 -0
  1632. wisent/examples/scripts/results/test_winogrande_pairs.json +8 -0
  1633. wisent/examples/scripts/results/test_wmdp_evaluation.json +30 -0
  1634. wisent/examples/scripts/results/test_wmdp_pairs.json +8 -0
  1635. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_evaluation.json +30 -0
  1636. wisent/examples/scripts/results/test_wmt-ro-en-t5-prompt_pairs.json +8 -0
  1637. wisent/examples/scripts/results/test_wmt14_en_fr_evaluation.json +30 -0
  1638. wisent/examples/scripts/results/test_wmt14_en_fr_pairs.json +8 -0
  1639. wisent/examples/scripts/results/test_wmt16_en_de_evaluation.json +30 -0
  1640. wisent/examples/scripts/results/test_wmt16_en_de_pairs.json +8 -0
  1641. wisent/examples/scripts/results/test_wmt16_ro_en_evaluation.json +30 -0
  1642. wisent/examples/scripts/results/test_wmt16_ro_en_pairs.json +8 -0
  1643. wisent/examples/scripts/results/test_wsc273_evaluation.json +30 -0
  1644. wisent/examples/scripts/results/test_wsc273_pairs.json +8 -0
  1645. wisent/examples/scripts/results/test_xcopa_evaluation.json +51 -0
  1646. wisent/examples/scripts/results/test_xcopa_pairs.json +14 -0
  1647. wisent/examples/scripts/results/test_xnli_eu_evaluation.json +30 -0
  1648. wisent/examples/scripts/results/test_xnli_eu_pairs.json +8 -0
  1649. wisent/examples/scripts/results/test_xnli_evaluation.json +51 -0
  1650. wisent/examples/scripts/results/test_xnli_pairs.json +14 -0
  1651. wisent/examples/scripts/results/test_xquad_evaluation.json +51 -0
  1652. wisent/examples/scripts/results/test_xquad_pairs.json +14 -0
  1653. wisent/examples/scripts/results/test_xstorycloze_evaluation.json +51 -0
  1654. wisent/examples/scripts/results/test_xstorycloze_pairs.json +14 -0
  1655. wisent/examples/scripts/results/test_xsum_evaluation.json +30 -0
  1656. wisent/examples/scripts/results/test_xsum_pairs.json +8 -0
  1657. wisent/examples/scripts/results/test_xwinograd_evaluation.json +51 -0
  1658. wisent/examples/scripts/results/test_xwinograd_pairs.json +14 -0
  1659. wisent/examples/scripts/results/test_yahoo_answers_topics_evaluation.json +30 -0
  1660. wisent/examples/scripts/results/test_yahoo_answers_topics_pairs.json +8 -0
  1661. wisent/parameters/__init__.py +1 -0
  1662. wisent/parameters/lm_eval/all_lm_eval_task_families.json +169 -0
  1663. wisent/parameters/lm_eval/broken_in_lm_eval.json +10 -0
  1664. wisent/parameters/lm_eval/evaluations_not_lm_eval_tasks.json +0 -0
  1665. wisent/parameters/lm_eval/evaluator_check.json +3476 -0
  1666. wisent/parameters/lm_eval/final_verification.json +24782 -0
  1667. wisent/parameters/lm_eval/group_task_evaluators.json +1833 -0
  1668. wisent/parameters/lm_eval/group_tasks.json +150 -0
  1669. wisent/parameters/lm_eval/individual_tasks.json +402 -0
  1670. wisent/parameters/lm_eval/no_readmes.json +1 -0
  1671. wisent/parameters/lm_eval/not_lm_eval_tasks.json +110 -0
  1672. wisent/parameters/lm_eval/read_tasks.json +208 -0
  1673. wisent/parameters/lm_eval/readme_files.json +208 -0
  1674. wisent/parameters/lm_eval/track_progress_not_lm_eval_tasks.json +128 -0
  1675. wisent/parameters/tasks/missing_task_families.json +2963 -0
  1676. wisent/parameters/tasks/remaining_tasks_to_implement.json +199 -0
  1677. wisent/parameters/tasks/risks.json +10 -0
  1678. wisent/parameters/tasks/skills.json +14 -0
  1679. wisent/parameters/tasks/tasks.json +56031 -0
  1680. wisent/scripts/run_quality_metrics_sweep.sh +315 -0
  1681. wisent/tests/__init__.py +0 -0
  1682. wisent/tests/examples/__init__.py +0 -0
  1683. wisent/tests/examples/cli/__init__.py +0 -0
  1684. wisent/tests/examples/cli/activations/__init__.py +0 -0
  1685. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  1686. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  1687. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  1688. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  1689. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  1690. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  1691. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  1692. wisent/tests/examples/cli/generate/__init__.py +0 -0
  1693. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  1694. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  1695. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  1696. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  1697. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  1698. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  1699. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  1700. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  1701. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  1702. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  1703. wisent/tests/examples/cli/steering/__init__.py +0 -0
  1704. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  1705. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  1706. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  1707. wisent/tests/nosense/__init__.py +6 -0
  1708. wisent/tests/nosense/base_nosense.py +81 -0
  1709. wisent/tests/nosense/math500_nosense.py +72 -0
  1710. wisent/tests/nosense/test_robustness.py +336 -0
  1711. wisent/tests/test_all_cli_commands.py +674 -0
  1712. wisent/tests/test_geometry_comprehensive.py +327 -0
  1713. wisent/tests/test_titan_geometry.py +257 -0
  1714. wisent/tests/visualize_geometry.py +148 -0
  1715. wisent-0.7.379.dist-info/METADATA +64 -0
  1716. wisent-0.7.379.dist-info/RECORD +1720 -0
  1717. wisent-0.7.379.dist-info/WHEEL +5 -0
  1718. wisent-0.7.379.dist-info/entry_points.txt +2 -0
  1719. wisent-0.7.379.dist-info/licenses/LICENSE +21 -0
  1720. wisent-0.7.379.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1453 @@
1
+ """
2
+ Task Manager for lm-evaluation-harness integration.
3
+
4
+ This module handles discovery, validation, and loading of tasks from the
5
+ lm-evaluation-harness library.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import re
11
+ import random
12
+ import yaml
13
+ import tempfile
14
+ import glob
15
+ from typing import List, Dict, Any, Optional, Tuple
16
+ from difflib import SequenceMatcher
17
+
18
+ from wisent.core.errors import TaskLoadError, TaskNotFoundError, NoDocsAvailableError
19
+
20
+
21
def _parse_lm_eval_task_table(stdout: str) -> List[str]:
    """Extract task names from the table printed by ``lm_eval --tasks list``.

    Only lines containing ``|`` are considered; separator rows (``|---``) and
    header rows (those mentioning ``Group`` or ``Config Location``) are skipped.
    The task name is the text of the first cell after the leading ``|``.
    """
    names = []
    for line in stdout.split('\n'):
        if '|' not in line or line.startswith('|---'):
            continue
        if 'Group' in line or 'Config Location' in line:
            continue
        task_name = line.split('|')[1].strip()
        if task_name and not task_name.startswith('-') and task_name != 'Group':
            names.append(task_name)
    return names


def load_available_tasks() -> List[str]:
    """Return the names of all known evaluation tasks.

    Sources are tried in order; the first one that produces a result wins:

    1. A local ``tasks.json`` (its non-empty ``task_list`` array, otherwise
       the keys of its ``tasks`` mapping).
    2. ``lm_eval.api.registry.ALL_TASKS``.
    3. The table printed by the ``lm_eval --tasks list`` command
       (only if step 2 failed with ``ImportError``).
    4. ``lm_eval.api.registry.TASK_REGISTRY``.
    5. Non-package, non-underscore module names directly under
       ``lm_eval.tasks``.

    Returns:
        List of task names. May be empty when the CLI fallback runs but its
        output contains no table rows.

    Raises:
        TaskLoadError: If the ``tasks.json``, registry and CLI sources fail
            and introspection of the ``lm_eval`` package fails as well.
    """
    module_dir = os.path.dirname(__file__)

    # 1. Local tasks.json. Any problem here is reported and then ignored so
    #    that the dynamic fallbacks below still get a chance.
    try:
        tasks_json_path = os.path.join(
            module_dir, "..", "..", "..", "..", "parameters", "tasks", "tasks.json"
        )
        if not os.path.exists(tasks_json_path):
            # Try alternative path
            tasks_json_path = os.path.join(module_dir, "..", "..", "tasks.json")

        if os.path.exists(tasks_json_path):
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(tasks_json_path, 'r', encoding='utf-8') as f:
                tasks_data = json.load(f)
            if 'task_list' in tasks_data and tasks_data['task_list']:
                print(f"Loaded {len(tasks_data['task_list'])} tasks from local tasks.json")
                return tasks_data['task_list']
            elif 'tasks' in tasks_data:
                task_names = list(tasks_data['tasks'].keys())
                print(f"Loaded {len(task_names)} tasks from local tasks.json")
                return task_names
    except Exception as e:
        print(f"Warning: Could not load from local tasks.json: {e}")

    # 2. lm-eval registry.
    try:
        from lm_eval.api.registry import ALL_TASKS
        return list(ALL_TASKS)
    except ImportError:
        pass

    # 3. Ask the lm_eval command line tool.
    try:
        import subprocess
        result = subprocess.run(['lm_eval', '--tasks', 'list'],
                                capture_output=True, text=True, timeout=30)
        return _parse_lm_eval_task_table(result.stdout)
    except Exception:
        pass

    # 4./5. Introspect the installed lm_eval package.
    try:
        import lm_eval.tasks as tasks_pkg

        try:
            import lm_eval.tasks.openbookqa  # Import a known task module to trigger loading
            from lm_eval.api.registry import TASK_REGISTRY
            return list(TASK_REGISTRY.keys())
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit. Fall through to the module scan.
            pass

        # Last resort - scan lm_eval.tasks for modules
        import pkgutil
        return [
            modname
            for _, modname, ispkg in pkgutil.iter_modules(tasks_pkg.__path__)
            if not ispkg and not modname.startswith('_')
        ]
    except Exception as e:
        raise TaskLoadError(
            task_name="lm-eval task discovery",
            cause=e
        )
99
+
100
+
101
+ def load_docs(task, limit: Optional[int] = None) -> List[Dict[str, Any]]:
102
+ """
103
+ Load documents from the most appropriate split (validation → test → train → fewshot).
104
+
105
+ Args:
106
+ task: Task object from lm_eval
107
+ limit: Optional limit on number of documents to load
108
+
109
+ Returns:
110
+ List of documents from the most appropriate split
111
+ """
112
+ docs = []
113
+
114
+ # Try different doc sources in order of preference
115
+ if task.has_validation_docs():
116
+ docs = list(task.validation_docs())
117
+ elif task.has_test_docs():
118
+ docs = list(task.test_docs())
119
+ elif task.has_training_docs():
120
+ docs = list(task.training_docs())
121
+ elif hasattr(task, 'has_fewshot_docs') and task.has_fewshot_docs():
122
+ docs = list(task.fewshot_docs())
123
+ else:
124
+ # For tasks that use fewshot_split (like MMMLU), try to load from dataset directly
125
+ if hasattr(task, 'dataset') and hasattr(task, 'fewshot_split'):
126
+ try:
127
+ from datasets import load_dataset
128
+ dataset = load_dataset(
129
+ task.dataset_path if hasattr(task, 'dataset_path') else task.dataset_name,
130
+ task.dataset_config_name if hasattr(task, 'dataset_config_name') else None,
131
+ split=task.fewshot_split
132
+ )
133
+ docs = [dict(item) for item in dataset]
134
+ except Exception as e:
135
+ raise NoDocsAvailableError(task_name=task.NAME)
136
+ else:
137
+ raise NoDocsAvailableError(task_name=task.NAME)
138
+
139
+ if limit is not None and limit > 0:
140
+ docs = docs[:limit]
141
+
142
+ return docs
143
+
144
+
145
def _task_yields_documents(task) -> bool:
    """Return True if the first available split of *task* yields a document.

    Splits are checked in the order validation, test, training. The first
    split whose ``has_<split>_docs()`` predicate is true decides the result;
    only its first document is requested, so large splits are not loaded in
    full just to prove they are non-empty.
    """
    nothing = object()
    for split in ('validation', 'test', 'training'):
        has_split = getattr(task, f'has_{split}_docs', None)
        load_split = getattr(task, f'{split}_docs', None)
        if has_split is None or load_split is None:
            continue
        if has_split():
            return next(iter(load_split()), nothing) is not nothing
    return False


def find_working_task_from_group(group_dict, max_depth=3, current_depth=0):
    """
    Recursively search through nested ConfigurableGroup structures to find a working individual task.

    Direct members of the group are tried first. Nested groups (mapping-like
    values whose key type name contains ``ConfigurableGroup``) are only
    descended into when no direct member works.

    Args:
        group_dict: Dictionary-like ConfigurableGroup object or regular dict
        max_depth: Maximum recursion depth to prevent infinite loops
        current_depth: Current recursion depth

    Returns:
        Tuple of (task_object, task_name) or (None, None) if no working task found.
        Note that the "not found" result is a 2-tuple and therefore truthy;
        callers must test the first element against ``None``.
    """
    if current_depth >= max_depth:
        return None, None

    try:
        # Materialise once so that a one-shot ``items()`` iterator cannot
        # leave the nested-group pass with nothing to look at.
        entries = list(group_dict.items()) if hasattr(group_dict, 'items') else []

        nested_groups = []
        for key, value in entries:
            # Nested groups are deferred until all direct members were tried.
            if hasattr(value, 'items') and 'ConfigurableGroup' in str(type(key)):
                nested_groups.append(value)
                continue

            try:
                if _task_yields_documents(value):
                    return value, str(key)
            except Exception:
                # This task doesn't work, try next one
                continue

        # If no individual tasks worked, try nested groups
        for nested in nested_groups:
            result_task, result_name = find_working_task_from_group(nested, max_depth, current_depth + 1)
            if result_task is not None:
                return result_task, result_name

        return None, None

    except Exception as e:
        print(f"Error exploring group: {e}")
        return None, None
210
+
211
+
212
def _load_task_from_custom_yaml(task_name: str):
    """Try to load *task_name* from a custom YAML task configuration.

    ``flan_held_in`` gets its YAML files generated on demand; any other task is
    looked up as ``<task_name>.yaml`` / ``<task_name>.yml`` in a few directories
    relative to the current working directory.

    Returns:
        Tuple of (task_object, task_name) on success, otherwise ``None``.
        Loader errors are printed and never propagate.
    """
    print(f" 🔍 Searching for custom YAML configuration for {task_name}")

    # For specific custom tasks like flan_held_in, create the YAML files if needed
    if task_name == "flan_held_in":
        try:
            yaml_file_path = create_flan_held_in_files()
            if yaml_file_path:
                config_dir = os.path.dirname(yaml_file_path)
                print(f" 🔍 Loading flan_held_in from: {config_dir}")

                # Load using the proper config directory approach
                task_dict = load_task_with_config_dir(task_name, config_dir)
                if task_name in task_dict:
                    print(f" ✅ Successfully loaded {task_name}")
                    return task_dict[task_name], task_name

                # If the group task doesn't load directly, try to extract individual tasks
                print(f" 🔍 Extracting individual tasks from group...")
                individual_tasks = extract_individual_tasks_from_yaml(yaml_file_path, task_name)
                if individual_tasks:
                    print(f" 📋 Found individual tasks: {individual_tasks[:3]}...")

                    for extracted_task_name in individual_tasks:
                        try:
                            individual_dict = load_task_with_config_dir(extracted_task_name, config_dir)
                            if extracted_task_name in individual_dict:
                                print(f" ✅ Successfully loaded individual task: {extracted_task_name}")
                                return individual_dict[extracted_task_name], extracted_task_name
                        except Exception as e:
                            print(f" ❌ Failed to load {extracted_task_name}: {str(e)[:50]}")
                            continue
        except Exception as e:
            print(f" ❌ Failed to load flan_held_in: {e}")

    # Generic approach for other custom tasks:
    # look for existing YAML files in common directories
    search_dirs = [
        "wisent/parameters/tasks",
        ".",
        "tasks",
        "configs"
    ]
    for search_dir in search_dirs:
        for extension in (".yaml", ".yml"):
            yaml_file = os.path.join(search_dir, f"{task_name}{extension}")
            if not os.path.exists(yaml_file):
                continue

            print(f" 🔍 Found YAML file: {yaml_file}")
            config_dir = os.path.dirname(yaml_file)
            try:
                task_dict = load_task_with_config_dir(task_name, config_dir)
                if task_name in task_dict:
                    print(f" ✅ Successfully loaded {task_name}")
                    return task_dict[task_name], task_name
            except Exception as e:
                print(f" ❌ Failed to load from {yaml_file}: {str(e)[:100]}")

    return None


def handle_configurable_group_task(task_name: str):
    """
    Consolidated function to handle ConfigurableGroup tasks for both CLI and processing scripts.

    Resolution order:

    1. Load ``task_name`` directly through ``lm_eval.tasks.get_task_dict``.
    2. If the name is a registered group, extract a working member task via
       ``try_extract_working_tasks_from_group``.
    3. If the name is a registered individual task (which failed to load in
       step 1), look for a related working task.
    4. If the name is not registered at all, look for a custom YAML
       configuration, then fall back to the related-task search.

    NOTE(review): earlier revisions placed the custom-YAML lookup (and a
    second copy of the registry check plus a direct-load validation) *after*
    a try/except in which every path returned, so none of that code could
    ever run. The YAML lookup is now reachable in step 4; the duplicated
    tail has been removed because step 1-3 already cover it.

    Args:
        task_name: Name of the potentially problematic group task

    Returns:
        Tuple of (working_task_object, actual_task_name) when a task could be
        loaded. ``None`` when ``task_name`` is a registered group without any
        working member. In the remaining cases the result of
        ``try_find_related_working_task`` is returned unchanged.

    Raises:
        ImportError: If lm-evaluation-harness is not installed.
    """
    try:
        from lm_eval.tasks import get_task_dict
    except ImportError as e:
        raise ImportError("lm-evaluation-harness is required. Install with: pip install lm-eval") from e

    print(f"🔍 Loading task: {task_name}")

    # 1. First, try to load the task normally from the registry
    try:
        # Initialize TaskManager to ensure registry is populated
        from lm_eval.tasks import TaskManager as LMTaskManager
        task_manager = LMTaskManager()
        task_manager.initialize_tasks()

        task_dict = get_task_dict([task_name], task_manager=task_manager)
        if task_name in task_dict:
            print(f" ✅ Found {task_name} in registry")
            return task_dict[task_name], task_name
    except Exception as e:
        print(f" ⚠️ Registry loading failed: {e}")

    # 2./3. Check if the task exists in the registry but has loading issues
    try:
        from lm_eval.tasks import TaskManager as LMTaskManager
        task_manager = LMTaskManager()
        task_manager.initialize_tasks()

        # Check in both individual tasks and groups
        all_tasks = getattr(task_manager, 'all_tasks', set())
        all_groups = getattr(task_manager, 'all_groups', set())

        print(f" 📊 Registry check: {len(all_tasks)} tasks, {len(all_groups)} groups available")
        print(f" 🔍 Is '{task_name}' in groups? {task_name in all_groups}")
        print(f" 🔍 Is '{task_name}' in tasks? {task_name in all_tasks}")

        if task_name in all_tasks or task_name in all_groups:
            print(f" 🔍 Task {task_name} exists in registry but has loading issues")

            # For group tasks, try to extract individual working tasks
            if task_name in all_groups:
                print(f" 💡 Found {task_name} as a ConfigurableGroup - extracting individual tasks...")
                result = try_extract_working_tasks_from_group(task_name, task_manager)
                if result:
                    return result
                print(f" 💥 FAILED: Group {task_name} exists but no working tasks found!")
                return None

            # For individual tasks that fail loading, try aggressive search
            print(f" 💡 Found {task_name} as individual task - trying alternatives...")
            return try_find_related_working_task(task_name)

    except Exception as registry_error:
        print(f" ⚠️ Registry check failed: {registry_error}")
        # Still try aggressive search as fallback
        return try_find_related_working_task(task_name)

    # 4. Not registered: look for custom YAML configurations first
    custom_task = _load_task_from_custom_yaml(task_name)
    if custom_task is not None:
        return custom_task

    # If not found anywhere, try aggressive search
    print(f" 🔄 Task {task_name} not found in registry, trying alternatives...")
    return try_find_related_working_task(task_name)
467
+
468
+
469
def extract_individual_tasks_from_yaml(yaml_file: str, group_name: str, _visited_files=None) -> List[str]:
    """
    Extract individual task names from a YAML configuration file.
    This function handles nested groups by recursively resolving group names.

    Only values found under ``task`` keys are treated as task names (a
    top-level string or list of strings is accepted as well). Other scalar
    values such as ``group``, aliases, prompts or metric names are ignored;
    previously every string in the file was collected as a "task".

    Results keep the order in which names appear in the file, so repeated
    calls on the same file return the same list.

    Args:
        yaml_file: Path to the YAML file
        group_name: Name of the group we're looking for (currently unused,
            kept for interface compatibility)
        _visited_files: Set of already visited files to prevent infinite recursion

    Returns:
        List of individual task names found in the YAML (at most 10). An
        empty list is returned on a detected cycle or on any error, including
        a missing ``yaml`` module or an unreadable file.
    """
    try:
        import yaml

        # Initialize visited files set to prevent infinite recursion
        if _visited_files is None:
            _visited_files = set()

        # Check if we've already processed this file
        yaml_path_normalized = os.path.abspath(yaml_file)
        if yaml_path_normalized in _visited_files:
            print(f" 🔄 Cycle detected: {yaml_file} - skipping to prevent infinite recursion")
            return []

        _visited_files.add(yaml_path_normalized)

        with open(yaml_file, 'r', encoding='utf-8') as f:
            yaml_content = yaml.safe_load(f)

        collected = []

        def collect(node, in_task_context, depth=0):
            # ``depth`` counts mapping levels only; list nesting is free.
            if depth > 5:  # Prevent infinite recursion
                return

            if isinstance(node, dict):
                for key, value in node.items():
                    # Strings count as task names only directly below 'task';
                    # other keys are still searched for nested 'task' entries.
                    collect(value, key == 'task', depth + 1)
            elif isinstance(node, list):
                for item in node:
                    collect(item, in_task_context, depth)
            elif isinstance(node, str):
                if in_task_context and node:
                    collected.append(node)

        collect(yaml_content, True)

        # Remove duplicates while preserving file order. A plain set() would
        # make the truncation below pick a different subset from run to run.
        potential_tasks = list(dict.fromkeys(collected))

        print(f" 📋 Found potential tasks/groups: {potential_tasks[:5]}...")  # Limit output

        # Now we need to resolve any groups to their individual tasks
        resolved_tasks = []

        # Get the base directory for this YAML file to find related group files
        yaml_dir = os.path.dirname(yaml_file)

        # Limit to prevent excessive processing
        max_tasks_to_process = 5

        for task_name in potential_tasks[:max_tasks_to_process]:
            # First check if this looks like an individual task (has specific suffixes)
            if any(suffix in task_name for suffix in ['_zeroshot_', '_fewshot_', '_cot_', '_prompt-', '_task_']):
                resolved_tasks.append(task_name)
                continue

            if len(_visited_files) >= 3:
                # Max recursion depth reached, treat as individual task
                resolved_tasks.append(task_name)
                continue

            # Check if this is a known group that we should resolve
            potential_group_file = os.path.join(yaml_dir, f"{task_name}.yaml")
            if os.path.exists(potential_group_file):
                print(f" 🔍 Found nested group file: {os.path.basename(potential_group_file)}")
                # Recursively extract from this group
                nested_tasks = extract_individual_tasks_from_yaml(potential_group_file, task_name, _visited_files.copy())
                resolved_tasks.extend(nested_tasks[:3])  # Limit results
                continue

            # Check in subdirectories (common pattern)
            for subdir in ['zeroshot', 'fewshot', 'cot']:
                subdir_path = os.path.join(yaml_dir, task_name, subdir)
                if os.path.isdir(subdir_path):
                    subdir_yaml = os.path.join(subdir_path, f"_{task_name}_{subdir}.yaml")
                    if os.path.exists(subdir_yaml):
                        print(f" 🔍 Found nested group in subdir: {subdir}")
                        nested_tasks = extract_individual_tasks_from_yaml(subdir_yaml, f"{task_name}_{subdir}", _visited_files.copy())
                        resolved_tasks.extend(nested_tasks[:3])  # Limit results
                        break
            else:
                # Treat as individual task if we can't find a group file
                resolved_tasks.append(task_name)

        # Final cleanup - remove duplicates (order preserved) and limit results
        final_tasks = list(dict.fromkeys(resolved_tasks))[:10]  # Limit to 10 tasks max

        print(f" 📋 Extracted individual tasks from YAML: {final_tasks}")
        return final_tasks

    except Exception as e:
        print(f" ❌ Error extracting tasks from YAML {yaml_file}: {e}")
        return []
593
+
594
+
595
def try_find_related_working_task(task_name: str):
    """
    Search for a loadable task related to a task that failed to load.

    Candidate names are derived from ``task_name`` and handed, one at a time,
    to ``handle_configurable_group_task``; the result of the first call that
    does not raise is returned.  Strategies, in order:

    1. ``task_name`` with every ``'_group'`` occurrence removed.
    2. Progressively shorter ``'_'``-joined prefixes of ``task_name``.
    3. Up to 10 registered tasks/groups starting with ``"<first part>_"``.
    4. The first part itself, when it is a registered task or group.
    5. For each name part longer than two characters, up to 5 registered
       tasks/groups containing that part.

    Registry matches are tried in sorted order so repeated runs attempt the
    same candidates.

    Args:
        task_name: The problematic task name

    Returns:
        Whatever ``handle_configurable_group_task`` returns for the first
        working candidate (described by the original docstring as a
        ``(task_object, task_name)`` tuple), or None when no candidate loads
        or the search itself fails (e.g. lm_eval cannot be imported).
    """
    try:
        from lm_eval.tasks import TaskManager as LMTaskManager

        # Ensure TaskManager is properly initialized
        task_manager = LMTaskManager()
        task_manager.initialize_tasks()

        # The registry attributes may be lists or sets (or absent); normalise to sets.
        all_tasks = set(getattr(task_manager, 'all_tasks', ()))
        all_groups = set(getattr(task_manager, 'all_groups', ()))
        all_available_tasks = all_tasks | all_groups

        print(f" 📊 TaskManager has {len(all_tasks)} tasks, {len(all_groups)} groups")

        print(f" 🔄 AGGRESSIVE SEARCH for working alternatives to '{task_name}' ({len(all_available_tasks)} tasks available)...")

        # Strategy 1: Remove '_group' suffix
        if '_group' in task_name:
            base_name = task_name.replace('_group', '')
            print(f" 🎯 Trying base name: {base_name}")
            try:
                return handle_configurable_group_task(base_name)
            except Exception:
                # Best effort: fall through to the next strategy.
                pass

        # Strategy 2: Try progressively shorter prefixes
        parts = task_name.split('_')
        for i in range(len(parts) - 1, 0, -1):
            parent_name = '_'.join(parts[:i])
            print(f" 🎯 Trying parent: {parent_name}")
            try:
                return handle_configurable_group_task(parent_name)
            except Exception:
                continue

        # Strategy 3: Find ANY task with the same prefix (e.g., flan_held_in -> any flan_* task)
        prefix = parts[0]
        print(f" 🎯 Searching for ANY task starting with '{prefix}_'...")

        matching_tasks = sorted(
            t for t in all_available_tasks
            if t.startswith(prefix + '_') and t != task_name
        )

        # Try up to 10 matching tasks until we find one that works
        for candidate in matching_tasks[:10]:
            print(f" 🎯 Trying candidate: {candidate}")
            try:
                result = handle_configurable_group_task(candidate)
            except Exception:
                continue
            print(f" ✅ SUCCESS! Found working alternative: {candidate}")
            return result

        # Strategy 4: Try exact prefix match (e.g., flan_held_in -> flan)
        if prefix in all_available_tasks:
            print(f" 🎯 Trying exact prefix: {prefix}")
            try:
                return handle_configurable_group_task(prefix)
            except Exception:
                pass

        # Strategy 5: Find tasks with similar keywords
        keywords = [part for part in parts if len(part) > 2]  # Skip short parts
        for keyword in keywords:
            print(f" 🎯 Searching for tasks containing '{keyword}'...")
            keyword_tasks = sorted(
                t for t in all_available_tasks if keyword in t and t != task_name
            )

            for candidate in keyword_tasks[:5]:  # Try up to 5 per keyword
                print(f" 🎯 Trying keyword match: {candidate}")
                try:
                    result = handle_configurable_group_task(candidate)
                except Exception:
                    continue
                print(f" ✅ SUCCESS! Found working keyword match: {candidate}")
                return result

        # Every strategy failed; report it rather than substituting an unrelated task.
        print(f" 💥 FAILED TO FIND CORRECT TASK: {task_name} - NO RANDOM FALLBACKS ALLOWED!")
        return None

    except Exception as e:
        print(f" ❌ Search failed: {e}")
        return None
697
+
698
+
699
def _task_names_from_group_yaml(yaml_content) -> list:
    """Collect task-like names from a parsed group YAML mapping, in first-seen order."""
    names = []

    def remember(name):
        if name not in names:
            names.append(name)

    if not isinstance(yaml_content, dict):
        return names

    # Method 1: Direct 'task' field.  Entries may be plain names or nested
    # mappings; only string names can be handed to get_task_dict, so a mapping
    # contributes its own string-valued 'task' name (if any) and nothing else.
    task_field = yaml_content.get('task')
    if isinstance(task_field, str):
        remember(task_field)
    elif isinstance(task_field, list):
        for entry in task_field:
            if isinstance(entry, str):
                remember(entry)
            elif isinstance(entry, dict) and isinstance(entry.get('task'), str):
                remember(entry['task'])

    # Method 2: Look for any list that might contain task names
    for key, value in yaml_content.items():
        if isinstance(value, list) and key not in ('metric_list', 'generation_kwargs', 'metadata'):
            # Filter for task-like names (avoid metrics and config values)
            for item in value:
                if isinstance(item, str) and ('_' in item or item.isalpha()):
                    remember(item)

    return names


def try_extract_working_tasks_from_group(group_name: str, task_manager):
    """
    Try to extract and load individual working tasks from a problematic group.

    This handles cases like flan_held_in where the group exists in the registry
    but has loading issues (like yaml_path becoming None during include processing).

    Steps, stopping at the first task that loads:

    1. Parse the group's YAML (located through ``task_manager.task_index``)
       and try up to 15 task names found in it.
    2. Run ``extract_individual_tasks_from_yaml`` on the same file; try the
       base names obtained by cutting ``'_prompt-...'`` suffixes, then up to
       5 of the extracted names that do not look like templates.
    3. Search ``task_manager.all_tasks`` for the group name itself, names
       starting with ``"<group_name>_"`` and names containing a part of the
       group name; try up to 20 of them in a deterministic order.

    Args:
        group_name: Name of the group (e.g., 'flan_held_in')
        task_manager: Initialized LM TaskManager instance

    Returns:
        Tuple of (task_object, task_name) or None if no working tasks found
    """
    try:
        from lm_eval.tasks import get_task_dict

        def first_loadable(names, label):
            """Return (task, name) for the first name lm_eval can load, else None."""
            for name in names:
                try:
                    print(f" 🎯 Trying {label}: {name}")
                    result = get_task_dict([name], task_manager=task_manager)
                    if name in result:
                        print(f" ✅ SUCCESS: Found working {label} {name}")
                        return result[name], name
                except Exception as e:
                    print(f" ❌ {label.capitalize()} {name} failed: {str(e)[:50]}")
            return None

        print(f" 🔍 Extracting working tasks from group: {group_name}")

        # Get the group configuration from the task manager
        if hasattr(task_manager, 'task_index') and group_name in task_manager.task_index:
            group_info = task_manager.task_index[group_name]
            yaml_path = group_info.get('yaml_path')

            if yaml_path and os.path.exists(yaml_path):
                print(f" 📁 Found group YAML: {yaml_path}")

                # STEP 1: Try to parse the main group YAML for task names
                import yaml
                try:
                    with open(yaml_path, 'r', encoding='utf-8') as f:
                        yaml_content = yaml.safe_load(f)

                    initial_tasks = _task_names_from_group_yaml(yaml_content)

                    if initial_tasks:
                        print(f" 📋 Found {len(initial_tasks)} initial tasks from main YAML: {initial_tasks[:5]}...")
                        found = first_loadable(initial_tasks[:15], "initial task")
                        if found is not None:
                            return found
                    else:
                        print(f" ⚠️ No task names found in main YAML structure")

                except Exception as yaml_parse_error:
                    print(f" ⚠️ Main YAML parsing failed: {str(yaml_parse_error)[:100]}")

                # Fallback: try the recursive extraction method
                try:
                    individual_tasks = extract_individual_tasks_from_yaml(yaml_path, group_name)

                    if individual_tasks:
                        print(f" 📋 Found {len(individual_tasks)} individual tasks in group")

                        # Extract base task names (remove prompt suffixes), keeping order
                        base_tasks_to_try = list(dict.fromkeys(
                            task.split('_prompt-')[0]
                            for task in individual_tasks
                            if '_prompt-' in task
                        ))

                        # Try the base tasks first
                        found = first_loadable(base_tasks_to_try, "base task")
                        if found is not None:
                            return found

                        # If base tasks don't work, try some individual tasks (but skip templates/variables)
                        valid_tasks = [
                            t for t in individual_tasks
                            if not any(x in t for x in ['{{', '}}', '_common_yaml', 'sentence:'])
                        ]
                        found = first_loadable(valid_tasks[:5], "individual task")
                        if found is not None:
                            return found

                except Exception as yaml_error:
                    print(f" ⚠️ YAML extraction failed (likely !function constructor): {str(yaml_error)[:100]}")
                    # Fall through to generic catch-all approach below

        # FINAL GENERIC CATCH-ALL: If all YAML approaches fail, search registry intelligently
        print(f" 🔍 FINAL CATCH-ALL: Searching registry for tasks matching group pattern...")

        # all_tasks may be a list or a set (or missing); normalise to a set.
        all_tasks = set(getattr(task_manager, 'all_tasks', ()))

        candidates = []

        # Strategy 1: Try exact group name
        if group_name in all_tasks:
            candidates.append(group_name)

        # Strategy 2: Try tasks that start with the group name.  Sorted so the
        # "first 10" are the same on every run (set order is not stable).
        group_prefix_tasks = sorted(t for t in all_tasks if t.startswith(group_name + '_'))
        candidates.extend(group_prefix_tasks[:10])

        # Strategy 3: Try tasks that contain any major part of the group name
        group_parts = [part for part in group_name.split('_') if len(part) > 2]
        for part in group_parts:
            matching_tasks = sorted(t for t in all_tasks if part in t and t not in candidates)
            # Prioritize whole-word matches and longer names; the sort is stable,
            # so ties keep their alphabetical order.
            matching_tasks.sort(key=lambda x: (part in x.split('_'), len(x)), reverse=True)
            candidates.extend(matching_tasks[:3])  # Top 3 per part

        # Remove duplicates while preserving order
        unique_candidates = list(dict.fromkeys(candidates))

        print(f" 📋 Found {len(unique_candidates)} candidate tasks to try...")

        found = first_loadable(unique_candidates[:20], "candidate")  # Limit total attempts
        if found is not None:
            return found

        # If still no success, this group truly has no working tasks
        print(f" 💥 FAILED: Group {group_name} has no working tasks - exhausted all generic approaches")
        print(f" ❌ No working tasks found in group {group_name}")
        return None

    except Exception as e:
        print(f" ❌ Group extraction failed: {e}")
        return None
880
+
881
+
882
def save_custom_task_yaml(task_name: str, yaml_content: str) -> Optional[str]:
    """
    Save custom YAML task configuration to the tasks directory for future loading.

    The file is written as ``wisent/parameters/tasks/<task_name>.yaml``,
    relative to the current working directory, creating the directory when
    needed.  The content is written as UTF-8 so non-ASCII YAML is stored the
    same way on every platform.

    Args:
        task_name: Name of the task; must be a plain file name (a name with
            directory components is rejected so the write cannot land
            outside the tasks directory)
        yaml_content: YAML content to save

    Returns:
        Path to the saved file, or None if failed
    """
    try:
        # A name such as "../x" would otherwise escape the tasks directory.
        if os.path.basename(task_name) != task_name:
            raise ValueError(f"task name must not contain path components: {task_name!r}")

        # Create the tasks directory if it doesn't exist
        tasks_dir = os.path.join("wisent", "parameters", "tasks")
        os.makedirs(tasks_dir, exist_ok=True)

        # Save the YAML content to a file
        yaml_file_path = os.path.join(tasks_dir, f"{task_name}.yaml")
        with open(yaml_file_path, 'w', encoding='utf-8') as f:
            f.write(yaml_content)

        print(f" 💾 Saved custom task configuration to: {yaml_file_path}")
        return yaml_file_path

    except Exception as e:
        print(f" ❌ Failed to save custom task configuration: {e}")
        return None
909
+
910
+
911
+
912
+
913
+
914
def create_task_yaml_from_user_content(task_name: str, user_yaml_content: str) -> Optional[str]:
    """
    Validate user-supplied YAML text and store it as a task configuration.

    The text is parsed with ``yaml.safe_load`` purely as a syntax check (the
    parsed value is discarded), then handed unchanged to
    ``save_custom_task_yaml`` under the name ``"<task_name>_user"``.

    Args:
        task_name: Name of the task
        user_yaml_content: YAML content provided by the user

    Returns:
        Path to the saved file, or None if parsing or saving failed
    """
    try:
        # Parse only to reject malformed YAML; the raw text is what gets stored.
        yaml.safe_load(user_yaml_content)

        saved_path = save_custom_task_yaml(f"{task_name}_user", user_yaml_content)
        if not saved_path:
            return None

        print(f" 💾 Saved user-provided YAML for {task_name}")
        return saved_path

    except Exception as e:
        print(f" ❌ Failed to process user YAML content: {e}")
        return None
942
+
943
+
944
def load_with_env_config(task_name: str, yaml_file: str):
    """
    Load a task while several candidate lm_eval environment variables point at a YAML file.

    Each variable is set to ``yaml_file`` for the duration of the
    ``get_task_dict`` call and put back to its previous value (or removed if
    it was unset) afterwards, whether or not the call succeeds.

    Args:
        task_name: Name of the task to load
        yaml_file: Path to the YAML configuration file

    Returns:
        Task dictionary from get_task_dict

    Raises:
        Exception: wrapping any failure, with the message prefix
            "Environment config loading failed: "
    """
    try:
        from lm_eval.tasks import get_task_dict

        # Names lm_eval might consult; whether it reads any of them is not
        # established by this code.
        candidate_vars = (
            'LM_EVAL_CONFIG_PATH',
            'LM_EVAL_TASKS_PATH',
            'LMEVAL_CONFIG_PATH',
            'TASK_CONFIG_PATH',
        )

        # Remember each previous value while overriding it.
        previous_values = {}
        for var_name in candidate_vars:
            previous_values[var_name] = os.environ.get(var_name)
            os.environ[var_name] = yaml_file

        try:
            return get_task_dict([task_name])
        finally:
            # Put the environment back exactly as it was.
            for var_name in candidate_vars:
                old_value = previous_values[var_name]
                if old_value is not None:
                    os.environ[var_name] = old_value
                else:
                    os.environ.pop(var_name, None)

    except Exception as e:
        raise Exception(f"Environment config loading failed: {e}")
985
+
986
+
987
def create_flan_held_in_files() -> Optional[str]:
    """
    Create the flan_held_in YAML files under ``wisent/parameters/tasks``.

    Two documents are written (relative to the current working directory):
    a shared template and the main ``flan_held_in.yaml`` group file whose
    task entries pull the template in through ``include: _held_in_template_yaml``.

    The template is stored under two names: ``_held_in_template_yaml.yaml``
    (the name this function has always produced) and ``_held_in_template_yaml``
    (the exact name used by the ``include`` entries, so an include that is
    resolved relative to the main file's directory finds it).

    NOTE(review): the nesting of the embedded YAML was restored from a
    whitespace-stripped listing of this file - confirm it against the
    intended flan_held_in.yaml before relying on it.

    Returns:
        Path to the main flan_held_in.yaml file, or None if failed
    """
    try:
        # Create the tasks directory
        tasks_dir = os.path.join("wisent", "parameters", "tasks")
        os.makedirs(tasks_dir, exist_ok=True)

        # Create the template file first
        template_content = """output_type: generate_until
test_split: null
doc_to_choice: null
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
metadata:
  version: 1.0
"""

        template_path = os.path.join(tasks_dir, "_held_in_template_yaml.yaml")
        # Same content under the extensionless name referenced by `include:`.
        include_target_path = os.path.join(tasks_dir, "_held_in_template_yaml")
        for path in (template_path, include_target_path):
            with open(path, 'w', encoding='utf-8') as f:
                f.write(template_content)

        # Create the main flan_held_in.yaml file with the exact content from the user
        main_content = """group: flan_held_in
group_alias: Flan (Held-In)
task:
  # ANLI R1
  - group: anli_r1_flan
    group_alias: ANLI R1
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    task:
      - task: anli_r1_prompt-0
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\\n\\nChoose your answer: based on the paragraph above can we conclude that \\"{{hypothesis}}\\"?\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No\\nI think the answer is"
        doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
      - task: anli_r1_prompt-1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\\n\\nBased on that paragraph can we conclude that this sentence is true?\\n{{hypothesis}}\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No"
        doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
      - task: anli_r1_prompt-2
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\\n\\nCan we draw the following conclusion?\\n{{hypothesis}}\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No"
        doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
  # Arc Easy
  - group: arc_easy_flan
    group_alias: Arc Easy
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    task:
      - task: arc_easy_prompt-0
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\\n\\nOPTIONS:\\n- {{choices.text|join('\\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy_prompt-1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\\nOPTIONS:\\n- {{choices.text|join('\\n- ')}}\\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # BoolQ
  - group: boolq_flan
    group_alias: BoolQ
    aggregate_metric_list:
      - metric: acc
        weight_by_size: True
    task:
      - task: boolq_prompt-0
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\\n\\nCan we conclude that {{question}}?\\n\\nOPTIONS:\\n- no\\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq_prompt-1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\\n\\nIs it true that {{question}}?\\n\\nOPTIONS:\\n- no\\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
"""

        main_path = os.path.join(tasks_dir, "flan_held_in.yaml")
        with open(main_path, 'w', encoding='utf-8') as f:
            f.write(main_content)

        print(f" 💾 Created flan_held_in YAML files:")
        print(f" 📄 Template: {template_path}")
        print(f" 📄 Main: {main_path}")

        return main_path

    except Exception as e:
        print(f" ❌ Failed to create flan_held_in files: {e}")
        return None
1096
+
1097
+
1098
def load_task_with_config_dir(task_name: str, config_dir: str):
    """
    Load a task whose YAML configuration lives in ``config_dir``.

    Four approaches are attempted in order; the first that succeeds returns:

    1. An lm_eval ``TaskManager`` created with ``include_path=config_dir`` so
       the directory is actually part of the task search.
    2. ``get_task_dict`` with ``config_dir`` temporarily prepended to
       ``sys.path``.
    3. ``get_task_dict`` with several candidate environment variables
       temporarily set to ``config_dir``.
    4. Plain ``get_task_dict``.

    ``sys.path`` and the environment are restored after approaches 2 and 3.

    Args:
        task_name: Name of the task to load
        config_dir: Directory containing YAML configuration files

    Returns:
        Task dictionary from get_task_dict

    Raises:
        Exception: wrapping the final failure, with the message prefix
            "Config directory loading failed: "
    """
    try:
        from lm_eval.tasks import get_task_dict
        from lm_eval.tasks import TaskManager as LMTaskManager
        import sys

        print(f" 🔧 Attempting to load {task_name} from config dir: {config_dir}")

        # Method 1: Let lm_eval index the directory itself.
        try:
            task_manager = LMTaskManager(include_path=config_dir)
            print(f" 🔧 Using TaskManager approach")
            return get_task_dict([task_name], task_manager=task_manager)
        except Exception as e:
            print(f" ⚠️ TaskManager approach failed: {e}")

        # Method 2: Try adding config directory to Python path
        original_path = sys.path[:]
        try:
            if config_dir not in sys.path:
                sys.path.insert(0, config_dir)
            print(f" 🔧 Added config dir to Python path")
            return get_task_dict([task_name])
        except Exception as e:
            print(f" ⚠️ Python path approach failed: {e}")
        finally:
            sys.path[:] = original_path

        # Method 3: Try setting environment variables
        original_env = {}
        env_vars = ['LM_EVAL_CONFIG_DIR', 'LMEVAL_CONFIG_PATH', 'TASK_CONFIG_PATH']
        try:
            for env_var in env_vars:
                original_env[env_var] = os.environ.get(env_var)
                os.environ[env_var] = config_dir
            print(f" 🔧 Set environment variables")
            return get_task_dict([task_name])
        except Exception as e:
            print(f" ⚠️ Environment variable approach failed: {e}")
        finally:
            # Restore only what was actually saved, so an interrupted setup
            # loop cannot raise KeyError here or drop untouched variables.
            for env_var, previous in original_env.items():
                if previous is None:
                    os.environ.pop(env_var, None)
                else:
                    os.environ[env_var] = previous

        # Method 4: Fall back to basic loading
        print(f" 🔧 Falling back to basic task loading")
        return get_task_dict([task_name])

    except Exception as e:
        raise Exception(f"Config directory loading failed: {e}") from e
1163
+
1164
+
1165
class TaskManager:
    """Manages lm-eval task discovery, validation, and loading."""

    def __init__(self):
        # Filled lazily by the `available_tasks` property.
        self._available_tasks = None
        # Cache of user-supplied name -> fuzzy-matched canonical name.
        self._task_name_mappings = {}

    @property
    def available_tasks(self) -> List[str]:
        """Get list of available tasks, loading if necessary."""
        if self._available_tasks is None:
            self._available_tasks = load_available_tasks()
        return self._available_tasks

    def get_available_tasks(self) -> List[str]:
        """Get list of all available tasks."""
        return self.available_tasks

    def is_valid_task(self, task_name: str) -> bool:
        """
        Check if a task name is valid.

        A name is valid when it resolves (exactly or by fuzzy match) to an
        available task; an unresolvable name yields False rather than an error.
        """
        try:
            resolved_name = self.resolve_task_name(task_name)
            return resolved_name in self.available_tasks
        except (ValueError, TaskNotFoundError):
            # resolve_task_name signals "no match" with TaskNotFoundError;
            # ValueError is kept for callers/subclasses relying on the old contract.
            return False

    def resolve_task_name(self, task_name: str) -> str:
        """
        Resolve a task name to its canonical form, handling variations and common mistakes.

        Resolution order: exact match, previously cached fuzzy match, then the
        most similar available task with a similarity of at least 0.6.

        Args:
            task_name: The task name to resolve

        Returns:
            The canonical task name

        Raises:
            TaskNotFoundError: If the task name cannot be resolved; up to five
                available tasks sharing a word with ``task_name`` are attached
                as suggestions.
        """
        # Direct match
        if task_name in self.available_tasks:
            return task_name

        # Check cached mappings
        if task_name in self._task_name_mappings:
            return self._task_name_mappings[task_name]

        # Try fuzzy matching
        best_match = None
        best_similarity = 0.0
        similarity_threshold = 0.6

        for available_task in self.available_tasks:
            similarity = self._calculate_task_name_similarity(task_name, available_task)
            if similarity > best_similarity and similarity >= similarity_threshold:
                best_similarity = similarity
                best_match = available_task

        if best_match:
            # Cache the mapping
            self._task_name_mappings[task_name] = best_match
            return best_match

        # List some suggestions if no match found.  Empty fragments (from a
        # leading/trailing/double underscore) are skipped because '' is a
        # substring of every task name.
        words = [word.lower() for word in task_name.split('_') if word]
        suggestions = [task for task in self.available_tasks
                       if any(word in task.lower() for word in words)][:5]

        raise TaskNotFoundError(
            task_name=task_name,
            available_tasks=suggestions if suggestions else None
        )

    def _calculate_task_name_similarity(self, name1: str, name2: str) -> float:
        """
        Calculate similarity between two task names.

        The score is the case-insensitive character-level ratio, averaged with
        the share of words (split on '_', '-' and whitespace) the names have
        in common.
        """
        # Direct similarity
        base_similarity = SequenceMatcher(None, name1.lower(), name2.lower()).ratio()

        # Bonus for word-level matches
        words1 = set(re.split(r'[_\-\s]+', name1.lower()))
        words2 = set(re.split(r'[_\-\s]+', name2.lower()))

        if words1 and words2:
            word_overlap = len(words1.intersection(words2)) / max(len(words1), len(words2))
            return (base_similarity + word_overlap) / 2

        return base_similarity

    def load_task(self, task_name: str, limit: Optional[int] = None):
        """
        Load a task from lm-evaluation-harness with dynamic task name resolution.
        Supports both regular tasks and ConfigurableGroup tasks.

        Args:
            task_name: Name of the task
            limit: Optional limit on number of documents; stored on the
                returned task as ``_limit`` for ``split_task_data`` to read

        Returns:
            Task object from lm_eval

        Raises:
            TaskNotFoundError: If the name cannot be resolved to a valid task
            TaskLoadError: If the task resolves but loading it fails
        """
        # Find the actual task name dynamically
        actual_task_name = self.resolve_task_name(task_name)

        try:
            # First try to handle as potentially problematic ConfigurableGroup task
            task, _ = handle_configurable_group_task(actual_task_name)
            task._limit = limit
            return task

        except Exception as e:
            # If that fails, check if it's a task resolution issue
            if not self.is_valid_task(actual_task_name):
                raise TaskNotFoundError(task_name=task_name) from e

            # Otherwise report the underlying load failure
            raise TaskLoadError(task_name=task_name, cause=e) from e

    def split_task_data(self, task_data, split_ratio: float = 0.8, random_seed: int = 42) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Split task data into training and testing sets.

        Args:
            task_data: Task object from lm_eval
            split_ratio: Ratio for training split (0.0 to 1.0)
            random_seed: Random seed for reproducible splits

        Returns:
            Tuple of (training_docs, testing_docs)
        """
        import random

        # Load documents with limit if specified
        limit = getattr(task_data, '_limit', None)
        docs = load_docs(task_data, limit)

        # Shuffle a copy with a private generator: same permutation as seeding
        # the module-level generator with random_seed, but the caller's global
        # random state is left untouched.
        shuffled_docs = list(docs)
        random.Random(random_seed).shuffle(shuffled_docs)

        # Split based on ratio
        split_point = int(len(shuffled_docs) * split_ratio)
        training_docs = shuffled_docs[:split_point]
        testing_docs = shuffled_docs[split_point:]

        return training_docs, testing_docs

    def prepare_prompts_from_docs(self, task, docs: List[Dict[str, Any]]) -> List[str]:
        """
        Prepare prompts from task documents.

        For each document the first available source wins: ``task.doc_to_text``,
        ``task.doc_format``, then the document keys 'input', 'question',
        'prompt', then the first string among 'text', 'passage', 'context',
        'story', and finally ``str(doc)``.  Documents that raise are skipped,
        so the result may be shorter than ``docs``.

        Args:
            task: Task object from lm_eval
            docs: List of documents to convert to prompts

        Returns:
            List of formatted prompts
        """
        prompts = []

        for doc in docs:
            try:
                # Different tasks have different prompt creation methods
                if hasattr(task, 'doc_to_text'):
                    prompt = task.doc_to_text(doc)
                elif hasattr(task, 'doc_format'):
                    prompt = task.doc_format(doc)
                elif 'input' in doc:
                    prompt = doc['input']
                elif 'question' in doc:
                    prompt = doc['question']
                elif 'prompt' in doc:
                    prompt = doc['prompt']
                else:
                    # Fallback: use the first text-like field
                    text_fields = ['text', 'passage', 'context', 'story']
                    prompt = None
                    for field in text_fields:
                        if field in doc and isinstance(doc[field], str):
                            prompt = doc[field]
                            break

                    if prompt is None:
                        prompt = str(doc)

                prompts.append(prompt)

            except Exception as e:
                # Skip problematic documents
                print(f"Warning: Could not create prompt from document: {e}")
                continue

        return prompts

    def get_reference_answers(self, task, docs: List[Dict[str, Any]]) -> List[str]:
        """
        Extract reference answers from task documents.

        For each document the first available source wins: ``task.doc_to_target``,
        ``task.get_answer``, then the document keys 'answer', 'target', 'label',
        'output', 'correct_answer', 'gold', 'truth', 'solution'.  When nothing
        is found, or extraction raises, the placeholder "UNKNOWN" is used, so
        the result always has one entry per document.

        Args:
            task: Task object from lm_eval
            docs: List of documents to extract answers from

        Returns:
            List of reference answers (each converted with ``str``)
        """
        answers = []

        for doc in docs:
            try:
                # Different tasks store answers differently
                if hasattr(task, 'doc_to_target'):
                    answer = task.doc_to_target(doc)
                elif hasattr(task, 'get_answer'):
                    answer = task.get_answer(doc)
                elif 'answer' in doc:
                    answer = doc['answer']
                elif 'target' in doc:
                    answer = doc['target']
                elif 'label' in doc:
                    answer = doc['label']
                elif 'output' in doc:
                    answer = doc['output']
                else:
                    # Look for likely answer fields
                    answer_fields = ['correct_answer', 'gold', 'truth', 'solution']
                    answer = None
                    for field in answer_fields:
                        if field in doc:
                            answer = doc[field]
                            break

                    if answer is None:
                        answer = "UNKNOWN"

                answers.append(str(answer))

            except Exception as e:
                print(f"Warning: Could not extract answer from document: {e}")
                answers.append("UNKNOWN")

        return answers

    def register_custom_task_yaml(self, task_name: str, yaml_content: str) -> bool:
        """
        Register a custom YAML task configuration that can be loaded later.

        Args:
            task_name: Name of the task to register
            yaml_content: YAML content defining the task

        Returns:
            True if successfully registered, False otherwise

        Example:
            yaml_content = '''
            my_custom_task:
              class: custom_task
              doc_to_text: "Question: {{question}}"
              doc_to_target: "{{answer}}"
            '''
            manager.register_custom_task_yaml("my_custom_task", yaml_content)
        """
        try:
            yaml_file_path = create_task_yaml_from_user_content(task_name, yaml_content)
            if yaml_file_path:
                print(f"✅ Registered custom task configuration for '{task_name}'")
                print(f" 📁 Saved to: {yaml_file_path}")
                return True
            return False
        except Exception as e:
            print(f"❌ Failed to register custom task '{task_name}': {e}")
            return False
1437
+
1438
+
1439
# Global instance for convenience: a single module-level TaskManager that the
# module-level helper functions delegate to.
_task_manager = TaskManager()
1441
+
1442
+ # Convenience functions that use the global instance
1443
def get_available_tasks() -> List[str]:
    """Return every available task name, as reported by the shared module-level manager."""
    task_names = _task_manager.get_available_tasks()
    return task_names
1446
+
1447
def is_valid_task(task_name: str) -> bool:
    """Report whether the shared module-level manager considers ``task_name`` valid."""
    valid = _task_manager.is_valid_task(task_name)
    return valid
1450
+
1451
def resolve_task_name(task_name: str) -> str:
    """Map ``task_name`` to its canonical form using the shared module-level manager."""
    canonical_name = _task_manager.resolve_task_name(task_name)
    return canonical_name