wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
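
For readers who want to reproduce this kind of comparison locally, here is a minimal Python sketch. It only compares the member lists of the two wheels; the wheel filenames and the idea that both files have already been downloaded (for example with `pip download wisent==0.1.1 --no-deps` and `pip download wisent==0.5.2 --no-deps`) are assumptions, not part of this page.

    # compare_wheels.py - minimal sketch; local wheel paths below are assumptions
    import zipfile

    OLD = "wisent-0.1.1-py3-none-any.whl"  # assumed to exist locally
    NEW = "wisent-0.5.2-py3-none-any.whl"  # assumed to exist locally

    old_files = set(zipfile.ZipFile(OLD).namelist())
    new_files = set(zipfile.ZipFile(NEW).namelist())

    print("added files:")
    for name in sorted(new_files - old_files):
        print("  +", name)
    print("removed files:")
    for name in sorted(old_files - new_files):
        print("  -", name)
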

Potentially problematic release: this version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1456 @@
+ """
+ Task Manager for lm-evaluation-harness integration.
+
+ This module handles discovery, validation, and loading of tasks from the
+ lm-evaluation-harness library.
+ """
+
+ import json
+ import os
+ import re
+ import random
+ import yaml
+ import tempfile
+ import glob
+ from typing import List, Dict, Any, Optional, Tuple
+ from difflib import SequenceMatcher
+
+
+ def load_available_tasks() -> List[str]:
+     """Load available tasks from local tasks.json file or lm-eval registry."""
+
+     # First try to load from local tasks.json file
+     try:
+         tasks_json_path = os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "parameters", "tasks", "tasks.json")
+         if not os.path.exists(tasks_json_path):
+             # Try alternative path
+             tasks_json_path = os.path.join(os.path.dirname(__file__), "..", "..", "tasks.json")
+
+         if os.path.exists(tasks_json_path):
+             with open(tasks_json_path, 'r') as f:
+                 tasks_data = json.load(f)
+             if 'task_list' in tasks_data and tasks_data['task_list']:
+                 print(f"Loaded {len(tasks_data['task_list'])} tasks from local tasks.json")
+                 return tasks_data['task_list']
+             elif 'tasks' in tasks_data:
+                 task_names = list(tasks_data['tasks'].keys())
+                 print(f"Loaded {len(task_names)} tasks from local tasks.json")
+                 return task_names
+     except Exception as e:
+         print(f"Warning: Could not load from local tasks.json: {e}")
+
+     # Fallback to dynamic loading from lm-eval
+     try:
+         # Try to import lm-eval and get tasks from registry
+         from lm_eval.api.registry import ALL_TASKS
+         return list(ALL_TASKS)
+     except ImportError:
+         # If lm-eval not available, try subprocess approach
+         try:
+             import subprocess
+             result = subprocess.run(['lm_eval', '--tasks', 'list'],
+                                     capture_output=True, text=True, timeout=30)
+
+             # Extract task names from the formatted output
+             task_names = []
+             for line in result.stdout.split('\n'):
+                 if '|' in line and not line.startswith('|---') and not 'Group' in line and not 'Config Location' in line:
+                     parts = line.split('|')
+                     if len(parts) >= 2:
+                         task_name = parts[1].strip()
+                         if task_name and not task_name.startswith('-') and task_name != 'Group':
+                             task_names.append(task_name)
+
+             return task_names
+         except Exception:
+             # Final fallback - try to discover from lm_eval module
+             try:
+                 import lm_eval.tasks
+                 # Get all available task names through introspection
+                 from lm_eval.tasks import get_task_dict
+                 # This will fail for invalid tasks, so we need another approach
+
+                 # Try to get task names from lm_eval internals
+                 try:
+                     import lm_eval.tasks.openbookqa # Import a known task module to trigger loading
+                     from lm_eval.api.registry import TASK_REGISTRY
+                     return list(TASK_REGISTRY.keys())
+                 except:
+                     pass
+
+                 # Last resort - scan lm_eval.tasks for modules
+                 import pkgutil
+                 import lm_eval.tasks as tasks_pkg
+
+                 task_names = []
+                 for importer, modname, ispkg in pkgutil.iter_modules(tasks_pkg.__path__):
+                     if not ispkg and not modname.startswith('_'):
+                         task_names.append(modname)
+
+                 return task_names
+
+             except Exception as e:
+                 raise RuntimeError(
+                     f"Could not discover tasks from lm-eval or local tasks.json. "
+                     f"Please ensure lm-evaluation-harness is installed and accessible. "
+                     f"Error: {e}. Try: pip install lm-eval"
+                 )
+
+
+ def load_docs(task, limit: Optional[int] = None) -> List[Dict[str, Any]]:
+     """
+     Load documents from the most appropriate split (validation → test → train → fewshot).
+
+     Args:
+         task: Task object from lm_eval
+         limit: Optional limit on number of documents to load
+
+     Returns:
+         List of documents from the most appropriate split
+     """
+     docs = []
+
+     # Try different doc sources in order of preference
+     if task.has_validation_docs():
+         docs = list(task.validation_docs())
+     elif task.has_test_docs():
+         docs = list(task.test_docs())
+     elif task.has_training_docs():
+         docs = list(task.training_docs())
+     elif hasattr(task, 'has_fewshot_docs') and task.has_fewshot_docs():
+         docs = list(task.fewshot_docs())
+     else:
+         # For tasks that use fewshot_split (like MMMLU), try to load from dataset directly
+         if hasattr(task, 'dataset') and hasattr(task, 'fewshot_split'):
+             try:
+                 from datasets import load_dataset
+                 dataset = load_dataset(
+                     task.dataset_path if hasattr(task, 'dataset_path') else task.dataset_name,
+                     task.dataset_config_name if hasattr(task, 'dataset_config_name') else None,
+                     split=task.fewshot_split
+                 )
+                 docs = [dict(item) for item in dataset]
+             except Exception as e:
+                 raise RuntimeError(f"No labelled docs available for task {task.NAME}. Error loading fewshot split: {e}")
+         else:
+             raise RuntimeError(f"No labelled docs available for task {task.NAME}")
+
+     if limit is not None and limit > 0:
+         docs = docs[:limit]
+
+     return docs
+
+
+ def find_working_task_from_group(group_dict, max_depth=3, current_depth=0):
+     """
+     Recursively search through nested ConfigurableGroup structures to find a working individual task.
+
+     Args:
+         group_dict: Dictionary-like ConfigurableGroup object or regular dict
+         max_depth: Maximum recursion depth to prevent infinite loops
+         current_depth: Current recursion depth
+
+     Returns:
+         Tuple of (task_object, task_name) or (None, None) if no working task found
+     """
+     if current_depth >= max_depth:
+         return None, None
+
+     try:
+         # Try to iterate through the group
+         items = group_dict.items() if hasattr(group_dict, 'items') else []
+
+         for key, value in items:
+             # Skip nested ConfigurableGroup objects at first pass
+             if hasattr(value, 'items') and 'ConfigurableGroup' in str(type(key)):
+                 continue
+
+             # Check if this looks like an individual task
+             if hasattr(value, 'has_validation_docs') or hasattr(value, 'has_test_docs') or hasattr(value, 'has_training_docs'):
+                 # Try to validate it has documents
+                 try:
+                     has_docs = False
+                     if hasattr(value, 'has_validation_docs') and value.has_validation_docs():
+                         has_docs = True
+                     elif hasattr(value, 'has_test_docs') and value.has_test_docs():
+                         has_docs = True
+                     elif hasattr(value, 'has_training_docs') and value.has_training_docs():
+                         has_docs = True
+
+                     if has_docs:
+                         # Test if we can actually get documents
+                         if hasattr(value, 'validation_docs') and value.has_validation_docs():
+                             docs = list(value.validation_docs())
+                         elif hasattr(value, 'test_docs') and value.has_test_docs():
+                             docs = list(value.test_docs())
+                         elif hasattr(value, 'training_docs') and value.has_training_docs():
+                             docs = list(value.training_docs())
+                         else:
+                             docs = []
+
+                         if docs:
+                             return value, str(key)
+                 except Exception:
+                     # This task doesn't work, try next one
+                     continue
+
+         # If no individual tasks worked, try nested groups
+         for key, value in items:
+             if hasattr(value, 'items') and 'ConfigurableGroup' in str(type(key)):
+                 result_task, result_name = find_working_task_from_group(value, max_depth, current_depth + 1)
+                 if result_task is not None:
+                     return result_task, result_name
+
+         return None, None
+
+     except Exception as e:
+         print(f"Error exploring group: {e}")
+         return None, None
+
+
+ def handle_configurable_group_task(task_name: str):
+     """
+     Consolidated function to handle ConfigurableGroup tasks for both CLI and processing scripts.
+
+     This function detects when a task is actually a ConfigurableGroup and finds a working
+     individual task within it, handling nested groups up to 3 levels deep.
+     Even handles tasks with lm-eval dependency issues by finding working alternatives.
+     Also supports loading custom YAML task configurations.
+
+     Args:
+         task_name: Name of the potentially problematic group task
+
+     Returns:
+         Tuple of (working_task_object, actual_task_name) or raises ValueError if no working task found
+     """
+     try:
+         from lm_eval.tasks import get_task_dict
+     except ImportError as e:
+         raise ImportError("lm-evaluation-harness is required. Install with: pip install lm-eval") from e
+
+     print(f"🔍 Loading task: {task_name}")
+
+     # First, try to load the task normally from the registry
+     try:
+         # Initialize TaskManager to ensure registry is populated
+         from lm_eval.tasks import TaskManager as LMTaskManager
+         task_manager = LMTaskManager()
+         task_manager.initialize_tasks()
+
+         task_dict = get_task_dict([task_name], task_manager=task_manager)
+         if task_name in task_dict:
+             task = task_dict[task_name]
+             print(f" ✅ Found {task_name} in registry")
+             return task, task_name
+     except Exception as e:
+         print(f" ⚠️ Registry loading failed: {e}")
+
+     # Check if the task exists in the registry but has loading issues
+     try:
+         from lm_eval.tasks import TaskManager as LMTaskManager
+         task_manager = LMTaskManager()
+         task_manager.initialize_tasks()
+
+         # Check in both individual tasks and groups
+         all_tasks = getattr(task_manager, 'all_tasks', set())
+         all_groups = getattr(task_manager, 'all_groups', set())
+
+         print(f" 📊 Registry check: {len(all_tasks)} tasks, {len(all_groups)} groups available")
+         print(f" 🔍 Is '{task_name}' in groups? {task_name in all_groups}")
+         print(f" 🔍 Is '{task_name}' in tasks? {task_name in all_tasks}")
+
+         if task_name in all_tasks or task_name in all_groups:
+             print(f" 🔍 Task {task_name} exists in registry but has loading issues")
+
+             # For group tasks, try to extract individual working tasks
+             if task_name in all_groups:
+                 print(f" 💡 Found {task_name} as a ConfigurableGroup - extracting individual tasks...")
+                 result = try_extract_working_tasks_from_group(task_name, task_manager)
+                 if result:
+                     return result
+                 else:
+                     print(f" 💥 FAILED: Group {task_name} exists but no working tasks found!")
+                     return None
+
+             # For individual tasks that fail loading, try aggressive search
+             print(f" 💡 Found {task_name} as individual task - trying alternatives...")
+             return try_find_related_working_task(task_name)
+
+         # If not found in registry at all, try aggressive search
+         print(f" 🔄 Task {task_name} not found in registry, trying alternatives...")
+         return try_find_related_working_task(task_name)
+
+     except Exception as registry_error:
+         print(f" ⚠️ Registry check failed: {registry_error}")
+         # Still try aggressive search as fallback
+         return try_find_related_working_task(task_name)
+
+     # If not found in registry, look for custom YAML configurations
+     print(f" 🔍 Searching for custom YAML configuration for {task_name}")
+
+     import os
+     import glob
+
+     # For specific custom tasks like flan_held_in, create the YAML files if needed
+     if task_name == "flan_held_in":
+         yaml_file_path = create_flan_held_in_files()
+         if yaml_file_path:
+             config_dir = os.path.dirname(yaml_file_path)
+             print(f" 🔍 Loading flan_held_in from: {config_dir}")
+
+             try:
+                 # Load using the proper config directory approach
+                 task_dict = load_task_with_config_dir(task_name, config_dir)
+
+                 if task_name in task_dict:
+                     task = task_dict[task_name]
+                     print(f" ✅ Successfully loaded {task_name}")
+                     return task, task_name
+
+                 # If the group task doesn't load directly, try to extract individual tasks
+                 print(f" 🔍 Extracting individual tasks from group...")
+                 individual_tasks = extract_individual_tasks_from_yaml(yaml_file_path, task_name)
+                 if individual_tasks:
+                     print(f" 📋 Found individual tasks: {individual_tasks[:3]}...")
+
+                     for extracted_task_name in individual_tasks:
+                         try:
+                             individual_dict = load_task_with_config_dir(extracted_task_name, config_dir)
+                             if extracted_task_name in individual_dict:
+                                 task = individual_dict[extracted_task_name]
+                                 print(f" ✅ Successfully loaded individual task: {extracted_task_name}")
+                                 return task, extracted_task_name
+                         except Exception as e:
+                             print(f" ❌ Failed to load {extracted_task_name}: {str(e)[:50]}")
+                             continue
+
+             except Exception as e:
+                 print(f" ❌ Failed to load flan_held_in: {e}")
+
+     # Generic approach for other custom tasks
+     # Look for existing YAML files in common directories
+     yaml_candidates = []
+     search_dirs = [
+         "wisent_guard/parameters/tasks",
+         ".",
+         "tasks",
+         "configs"
+     ]
+
+     for search_dir in search_dirs:
+         if os.path.exists(search_dir):
+             yaml_candidates.extend(glob.glob(os.path.join(search_dir, f"{task_name}.yaml")))
+             yaml_candidates.extend(glob.glob(os.path.join(search_dir, f"{task_name}.yml")))
+
+     # Try loading existing YAML files for the task
+     for yaml_file in yaml_candidates:
+         if os.path.exists(yaml_file):
+             print(f" 🔍 Found YAML file: {yaml_file}")
+             config_dir = os.path.dirname(yaml_file)
+
+             try:
+                 task_dict = load_task_with_config_dir(task_name, config_dir)
+                 if task_name in task_dict:
+                     task = task_dict[task_name]
+                     print(f" ✅ Successfully loaded {task_name}")
+                     return task, task_name
+
+             except Exception as e:
+                 print(f" ❌ Failed to load from {yaml_file}: {str(e)[:100]}")
+
+     # If still not found, fall back to the original ConfigurableGroup handling logic
+     print(f" 🔄 Falling back to ConfigurableGroup handling for {task_name}")
+
+     # FIRST: Check if task exists in registry (for both individual tasks and groups)
+     try:
+         from lm_eval.tasks import TaskManager as LMTaskManager
+         task_manager = LMTaskManager()
+         task_manager.initialize_tasks()
+
+         # Check in both individual tasks and groups
+         all_tasks = getattr(task_manager, 'all_tasks', set())
+         all_groups = getattr(task_manager, 'all_groups', set())
+
+         # Convert to sets if they're lists, then merge
+         if isinstance(all_tasks, list):
+             all_tasks = set(all_tasks)
+         if isinstance(all_groups, list):
+             all_groups = set(all_groups)
+
+         print(f" 📊 Registry check: {len(all_tasks)} tasks, {len(all_groups)} groups available")
+         print(f" 🔍 Is '{task_name}' in groups? {task_name in all_groups}")
+         print(f" 🔍 Is '{task_name}' in tasks? {task_name in all_tasks}")
+
+         if task_name in all_tasks or task_name in all_groups:
+             print(f" 🔍 Task {task_name} exists in registry but has loading issues")
+
+             # For group tasks, try to extract individual working tasks
+             if task_name in all_groups:
+                 print(f" 💡 Found {task_name} as a ConfigurableGroup - extracting individual tasks...")
+                 result = try_extract_working_tasks_from_group(task_name, task_manager)
+                 if result:
+                     return result
+                 else:
+                     print(f" 💥 FAILED: Group {task_name} exists but no working tasks found!")
+                     return None
+
+             # For individual tasks that fail loading, try aggressive search
+             print(f" 💡 Found {task_name} as individual task - trying alternatives...")
+             return try_find_related_working_task(task_name)
+
+         # If not found in registry at all, try aggressive search
+         print(f" 🔄 Task {task_name} not found in registry, trying alternatives...")
+         return try_find_related_working_task(task_name)
+
+     except Exception as registry_error:
+         print(f" ⚠️ Registry check failed: {registry_error}")
+         # Still try aggressive search as fallback
+         return try_find_related_working_task(task_name)
+
+     try:
+         # Original logic for ConfigurableGroup tasks (should not reach here for known groups)
+         task_dict = get_task_dict([task_name])
+         if task_name not in task_dict:
+             # Task doesn't exist, try aggressive search
+             return try_find_related_working_task(task_name)
+
+         task = task_dict[task_name]
+
+         # Check if it's a ConfigurableGroup by examining the task object
+         if hasattr(task, '__dict__') and isinstance(getattr(task, '__dict__', {}), dict):
+             task_dict_items = getattr(task, '__dict__', {})
+
+             # Look for ConfigurableGroup indicators
+             if any(isinstance(v, dict) for v in task_dict_items.values()):
+                 print(f" 🎯 Detected ConfigurableGroup structure in {task_name}")
+
+                 # Try to find a working individual task within the group
+                 working_task = find_working_task_from_group(task_dict_items)
+                 if working_task:
+                     return working_task
+
+         # If it's not a ConfigurableGroup or we couldn't find working tasks,
+         # try to use the task directly but handle potential dependency issues
+         try:
+             # Test if the task can load documents (quick validation)
+             if hasattr(task, 'validation_docs'):
+                 docs = list(task.validation_docs())
+                 if docs:
+                     print(f" ✅ Task {task_name} works directly")
+                     return task, task_name
+             elif hasattr(task, 'test_docs'):
+                 docs = list(task.test_docs())
+                 if docs:
+                     print(f" ✅ Task {task_name} works directly")
+                     return task, task_name
+             elif hasattr(task, 'training_docs'):
+                 docs = list(task.training_docs())
+                 if docs:
+                     print(f" ✅ Task {task_name} works directly")
+                     return task, task_name
+
+         except Exception as doc_error:
+             print(f" ⚠️ Task {task_name} has document loading issues: {doc_error}")
+
+             # If there are dependency issues, try to find working alternatives
+             return try_find_related_working_task(task_name)
+
+         # If we get here, the task exists but has no usable documents
+         print(f" ⚠️ Task {task_name} has no usable documents")
+         return try_find_related_working_task(task_name)
+
+     except Exception as e:
+         print(f" ❌ Error handling {task_name}: {e}")
+         # Try aggressive search for alternatives
+         return try_find_related_working_task(task_name)
+
+
+ def extract_individual_tasks_from_yaml(yaml_file: str, group_name: str, _visited_files=None) -> List[str]:
+     """
+     Extract individual task names from a YAML configuration file.
+     This function handles nested groups by recursively resolving group names.
+
+     Args:
+         yaml_file: Path to the YAML file
+         group_name: Name of the group we're looking for
+         _visited_files: Set of already visited files to prevent infinite recursion
+
+     Returns:
+         List of individual task names found in the YAML
+     """
+     try:
+         import yaml
+         import os
+
+         # Initialize visited files set to prevent infinite recursion
+         if _visited_files is None:
+             _visited_files = set()
+
+         # Check if we've already processed this file
+         yaml_path_normalized = os.path.abspath(yaml_file)
+         if yaml_path_normalized in _visited_files:
+             print(f" 🔄 Cycle detected: {yaml_file} - skipping to prevent infinite recursion")
+             return []
+
+         _visited_files.add(yaml_path_normalized)
+
+         with open(yaml_file, 'r') as f:
+             yaml_content = yaml.safe_load(f)
+
+         individual_tasks = []
+
+         def extract_tasks_recursive(obj, depth=0):
+             if depth > 5: # Prevent infinite recursion
+                 return
+
+             if isinstance(obj, dict):
+                 # Look for 'task' key which usually contains individual tasks
+                 if 'task' in obj:
+                     task_value = obj['task']
+                     if isinstance(task_value, str):
+                         # Single task name - could be individual or group
+                         individual_tasks.append(task_value)
+                     elif isinstance(task_value, list):
+                         # List of tasks or nested groups
+                         for item in task_value:
+                             extract_tasks_recursive(item, depth + 1)
+                     elif isinstance(task_value, dict):
+                         # Nested task definition
+                         extract_tasks_recursive(task_value, depth + 1)
+
+                 # Also check other keys recursively
+                 for key, value in obj.items():
+                     if key != 'task': # Already processed above
+                         extract_tasks_recursive(value, depth + 1)
+
+             elif isinstance(obj, list):
+                 for item in obj:
+                     extract_tasks_recursive(item, depth + 1)
+             elif isinstance(obj, str):
+                 # This is a task name (could be individual or group)
+                 individual_tasks.append(obj)
+
+         extract_tasks_recursive(yaml_content)
+
+         # Remove duplicates and filter out empty strings
+         potential_tasks = list(set([task for task in individual_tasks if task and isinstance(task, str)]))
+
+         print(f" 📋 Found potential tasks/groups: {potential_tasks[:5]}...") # Limit output
+
+         # Now we need to resolve any groups to their individual tasks
+         resolved_tasks = []
+
+         # Get the base directory for this YAML file to find related group files
+         yaml_dir = os.path.dirname(yaml_file)
+
+         # Limit to prevent excessive processing
+         max_tasks_to_process = 5
+
+         for i, task_name in enumerate(potential_tasks[:max_tasks_to_process]):
+             # First check if this looks like an individual task (has specific suffixes)
+             if any(suffix in task_name for suffix in ['_zeroshot_', '_fewshot_', '_cot_', '_prompt-', '_task_']):
+                 # This is likely an individual task
+                 resolved_tasks.append(task_name)
+                 continue
+
+             # Check if this is a known group that we should resolve (limit recursion depth)
+             if len(_visited_files) < 3: # Limit recursion depth
+                 potential_group_file = os.path.join(yaml_dir, f"{task_name}.yaml")
+                 if os.path.exists(potential_group_file):
+                     print(f" 🔍 Found nested group file: {os.path.basename(potential_group_file)}")
+                     # Recursively extract from this group
+                     nested_tasks = extract_individual_tasks_from_yaml(potential_group_file, task_name, _visited_files.copy())
+                     resolved_tasks.extend(nested_tasks[:3]) # Limit results
+                     continue
+
+                 # Check in subdirectories (common pattern)
+                 for subdir in ['zeroshot', 'fewshot', 'cot']:
+                     subdir_path = os.path.join(yaml_dir, task_name, subdir)
+                     if os.path.isdir(subdir_path):
+                         subdir_yaml = os.path.join(subdir_path, f"_{task_name}_{subdir}.yaml")
+                         if os.path.exists(subdir_yaml):
+                             print(f" 🔍 Found nested group in subdir: {subdir}")
+                             nested_tasks = extract_individual_tasks_from_yaml(subdir_yaml, f"{task_name}_{subdir}", _visited_files.copy())
+                             resolved_tasks.extend(nested_tasks[:3]) # Limit results
+                             break
+                 else:
+                     # Treat as individual task if we can't find a group file
+                     resolved_tasks.append(task_name)
+             else:
+                 # Max recursion depth reached, treat as individual task
+                 resolved_tasks.append(task_name)
+
+         # Final cleanup - remove duplicates and limit results
+         final_tasks = list(set(resolved_tasks))[:10] # Limit to 10 tasks max
+
+         print(f" 📋 Extracted individual tasks from YAML: {final_tasks}")
+         return final_tasks
+
+     except Exception as e:
+         print(f" ❌ Error extracting tasks from YAML {yaml_file}: {e}")
+         return []
+
+
+ def try_find_related_working_task(task_name: str):
+     """
+     AGGRESSIVELY find related tasks that work when the main task has issues.
+     This function will try EVERY possible variation to find a working task.
+     NO TASK SHOULD BE SKIPPED!
+
+     Args:
+         task_name: The problematic task name
+
+     Returns:
+         Tuple of (task_object, task_name) or None if absolutely no alternatives found
+     """
+     try:
+         from lm_eval.tasks import get_task_dict
+         from lm_eval.tasks import TaskManager as LMTaskManager
+
+         # Ensure TaskManager is properly initialized
+         task_manager = LMTaskManager()
+         task_manager.initialize_tasks()
+
+         # Get all available tasks from the initialized manager
+         all_tasks = getattr(task_manager, 'all_tasks', set())
+         all_groups = getattr(task_manager, 'all_groups', set())
+
+         # Convert to sets if they're lists, then merge
+         if isinstance(all_tasks, list):
+             all_tasks = set(all_tasks)
+         if isinstance(all_groups, list):
+             all_groups = set(all_groups)
+
+         all_available_tasks = all_tasks | all_groups
+
+         print(f" 📊 TaskManager has {len(all_tasks)} tasks, {len(all_groups)} groups")
+
+         print(f" 🔄 AGGRESSIVE SEARCH for working alternatives to '{task_name}' ({len(all_available_tasks)} tasks available)...")
+
+         # Strategy 1: Remove '_group' suffix
+         if '_group' in task_name:
+             base_name = task_name.replace('_group', '')
+             print(f" 🎯 Trying base name: {base_name}")
+             try:
+                 return handle_configurable_group_task(base_name)
+             except:
+                 pass
+
+         # Strategy 2: Try progressively shorter prefixes
+         parts = task_name.split('_')
+         if len(parts) > 1:
+             for i in range(len(parts) - 1, 0, -1):
+                 parent_name = '_'.join(parts[:i])
+                 print(f" 🎯 Trying parent: {parent_name}")
+                 try:
+                     return handle_configurable_group_task(parent_name)
+                 except:
+                     continue
+
+         # Strategy 3: Find ANY task with the same prefix (e.g., flan_held_in -> any flan_* task)
+         prefix = parts[0] if parts else task_name
+         print(f" 🎯 Searching for ANY task starting with '{prefix}_'...")
+
+         matching_tasks = [t for t in all_available_tasks if t.startswith(prefix + '_') and t != task_name]
+
+         # Try up to 10 matching tasks until we find one that works
+         for candidate in matching_tasks[:10]:
+             print(f" 🎯 Trying candidate: {candidate}")
+             try:
+                 result = handle_configurable_group_task(candidate)
+                 print(f" ✅ SUCCESS! Found working alternative: {candidate}")
+                 return result
+             except:
+                 continue
+
+         # Strategy 4: Try exact prefix match (e.g., flan_held_in -> flan)
+         if prefix in all_available_tasks:
+             print(f" 🎯 Trying exact prefix: {prefix}")
+             try:
+                 return handle_configurable_group_task(prefix)
+             except:
+                 pass
+
+         # Strategy 5: Find tasks with similar keywords
+         keywords = [part for part in parts if len(part) > 2] # Skip short parts
+         for keyword in keywords:
+             print(f" 🎯 Searching for tasks containing '{keyword}'...")
+             keyword_tasks = [t for t in all_available_tasks if keyword in t and t != task_name]
+
+             for candidate in keyword_tasks[:5]: # Try up to 5 per keyword
+                 print(f" 🎯 Trying keyword match: {candidate}")
+                 try:
+                     result = handle_configurable_group_task(candidate)
+                     print(f" ✅ SUCCESS! Found working keyword match: {candidate}")
+                     return result
+                 except:
+                     continue
+
+         # NO MORE STUPID FALLBACKS - FIX THE REAL ISSUE
+         print(f" 💥 FAILED TO FIND CORRECT TASK: {task_name} - NO RANDOM FALLBACKS ALLOWED!")
+         return None
+
+     except Exception as e:
+         print(f" ❌ Search failed: {e}")
+         return None
+
+
+ def try_extract_working_tasks_from_group(group_name: str, task_manager):
+     """
+     Try to extract and load individual working tasks from a problematic group.
+
+     This handles cases like flan_held_in where the group exists in the registry
+     but has loading issues (like yaml_path becoming None during include processing).
+
+     Args:
+         group_name: Name of the group (e.g., 'flan_held_in')
+         task_manager: Initialized LM TaskManager instance
+
+     Returns:
+         Tuple of (task_object, task_name) or None if no working tasks found
+     """
+     try:
+         from lm_eval.tasks import get_task_dict
+
+         print(f" 🔍 Extracting working tasks from group: {group_name}")
+
+         # Get the group configuration from the task manager
+         if hasattr(task_manager, 'task_index') and group_name in task_manager.task_index:
+             group_info = task_manager.task_index[group_name]
+             yaml_path = group_info.get('yaml_path')
+
+             if yaml_path and os.path.exists(yaml_path):
+                 print(f" 📁 Found group YAML: {yaml_path}")
+
+                 # Generic approach: parse the main YAML to extract task names
+
+                 # STEP 1: Try to parse the main group YAML for task names
+                 import yaml
+                 try:
+                     with open(yaml_path, 'r') as f:
+                         yaml_content = yaml.safe_load(f)
+
+                     # Extract task names from the main group YAML - more comprehensive search
+                     initial_tasks = []
+                     if isinstance(yaml_content, dict):
+                         # Method 1: Direct 'task' field
+                         if 'task' in yaml_content:
+                             if isinstance(yaml_content['task'], list):
+                                 initial_tasks.extend(yaml_content['task'])
+                             elif isinstance(yaml_content['task'], str):
+                                 initial_tasks.append(yaml_content['task'])
+
+                         # Method 2: Look for any list that might contain task names
+                         for key, value in yaml_content.items():
+                             if isinstance(value, list) and key not in ['metric_list', 'generation_kwargs', 'metadata']:
+                                 # Filter for task-like names (avoid metrics and config values)
+                                 for item in value:
+                                     if isinstance(item, str) and ('_' in item or item.isalpha()):
+                                         if item not in initial_tasks:
+                                             initial_tasks.append(item)
+
+                     if initial_tasks:
+                         print(f" 📋 Found {len(initial_tasks)} initial tasks from main YAML: {initial_tasks[:5]}...")
+
+                         # Try the initially found tasks directly
+                         for task_name in initial_tasks[:15]: # Try more tasks
+                             try:
+                                 print(f" 🎯 Trying initial task: {task_name}")
+                                 result = get_task_dict([task_name], task_manager=task_manager)
+                                 if task_name in result:
+                                     task = result[task_name]
+                                     print(f" ✅ SUCCESS: Found working initial task {task_name}")
+                                     return task, task_name
+                             except Exception as e:
+                                 print(f" ❌ Initial task {task_name} failed: {str(e)[:50]}")
+                                 continue
+                     else:
+                         print(f" ⚠️ No task names found in main YAML structure")
+
+                 except Exception as yaml_parse_error:
+                     print(f" ⚠️ Main YAML parsing failed: {str(yaml_parse_error)[:100]}")
+
+                 # Fallback: try the recursive extraction method
+                 try:
+                     individual_tasks = extract_individual_tasks_from_yaml(yaml_path, group_name)
+
+                     if individual_tasks:
+                         print(f" 📋 Found {len(individual_tasks)} individual tasks in group")
+
+                         # Try to load known working base tasks that these might be based on
+                         base_tasks_to_try = []
+
+                         # Extract base task names (remove prompt suffixes)
+                         for task in individual_tasks:
+                             if '_prompt-' in task:
+                                 base_task = task.split('_prompt-')[0]
+                                 if base_task not in base_tasks_to_try:
+                                     base_tasks_to_try.append(base_task)
+
+                         # Try the base tasks first
+                         for base_task in base_tasks_to_try:
+                             try:
+                                 print(f" 🎯 Trying base task: {base_task}")
+                                 result = get_task_dict([base_task], task_manager=task_manager)
+                                 if base_task in result:
+                                     task = result[base_task]
+                                     print(f" ✅ SUCCESS: Found working base task {base_task}")
+                                     return task, base_task
+                             except Exception as e:
+                                 print(f" ❌ Base task {base_task} failed: {str(e)[:50]}")
+                                 continue
+
+                         # If base tasks don't work, try some individual tasks (but skip templates/variables)
+                         valid_tasks = [t for t in individual_tasks if not any(x in t for x in ['{{', '}}', '_common_yaml', 'sentence:'])]
+                         for individual_task in valid_tasks[:5]: # Try first 5 valid ones
+                             try:
+                                 print(f" 🎯 Trying individual task: {individual_task}")
+                                 result = get_task_dict([individual_task], task_manager=task_manager)
+                                 if individual_task in result:
+                                     task = result[individual_task]
+                                     print(f" ✅ SUCCESS: Found working individual task {individual_task}")
+                                     return task, individual_task
+                             except Exception as e:
+                                 print(f" ❌ Individual task {individual_task} failed: {str(e)[:50]}")
+                                 continue
+
+                 except Exception as yaml_error:
+                     print(f" ⚠️ YAML extraction failed (likely !function constructor): {str(yaml_error)[:100]}")
+                     # Fall through to generic catch-all approach below
+
+         # FINAL GENERIC CATCH-ALL: If all YAML approaches fail, search registry intelligently
+         print(f" 🔍 FINAL CATCH-ALL: Searching registry for tasks matching group pattern...")
+
+         # Search for tasks that contain the group name or parts of it
+         all_tasks = getattr(task_manager, 'all_tasks', set())
+         if isinstance(all_tasks, list):
+             all_tasks = set(all_tasks)
+
+         # Generate candidate task names based on the group name with smart filtering
+         candidates = []
+
+         # Strategy 1: Try exact group name
+         if group_name in all_tasks:
+             candidates.append(group_name)
+
+         # Strategy 2: Try tasks that start with the group name
+         group_prefix_tasks = [t for t in all_tasks if t.startswith(group_name + '_')]
+         candidates.extend(group_prefix_tasks[:10]) # Limit to first 10
+
+         # Strategy 3: Try tasks that contain all major parts of the group name
+         group_parts = [part for part in group_name.split('_') if len(part) > 2]
+         for part in group_parts:
+             matching_tasks = [t for t in all_tasks if part in t and t not in candidates]
+             # Prioritize exact matches and longer names
+             matching_tasks.sort(key=lambda x: (part in x.split('_'), len(x)), reverse=True)
+             candidates.extend(matching_tasks[:3]) # Top 3 per part
+
+         # Remove duplicates while preserving order
+         seen = set()
+         unique_candidates = []
+         for candidate in candidates:
+             if candidate not in seen:
+                 unique_candidates.append(candidate)
+                 seen.add(candidate)
+
+         print(f" 📋 Found {len(unique_candidates)} candidate tasks to try...")
+
+         # Try each candidate with intelligent prioritization
+         for candidate in unique_candidates[:20]: # Limit total attempts
+             try:
+                 print(f" 🎯 Trying candidate: {candidate}")
+                 result = get_task_dict([candidate], task_manager=task_manager)
+                 if candidate in result:
+                     task = result[candidate]
+                     print(f" ✅ SUCCESS: Found working candidate {candidate}")
+                     return task, candidate
+             except Exception as e:
+                 print(f" ❌ Candidate {candidate} failed: {str(e)[:50]}")
+                 continue
+
+         # If still no success, this group truly has no working tasks
+         print(f" 💥 FAILED: Group {group_name} has no working tasks - exhausted all generic approaches")
+         print(f" ❌ No working tasks found in group {group_name}")
+         return None
+
+     except Exception as e:
+         print(f" ❌ Group extraction failed: {e}")
+         return None
+
+
+ def save_custom_task_yaml(task_name: str, yaml_content: str) -> Optional[str]:
+     """
+     Save custom YAML task configuration to the tasks directory for future loading.
+
+     Args:
+         task_name: Name of the task
+         yaml_content: YAML content to save
+
+     Returns:
+         Path to the saved file, or None if failed
+     """
+     try:
+         # Create the tasks directory if it doesn't exist
+         tasks_dir = os.path.join("wisent_guard", "parameters", "tasks")
+         os.makedirs(tasks_dir, exist_ok=True)
+
+         # Save the YAML content to a file
+         yaml_file_path = os.path.join(tasks_dir, f"{task_name}.yaml")
+         with open(yaml_file_path, 'w') as f:
+             f.write(yaml_content)
+
+         print(f" 💾 Saved custom task configuration to: {yaml_file_path}")
+         return yaml_file_path
+
+     except Exception as e:
+         print(f" ❌ Failed to save custom task configuration: {e}")
+         return None
+
+
+
+
+
+ def create_task_yaml_from_user_content(task_name: str, user_yaml_content: str) -> Optional[str]:
+     """
+     Create a task YAML file from user-provided YAML content.
+     This function can be called when users provide their own YAML configurations.
+
+     Args:
+         task_name: Name of the task
+         user_yaml_content: YAML content provided by the user
+
+     Returns:
+         Path to the saved file, or None if failed
+     """
+     try:
+         # Validate that the YAML is parseable
+         yaml_data = yaml.safe_load(user_yaml_content)
+
+         # Save the user's YAML content
+         yaml_file_path = save_custom_task_yaml(f"{task_name}_user", user_yaml_content)
+
+         if yaml_file_path:
+             print(f" 💾 Saved user-provided YAML for {task_name}")
+             return yaml_file_path
+
+         return None
+
+     except Exception as e:
+         print(f" ❌ Failed to process user YAML content: {e}")
+         return None
+
+
+ def load_with_env_config(task_name: str, yaml_file: str):
+     """
+     Try to load a task by setting environment variables for lm_eval configuration.
+
+     Args:
+         task_name: Name of the task to load
+         yaml_file: Path to the YAML configuration file
+
+     Returns:
+         Task dictionary from get_task_dict
+     """
+     try:
+         from lm_eval.tasks import get_task_dict
+
+         # Try setting various environment variables that lm_eval might use
+         original_env = {}
+         env_vars_to_set = [
+             'LM_EVAL_CONFIG_PATH',
+             'LM_EVAL_TASKS_PATH',
+             'LMEVAL_CONFIG_PATH',
+             'TASK_CONFIG_PATH'
+         ]
+
+         # Save original environment
+         for env_var in env_vars_to_set:
+             original_env[env_var] = os.environ.get(env_var)
+             os.environ[env_var] = yaml_file
+
+         try:
+             # Try to load the task with environment variables set
+             return get_task_dict([task_name])
+         finally:
+             # Restore original environment
+             for env_var in env_vars_to_set:
+                 if original_env[env_var] is None:
+                     os.environ.pop(env_var, None)
+                 else:
+                     os.environ[env_var] = original_env[env_var]
+
+     except Exception as e:
+         raise Exception(f"Environment config loading failed: {e}")
+
+
+ def create_flan_held_in_files() -> Optional[str]:
+     """
+     Create the actual flan_held_in YAML files as provided by the user.
+     This creates both the main file and the template file with proper include directives.
+
+     Returns:
+         Path to the main flan_held_in.yaml file, or None if failed
+     """
+     try:
+         # Create the tasks directory
+         tasks_dir = os.path.join("wisent_guard", "parameters", "tasks")
+         os.makedirs(tasks_dir, exist_ok=True)
+
+         # Create the template file first
+         template_content = """output_type: generate_until
+ test_split: null
+ doc_to_choice: null
+ metric_list:
+   - metric: exact_match
+     aggregation: mean
+     higher_is_better: true
+ generation_kwargs:
+   until:
+     - "</s>"
+   do_sample: false
+   temperature: 0.0
+ metadata:
+   version: 1.0
+ """
+
+         template_path = os.path.join(tasks_dir, "_held_in_template_yaml.yaml")
+         with open(template_path, 'w') as f:
+             f.write(template_content)
+
+         # Create the main flan_held_in.yaml file with the exact content from the user
+         main_content = """group: flan_held_in
+ group_alias: Flan (Held-In)
+ task:
+   # ANLI R1
+   - group: anli_r1_flan
+     group_alias: ANLI R1
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: anli_r1_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\\n\\nChoose your answer: based on the paragraph above can we conclude that \\"{{hypothesis}}\\"?\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No\\nI think the answer is"
+         doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
+       - task: anli_r1_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\\n\\nBased on that paragraph can we conclude that this sentence is true?\\n{{hypothesis}}\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No"
+         doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
+       - task: anli_r1_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\\n\\nCan we draw the following conclusion?\\n{{hypothesis}}\\n\\nOPTIONS:\\n- Yes\\n- It's impossible to say\\n- No"
+         doc_to_target: "{{[\\"Yes\\", \\"It's impossible to say\\", \\"No\\"][label]}}"
+   # Arc Easy
+   - group: arc_easy_flan
+     group_alias: Arc Easy
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: arc_easy_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{question}}\\n\\nOPTIONS:\\n- {{choices.text|join('\\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "Question: {{question}}\\nOPTIONS:\\n- {{choices.text|join('\\n- ')}}\\nAnswer:"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+   # BoolQ
+   - group: boolq_flan
+     group_alias: BoolQ
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: boolq_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\\n\\nCan we conclude that {{question}}?\\n\\nOPTIONS:\\n- no\\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\\n\\nIs it true that {{question}}?\\n\\nOPTIONS:\\n- no\\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+ """
+
+         main_path = os.path.join(tasks_dir, "flan_held_in.yaml")
+         with open(main_path, 'w') as f:
+             f.write(main_content)
+
+         print(f" 💾 Created flan_held_in YAML files:")
+         print(f" 📄 Template: {template_path}")
+         print(f" 📄 Main: {main_path}")
+
+         return main_path
+
+     except Exception as e:
+         print(f" ❌ Failed to create flan_held_in files: {e}")
+         return None
+
+
+ def load_task_with_config_dir(task_name: str, config_dir: str):
+     """
+     Load a task by setting the lm_eval configuration directory.
+     This attempts to load YAML configurations by manipulating the path and environment.
+
+     Args:
+         task_name: Name of the task to load
+         config_dir: Directory containing YAML configuration files
+
+     Returns:
+         Task dictionary from get_task_dict
+     """
+     try:
+         from lm_eval.tasks import get_task_dict
+         from lm_eval.tasks import TaskManager as LMTaskManager
+         import sys
+
+         print(f" 🔧 Attempting to load {task_name} from config dir: {config_dir}")
+
+         # Method 1: Try to use TaskManager if available
+         try:
+             # Check if LMTaskManager has config path functionality
+             task_manager = LMTaskManager()
+             if hasattr(task_manager, 'initialize_tasks') or hasattr(task_manager, 'load_config'):
+                 print(f" 🔧 Using TaskManager approach")
+                 return get_task_dict([task_name], task_manager=task_manager)
+         except Exception as e:
+             print(f" ⚠️ TaskManager approach failed: {e}")
+
+         # Method 2: Try adding config directory to Python path
+         original_path = sys.path[:]
+         try:
+             if config_dir not in sys.path:
+                 sys.path.insert(0, config_dir)
+                 print(f" 🔧 Added config dir to Python path")
+             return get_task_dict([task_name])
+         except Exception as e:
+             print(f" ⚠️ Python path approach failed: {e}")
+         finally:
+             sys.path[:] = original_path
+
+         # Method 3: Try setting environment variables
+         original_env = {}
+         env_vars = ['LM_EVAL_CONFIG_DIR', 'LMEVAL_CONFIG_PATH', 'TASK_CONFIG_PATH']
+         try:
+             for env_var in env_vars:
+                 original_env[env_var] = os.environ.get(env_var)
+                 os.environ[env_var] = config_dir
+             print(f" 🔧 Set environment variables")
+             return get_task_dict([task_name])
+         except Exception as e:
+             print(f" ⚠️ Environment variable approach failed: {e}")
+         finally:
+             for env_var in env_vars:
+                 if original_env[env_var] is None:
+                     os.environ.pop(env_var, None)
+                 else:
+                     os.environ[env_var] = original_env[env_var]
+
+         # Method 4: Fall back to basic loading
+         print(f" 🔧 Falling back to basic task loading")
+         return get_task_dict([task_name])
+
+     except Exception as e:
+         raise Exception(f"Config directory loading failed: {e}")
+
+
1164
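The function above probes for a configuration hook by trial and error (TaskManager attributes, sys.path, environment variables). In recent lm-evaluation-harness releases the supported route, as far as I know, is the include_path argument of lm_eval.tasks.TaskManager; a hedged sketch under that assumption:

```python
# Sketch, assuming an lm-eval version whose TaskManager accepts include_path.
from lm_eval.tasks import TaskManager as LMTaskManager, get_task_dict

def load_with_include_path(task_name: str, config_dir: str):
    # Index the built-in tasks plus any YAML configs found under config_dir.
    task_manager = LMTaskManager(include_path=config_dir)
    return get_task_dict([task_name], task_manager=task_manager)
```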
+ class TaskManager:
+ """Manages lm-eval task discovery, validation, and loading."""
+
+ def __init__(self):
+ self._available_tasks = None
+ self._task_name_mappings = {}
+
+ @property
+ def available_tasks(self) -> List[str]:
+ """Get list of available tasks, loading if necessary."""
+ if self._available_tasks is None:
+ self._available_tasks = load_available_tasks()
+ return self._available_tasks
+
+ def get_available_tasks(self) -> List[str]:
+ """Get list of all available tasks."""
+ return self.available_tasks
+
+ def is_valid_task(self, task_name: str) -> bool:
+ """Check if a task name is valid."""
+ try:
+ resolved_name = self.resolve_task_name(task_name)
+ return resolved_name in self.available_tasks
+ except ValueError:
+ return False
+
+ def resolve_task_name(self, task_name: str) -> str:
+ """
+ Resolve a task name to its canonical form, handling variations and common mistakes.
+
+ Args:
+ task_name: The task name to resolve
+
+ Returns:
+ The canonical task name
+
+ Raises:
+ ValueError: If the task name cannot be resolved
+ """
+ # Direct match
+ if task_name in self.available_tasks:
+ return task_name
+
+ # Check cached mappings
+ if task_name in self._task_name_mappings:
+ return self._task_name_mappings[task_name]
+
+ # Try fuzzy matching
+ best_match = None
+ best_similarity = 0.0
+ similarity_threshold = 0.6
+
+ for available_task in self.available_tasks:
+ similarity = self._calculate_task_name_similarity(task_name, available_task)
+ if similarity > best_similarity and similarity >= similarity_threshold:
+ best_similarity = similarity
+ best_match = available_task
+
+ if best_match:
+ # Cache the mapping
+ self._task_name_mappings[task_name] = best_match
+ return best_match
+
+ # List some suggestions if no match found
+ suggestions = [task for task in self.available_tasks
+ if any(word.lower() in task.lower() for word in task_name.split('_'))][:5]
+
+ raise ValueError(
+ f"Task '{task_name}' not found. "
+ f"Available tasks: {len(self.available_tasks)} total. "
+ f"Suggestions: {suggestions if suggestions else 'Use get_available_tasks() to see all options'}"
+ )
+
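Resolution therefore prefers exact matches, then cached mappings, then the fuzzy scorer, and only raises when nothing clears the 0.6 threshold. A usage sketch; the import path and example task names are assumptions, and actual matches depend on the installed lm-eval task list:

```python
# Sketch: typical resolution behaviour; module path and task names are assumptions.
from wisent.task_manager import TaskManager  # hypothetical import path

manager = TaskManager()

print(manager.resolve_task_name("boolq"))   # exact match, returned as-is
print(manager.resolve_task_name("bool-q"))  # near miss, fuzzy-matched (likely "boolq")

try:
    manager.resolve_task_name("definitely_not_a_task")
except ValueError as err:
    print(err)  # includes up to five suggestions when any word overlaps
```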
+ def _calculate_task_name_similarity(self, name1: str, name2: str) -> float:
+ """Calculate similarity between two task names."""
+ # Direct similarity
+ base_similarity = SequenceMatcher(None, name1.lower(), name2.lower()).ratio()
+
+ # Bonus for word-level matches
+ words1 = set(re.split(r'[_\-\s]+', name1.lower()))
+ words2 = set(re.split(r'[_\-\s]+', name2.lower()))
+
+ if words1 and words2:
+ word_overlap = len(words1.intersection(words2)) / max(len(words1), len(words2))
+ return (base_similarity + word_overlap) / 2
+
+ return base_similarity
+
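The score averages character-level similarity with word-level overlap; a self-contained rerun of the same arithmetic on one made-up pair of names:

```python
# Standalone rerun of the similarity arithmetic on a made-up pair of names.
import re
from difflib import SequenceMatcher

name1, name2 = "arc_easy", "arc_easy_prompt-0"

base = SequenceMatcher(None, name1.lower(), name2.lower()).ratio()

words1 = set(re.split(r"[_\-\s]+", name1.lower()))  # {"arc", "easy"}
words2 = set(re.split(r"[_\-\s]+", name2.lower()))  # {"arc", "easy", "prompt", "0"}
overlap = len(words1 & words2) / max(len(words1), len(words2))  # 2 / 4 = 0.5

score = (base + overlap) / 2
print(round(base, 3), overlap, round(score, 3))
```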
+ def load_task(self, task_name: str, limit: Optional[int] = None):
+ """
+ Load a task from lm-evaluation-harness with dynamic task name resolution.
+ Supports both regular tasks and ConfigurableGroup tasks.
+
+ Args:
+ task_name: Name of the task
+ limit: Optional limit on number of documents
+
+ Returns:
+ Task object from lm_eval
+ """
+
+ # Find the actual task name dynamically
+ actual_task_name = self.resolve_task_name(task_name)
+
+ try:
+ # First try to handle as potentially problematic ConfigurableGroup task
+ task, _ = handle_configurable_group_task(actual_task_name)
+ task._limit = limit
+ return task
+
+ except Exception as e:
+ # If that fails, check if it's a task resolution issue
+ if not self.is_valid_task(actual_task_name):
+ raise ValueError(
+ f"Task '{task_name}' could not be resolved to a valid task. "
+ f"Use get_available_tasks() to see all available tasks."
+ )
+
+ # Re-raise the original error if it wasn't a resolution issue
+ raise ValueError(f"Failed to load task '{task_name}': {e}") from e
+
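From the caller's side, loading collapses to a single call once resolution and ConfigurableGroup handling are wired together. A thin usage sketch; the import path and task name are assumptions, and it needs lm-evaluation-harness plus the task's dataset available locally:

```python
# Sketch: loading a resolved task with a document cap; names/paths are assumptions.
from wisent.task_manager import TaskManager  # hypothetical import path

manager = TaskManager()
task = manager.load_task("boolq", limit=200)  # the cap is stashed on task._limit
```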
+ def split_task_data(self, task_data, split_ratio: float = 0.8, random_seed: int = 42) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ """
+ Split task data into training and testing sets.
+
+ Args:
+ task_data: Task object from lm_eval
+ split_ratio: Ratio for training split (0.0 to 1.0)
+ random_seed: Random seed for reproducible splits
+
+ Returns:
+ Tuple of (training_docs, testing_docs)
+ """
+ import random
+
+ # Load documents with limit if specified
+ limit = getattr(task_data, '_limit', None)
+ docs = load_docs(task_data, limit)
+
+ # Shuffle with seed for reproducibility
+ random.seed(random_seed)
+ shuffled_docs = docs.copy()
+ random.shuffle(shuffled_docs)
+
+ # Split based on ratio
+ split_point = int(len(shuffled_docs) * split_ratio)
+ training_docs = shuffled_docs[:split_point]
+ testing_docs = shuffled_docs[split_point:]
+
+ return training_docs, testing_docs
+
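The split itself is an ordinary seeded shuffle followed by a ratio cut, so the same seed always reproduces the same partition; a standalone sketch of that arithmetic on toy documents:

```python
# Standalone sketch of the seeded shuffle-and-cut used for the train/test split.
import random

docs = [{"id": i} for i in range(10)]  # toy documents
split_ratio, random_seed = 0.8, 42

random.seed(random_seed)
shuffled = docs.copy()
random.shuffle(shuffled)

split_point = int(len(shuffled) * split_ratio)  # 8
train, test = shuffled[:split_point], shuffled[split_point:]
print(len(train), len(test))  # 8 2
```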
+ def prepare_prompts_from_docs(self, task, docs: List[Dict[str, Any]]) -> List[str]:
+ """
+ Prepare prompts from task documents.
+
+ Args:
+ task: Task object from lm_eval
+ docs: List of documents to convert to prompts
+
+ Returns:
+ List of formatted prompts
+ """
+ prompts = []
+
+ for doc in docs:
+ try:
+ # Different tasks have different prompt creation methods
+ if hasattr(task, 'doc_to_text'):
+ prompt = task.doc_to_text(doc)
+ elif hasattr(task, 'doc_format'):
+ prompt = task.doc_format(doc)
+ elif 'input' in doc:
+ prompt = doc['input']
+ elif 'question' in doc:
+ prompt = doc['question']
+ elif 'prompt' in doc:
+ prompt = doc['prompt']
+ else:
+ # Fallback: use the first text-like field
+ text_fields = ['text', 'passage', 'context', 'story']
+ prompt = None
+ for field in text_fields:
+ if field in doc and isinstance(doc[field], str):
+ prompt = doc[field]
+ break
+
+ if prompt is None:
+ prompt = str(doc)
+
+ prompts.append(prompt)
+
+ except Exception as e:
+ # Skip problematic documents
+ print(f"Warning: Could not create prompt from document: {e}")
+ continue
+
+ return prompts
+
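Because the method only calls doc_to_text when the task object actually has it, a bare stand-in object is enough to exercise the dictionary fallbacks. A usage sketch; the import path is an assumption:

```python
# Sketch of the field-fallback order; the import path is an assumption.
from types import SimpleNamespace
from wisent.task_manager import TaskManager  # hypothetical import path

docs = [
    {"question": "What is 2 + 2?"},          # picked up via the 'question' key
    {"passage": "Some context paragraph."},  # picked up via the text-field fallback
]

prompts = TaskManager().prepare_prompts_from_docs(task=SimpleNamespace(), docs=docs)
print(prompts)  # ['What is 2 + 2?', 'Some context paragraph.']
```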
+ def get_reference_answers(self, task, docs: List[Dict[str, Any]]) -> List[str]:
+ """
+ Extract reference answers from task documents.
+
+ Args:
+ task: Task object from lm_eval
+ docs: List of documents to extract answers from
+
+ Returns:
+ List of reference answers
+ """
+ answers = []
+
+ for doc in docs:
+ try:
+ # Different tasks store answers differently
+ if hasattr(task, 'doc_to_target'):
+ answer = task.doc_to_target(doc)
+ elif hasattr(task, 'get_answer'):
+ answer = task.get_answer(doc)
+ elif 'answer' in doc:
+ answer = doc['answer']
+ elif 'target' in doc:
+ answer = doc['target']
+ elif 'label' in doc:
+ answer = doc['label']
+ elif 'output' in doc:
+ answer = doc['output']
+ else:
+ # Look for likely answer fields
+ answer_fields = ['correct_answer', 'gold', 'truth', 'solution']
+ answer = None
+ for field in answer_fields:
+ if field in doc:
+ answer = doc[field]
+ break
+
+ if answer is None:
+ answer = "UNKNOWN"
+
+ answers.append(str(answer))
+
+ except Exception as e:
+ print(f"Warning: Could not extract answer from document: {e}")
+ answers.append("UNKNOWN")
+
+ return answers
+
+ def register_custom_task_yaml(self, task_name: str, yaml_content: str) -> bool:
+ """
+ Register a custom YAML task configuration that can be loaded later.
+
+ Args:
+ task_name: Name of the task to register
+ yaml_content: YAML content defining the task
+
+ Returns:
+ True if successfully registered, False otherwise
+
+ Example:
+ yaml_content = '''
+ my_custom_task:
+ class: custom_task
+ doc_to_text: "Question: {{question}}"
+ doc_to_target: "{{answer}}"
+ '''
+ manager.register_custom_task_yaml("my_custom_task", yaml_content)
+ """
+ try:
+ yaml_file_path = create_task_yaml_from_user_content(task_name, yaml_content)
+ if yaml_file_path:
+ print(f"✅ Registered custom task configuration for '{task_name}'")
+ print(f" 📁 Saved to: {yaml_file_path}")
+ return True
+ return False
+ except Exception as e:
+ print(f"❌ Failed to register custom task '{task_name}': {e}")
+ return False
+
+
+ # Global instance for convenience
+ _task_manager = TaskManager()
+
+ # Convenience functions that use the global instance
+ def get_available_tasks() -> List[str]:
+ """Get list of all available tasks."""
+ return _task_manager.get_available_tasks()
+
+ def is_valid_task(task_name: str) -> bool:
+ """Check if a task name is valid."""
+ return _task_manager.is_valid_task(task_name)
+
+ def resolve_task_name(task_name: str) -> str:
+ """Resolve a task name to its canonical form."""
+ return _task_manager.resolve_task_name(task_name)
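These wrappers all route through the single module-level _task_manager, so the relatively expensive task discovery runs once per process and later calls reuse the cached list. A usage sketch; the import path is an assumption:

```python
# Sketch: the module-level helpers share one cached TaskManager instance.
from wisent.task_manager import (  # hypothetical import path
    get_available_tasks,
    is_valid_task,
    resolve_task_name,
)

tasks = get_available_tasks()       # first call triggers lm-eval task discovery
print(len(tasks), tasks[:5])

print(is_valid_task("boolq"))       # True if the installed lm-eval ships boolq
print(resolve_task_name("bool_q"))  # resolves against the same cached list
```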