wisent 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of wisent might be problematic.

Files changed (225)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/budget.py +2 -2
  7. wisent/core/agent/device_benchmarks.py +1 -1
  8. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  9. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  10. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  11. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  12. wisent/core/agent/diagnose.py +2 -1
  13. wisent/core/autonomous_agent.py +10 -2
  14. wisent/core/benchmark_extractors.py +293 -0
  15. wisent/core/bigcode_integration.py +20 -7
  16. wisent/core/branding.py +108 -0
  17. wisent/core/cli/__init__.py +15 -0
  18. wisent/core/cli/create_steering_vector.py +138 -0
  19. wisent/core/cli/evaluate_responses.py +715 -0
  20. wisent/core/cli/generate_pairs.py +128 -0
  21. wisent/core/cli/generate_pairs_from_task.py +119 -0
  22. wisent/core/cli/generate_responses.py +129 -0
  23. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  24. wisent/core/cli/generate_vector_from_task.py +147 -0
  25. wisent/core/cli/get_activations.py +191 -0
  26. wisent/core/cli/optimize_classification.py +339 -0
  27. wisent/core/cli/optimize_steering.py +364 -0
  28. wisent/core/cli/tasks.py +182 -0
  29. wisent/core/cli_logger.py +22 -0
  30. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  31. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  32. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  82. wisent/core/data_loaders/__init__.py +235 -0
  83. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  84. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  85. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  86. wisent/core/download_full_benchmarks.py +79 -2
  87. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  88. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  89. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  92. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  93. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  94. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  95. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  97. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  98. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  99. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  100. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  101. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  102. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  103. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  104. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  105. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  106. wisent/core/main.py +57 -0
  107. wisent/core/model_persistence.py +2 -2
  108. wisent/core/models/wisent_model.py +8 -6
  109. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  110. wisent/core/optuna/steering/steering_optimization.py +1 -1
  111. wisent/core/parser_arguments/__init__.py +10 -0
  112. wisent/core/parser_arguments/agent_parser.py +110 -0
  113. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  114. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  115. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  116. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  117. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  118. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  119. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  120. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  121. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  122. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  123. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  124. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  125. wisent/core/parser_arguments/main_parser.py +152 -0
  126. wisent/core/parser_arguments/model_config_parser.py +59 -0
  127. wisent/core/parser_arguments/monitor_parser.py +17 -0
  128. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  129. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  130. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  131. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  132. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  133. wisent/core/parser_arguments/tasks_parser.py +584 -0
  134. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  135. wisent/core/parser_arguments/utils.py +111 -0
  136. wisent/core/prompts/core/prompt_formater.py +3 -3
  137. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  138. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  139. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  141. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  142. wisent/core/steering_optimizer.py +45 -21
  143. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  144. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  145. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  146. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  147. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  148. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  149. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  150. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  151. wisent/core/tasks/livecodebench_task.py +4 -103
  152. wisent/core/timing_calibration.py +1 -1
  153. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/METADATA +3 -3
  154. wisent-0.5.13.dist-info/RECORD +294 -0
  155. wisent-0.5.13.dist-info/entry_points.txt +2 -0
  156. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  157. wisent/classifiers/core/atoms.py +0 -747
  158. wisent/classifiers/models/logistic.py +0 -29
  159. wisent/classifiers/models/mlp.py +0 -47
  160. wisent/cli/classifiers/classifier_rotator.py +0 -137
  161. wisent/cli/cli_logger.py +0 -142
  162. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  163. wisent/cli/wisent_cli/commands/listing.py +0 -154
  164. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  165. wisent/cli/wisent_cli/main.py +0 -93
  166. wisent/cli/wisent_cli/shell.py +0 -80
  167. wisent/cli/wisent_cli/ui.py +0 -69
  168. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  169. wisent/cli/wisent_cli/util/parsing.py +0 -126
  170. wisent/cli/wisent_cli/version.py +0 -4
  171. wisent/opti/methods/__init__.py +0 -0
  172. wisent/synthetic/__init__.py +0 -0
  173. wisent/synthetic/cleaners/__init__.py +0 -0
  174. wisent/synthetic/cleaners/core/__init__.py +0 -0
  175. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  176. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  177. wisent/synthetic/db_instructions/__init__.py +0 -0
  178. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  179. wisent/synthetic/generators/__init__.py +0 -0
  180. wisent/synthetic/generators/core/__init__.py +0 -0
  181. wisent/synthetic/generators/diversities/__init__.py +0 -0
  182. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  184. wisent-0.5.11.dist-info/RECORD +0 -220
  185. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  186. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  187. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  188. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  189. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  190. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  191. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  192. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  193. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  195. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  197. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  198. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  199. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  200. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  201. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  202. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  203. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  205. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  206. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  207. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  208. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  209. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  210. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  211. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  212. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  213. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  214. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  215. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  216. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  217. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  218. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  219. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  220. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  221. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  222. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  223. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/WHEEL +0 -0
  224. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/licenses/LICENSE +0 -0
  225. {wisent-0.5.11.dist-info → wisent-0.5.13.dist-info}/top_level.txt +0 -0
wisent/core/evaluators/benchmark_specific/coding/solution_generator.py (new file)
@@ -0,0 +1,258 @@
+ """
+ LiveCodeBench solution generator using AI models and code execution.
+
+ This module generates and evaluates solutions for LiveCodeBench problems,
+ creating good/bad code pairs for contrastive learning.
+ """
+ from __future__ import annotations
+
+ import json
+ import os
+ from pathlib import Path
+ from typing import Any, Callable, Optional
+ from dataclasses import dataclass, asdict
+
+ from wisent.core.evaluators.benchmark_specific.coding.providers.livecodebench.provider import LiveCodeBenchProvider
+ from wisent.core.evaluators.benchmark_specific.coding.metrics.evaluator import CodingEvaluator, EvaluatorConfig
+ from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
+
+
+ @dataclass
+ class SolutionExample:
+     """A single solution example with evaluation result."""
+     model: str
+     code: str
+     result: str  # "good" or "bad"
+     status: str  # "ok", "compile_error", "runtime_error", "timeout"
+     elapsed: float
+
+
+ @dataclass
+ class ProblemSolutions:
+     """Solutions for a single problem."""
+     question_id: str
+     good_example: Optional[dict[str, Any]] = None
+     bad_example: Optional[dict[str, Any]] = None
+     difficulty: str = "unknown"
+     all_solutions: list[dict[str, Any]] = None
+
+     def __post_init__(self):
+         if self.all_solutions is None:
+             self.all_solutions = []
+
+
+ class LiveCodeBenchSolutionGenerator:
+     """
+     Generates and evaluates solutions for LiveCodeBench problems.
+
+     This replicates the wisent-core approach but as an independent system.
+     """
+
+     def __init__(
+         self,
+         model_fns: dict[str, Callable[[CodingTask], dict[str, str]]],
+         cache_dir: str = "./livecodebench_solutions",
+         evaluator_config: Optional[EvaluatorConfig] = None,
+     ):
+         """
+         Initialize the solution generator.
+
+         Args:
+             model_fns: Dictionary mapping model names to solution generation functions.
+                 Each function takes a CodingTask and returns a dict of files.
+             cache_dir: Directory to cache generated solutions.
+             evaluator_config: Optional configuration for code evaluation.
+         """
+         self.model_fns = model_fns
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+         self.evaluator_config = evaluator_config or EvaluatorConfig(
+             image="coding/sandbox:polyglot-1.0",
+             self_repair=False,
+             time_limit_s=8,
+             cpu_limit_s=3,
+             mem_limit_mb=768,
+         )
+
+         self.cache_file = self.cache_dir / "solutions.json"
+         self._cached_solutions: Optional[dict[str, ProblemSolutions]] = None
+
+     def _load_cache(self) -> dict[str, ProblemSolutions]:
+         """Load cached solutions from disk."""
+         if self._cached_solutions is not None:
+             return self._cached_solutions
+
+         if not self.cache_file.exists():
+             self._cached_solutions = {}
+             return {}
+
+         with open(self.cache_file, 'r') as f:
+             data = json.load(f)
+
+         solutions_map = {}
+         for item in data.get("problems", []):
+             problem_id = item["question_id"]
+             solutions_map[problem_id] = ProblemSolutions(
+                 question_id=problem_id,
+                 good_example=item.get("good_example"),
+                 bad_example=item.get("bad_example"),
+                 difficulty=item.get("difficulty", "unknown"),
+                 all_solutions=item.get("all_solutions", []),
+             )
+
+         self._cached_solutions = solutions_map
+         return solutions_map
+
+     def _save_cache(self, solutions: dict[str, ProblemSolutions]):
+         """Save solutions to disk."""
+         data = {
+             "total_problems": len(solutions),
+             "problems": [
+                 {
+                     "question_id": ps.question_id,
+                     "good_example": ps.good_example,
+                     "bad_example": ps.bad_example,
+                     "difficulty": ps.difficulty,
+                     "all_solutions": ps.all_solutions,
+                 }
+                 for ps in solutions.values()
+             ]
+         }
+
+         with open(self.cache_file, 'w') as f:
+             json.dump(data, f, indent=2)
+
+         print(f"Saved {len(solutions)} problem solutions to {self.cache_file}")
+
+     def generate_solutions(
+         self,
+         limit: Optional[int] = None,
+         platform: Optional[str] = None,
+         release_version: str = "all",
+         skip_existing: bool = True,
+     ):
+         """
+         Generate solutions for LiveCodeBench problems using multiple AI models.
+
+         Args:
+             limit: Maximum number of problems to process.
+             platform: Filter by platform (leetcode, codeforces, atcoder).
+             release_version: Dataset version (release_v1, release_v2, all).
+             skip_existing: Skip problems that already have good/bad pairs.
+         """
+         # Load cache
+         cached_solutions = self._load_cache()
+
+         # Load problems
+         provider = LiveCodeBenchProvider(
+             language="python",
+             limit=limit,
+             platform=platform,
+             release_version=release_version,
+         )
+
+         problems_processed = 0
+         problems_skipped = 0
+
+         print(f"Processing LiveCodeBench problems...")
+         print(f"Models: {list(self.model_fns.keys())}")
+
+         for idx, task in enumerate(provider.iter_tasks()):
+             question_id = task.options.get("problem_id", f"unknown_{idx}")
+
+             # Skip if already has good/bad pair
+             if skip_existing and question_id in cached_solutions:
+                 existing = cached_solutions[question_id]
+                 if existing.good_example and existing.bad_example:
+                     problems_skipped += 1
+                     continue
+
+             print(f"\n[{idx + 1}] Processing {question_id}...")
+
+             # Generate solutions with each model
+             solutions = []
+             for model_name, model_fn in self.model_fns.items():
+                 print(f" - Generating with {model_name}...")
+
+                 try:
+                     # Generate solution
+                     files = model_fn(task)
+
+                     # Evaluate solution
+                     evaluator = CodingEvaluator(
+                         provider=None,  # Not used for single evaluation
+                         model_fn=lambda _: files,
+                         repair_fn=None,
+                         cfg=self.evaluator_config,
+                     )
+
+                     result = evaluator._run_once(task, {**task.files, **files})
+
+                     # Determine if good or bad
+                     is_good = result.status == "ok"
+
+                     solution = SolutionExample(
+                         model=model_name,
+                         code=files.get("solution.py", ""),
+                         result="good" if is_good else "bad",
+                         status=result.status,
+                         elapsed=result.elapsed,
+                     )
+                     solutions.append(solution)
+
+                     print(f" Result: {solution.result} ({solution.status}, {solution.elapsed:.2f}s)")
+
+                 except Exception as e:
+                     print(f" Error: {e}")
+                     continue
+
+             # Select best good and bad examples
+             good_solutions = [s for s in solutions if s.result == "good"]
+             bad_solutions = [s for s in solutions if s.result == "bad"]
+
+             problem_solution = ProblemSolutions(
+                 question_id=question_id,
+                 difficulty=task.options.get("difficulty", "unknown"),
+                 all_solutions=[asdict(s) for s in solutions],
+             )
+
+             if good_solutions:
+                 # Prefer fastest good solution
+                 best_good = min(good_solutions, key=lambda s: s.elapsed)
+                 problem_solution.good_example = asdict(best_good)
+
+             if bad_solutions:
+                 # Prefer bad solution with fastest failure
+                 best_bad = min(bad_solutions, key=lambda s: s.elapsed)
+                 problem_solution.bad_example = asdict(best_bad)
+
+             # Update cache
+             cached_solutions[question_id] = problem_solution
+             problems_processed += 1
+
+             # Save periodically
+             if problems_processed % 10 == 0:
+                 self._save_cache(cached_solutions)
+                 print(f"\nProgress: {problems_processed} processed, {problems_skipped} skipped")
+
+         # Final save
+         self._save_cache(cached_solutions)
+
+         print(f"\n=== Generation Complete ===")
+         print(f"Problems processed: {problems_processed}")
+         print(f"Problems skipped: {problems_skipped}")
+         print(f"Total in cache: {len(cached_solutions)}")
+
+         # Summary statistics
+         with_good_bad = sum(1 for ps in cached_solutions.values() if ps.good_example and ps.bad_example)
+         print(f"Problems with good+bad pairs: {with_good_bad}")
+
+     def get_solutions(self, question_id: str) -> Optional[ProblemSolutions]:
+         """Get solutions for a specific problem."""
+         cached = self._load_cache()
+         return cached.get(question_id)
+
+     def get_all_solutions(self) -> dict[str, ProblemSolutions]:
+         """Get all cached solutions."""
+         return self._load_cache()
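
A minimal usage sketch of the new generator, based only on the API shown above: echo_model is a hypothetical stand-in (a real model_fn would prompt an LLM), and running this requires the Docker sandbox image referenced by EvaluatorConfig.

    from wisent.core.evaluators.benchmark_specific.coding.solution_generator import (
        LiveCodeBenchSolutionGenerator,
    )

    def echo_model(task):
        # Hypothetical placeholder: must return a mapping of file names to file contents.
        return {"solution.py": "print('hello')"}

    generator = LiveCodeBenchSolutionGenerator(
        model_fns={"echo-model": echo_model},
        cache_dir="./livecodebench_solutions",
    )
    generator.generate_solutions(limit=5, platform="leetcode", release_version="release_v1")
    pairs = generator.get_all_solutions()  # question_id -> ProblemSolutions with good/bad examples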
wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py (new file)
@@ -0,0 +1,79 @@
+ """Exact match evaluator for benchmarks requiring exact string matching.
+
+ Used for tasks like GSM8K, TriviaQA where the answer must match exactly
+ (possibly after normalization).
+ """
+
+ from typing import Any
+ import logging
+
+ from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class ExactMatchEvaluator(BaseEvaluator):
+     """Evaluator using exact match for answer comparison.
+
+     Compatible with:
+     - GSM8K: Math problems (numerical exact match)
+     - TriviaQA: Factual questions (text exact match)
+     - Any task requiring exact answer matching
+     """
+
+     name = "exact_match"
+     description = "Exact match evaluator for precise answer comparison"
+     task_names = ("gsm8k", "triviaqa")
+
+     def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+         """Evaluate using exact match.
+
+         Args:
+             response: Generated or extracted answer
+             expected: Expected answer (can be list of acceptable answers)
+             **kwargs:
+                 normalize: Whether to normalize before comparison (default: True)
+                 case_sensitive: Whether match is case-sensitive (default: False)
+
+         Returns:
+             EvalResult with TRUTHFUL/UNTRUTHFUL
+         """
+         normalize = kwargs.get('normalize', True)
+         case_sensitive = kwargs.get('case_sensitive', False)
+
+         # Handle list of acceptable answers
+         if isinstance(expected, list):
+             expected_answers = expected
+         else:
+             expected_answers = [expected]
+
+         # Prepare response
+         response_clean = str(response).strip()
+         if normalize:
+             response_clean = self.normalize_text(response_clean)
+         if not case_sensitive:
+             response_clean = response_clean.lower()
+
+         # Check each expected answer
+         for exp in expected_answers:
+             exp_clean = str(exp).strip()
+             if normalize:
+                 exp_clean = self.normalize_text(exp_clean)
+             if not case_sensitive:
+                 exp_clean = exp_clean.lower()
+
+             if response_clean == exp_clean:
+                 return EvalResult(
+                     ground_truth="TRUTHFUL",
+                     method_used=self.name,
+                     confidence=1.0,
+                     details=f"Exact match: '{response}' == '{exp}'",
+                     meta={"matched_answer": exp}
+                 )
+
+         return EvalResult(
+             ground_truth="UNTRUTHFUL",
+             method_used=self.name,
+             confidence=0.0,
+             details=f"No match: '{response}' not in {expected_answers}",
+         )
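
A rough sketch of calling the evaluator above, assuming BaseEvaluator can be instantiated without arguments and that normalize_text (defined in wisent.core.evaluators.core.atoms) only lowercases and strips whitespace/punctuation.

    from wisent.core.evaluators.benchmark_specific.exact_match_evaluator import ExactMatchEvaluator

    evaluator = ExactMatchEvaluator()
    result = evaluator.evaluate("Paris", expected=["Paris", "the city of Paris"])
    # With normalize=True and case_sensitive=False (the defaults), "Paris" matches the
    # first expected answer, so ground_truth is "TRUTHFUL" and confidence is 1.0.
    print(result.ground_truth, result.confidence)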
wisent/core/evaluators/benchmark_specific/f1_evaluator.py (new file)
@@ -0,0 +1,101 @@
+ """F1 score evaluator for benchmarks requiring token-level comparison.
+
+ Used for reading comprehension tasks where partial credit is appropriate.
+ """
+
+ from typing import Any, Set
+ import logging
+
+ from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class F1Evaluator(BaseEvaluator):
+     """Evaluator using F1 score for token-level comparison.
+
+     Compatible with:
+     - DROP: Reading comprehension with discrete reasoning
+     - SQuAD: Question answering
+     - Any task where partial token overlap should be rewarded
+     """
+
+     name = "f1"
+     description = "F1 score evaluator for token-level comparison"
+     task_names = ("drop", "squad")
+
+     def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+         """Evaluate using F1 score.
+
+         Args:
+             response: Generated answer
+             expected: Expected answer (can be list of acceptable answers)
+             **kwargs:
+                 normalize: Whether to normalize tokens (default: True)
+
+         Returns:
+             EvalResult with F1 score as confidence
+         """
+         normalize = kwargs.get('normalize', True)
+
+         # Handle list of acceptable answers - use best F1
+         if isinstance(expected, list):
+             expected_answers = expected
+         else:
+             expected_answers = [expected]
+
+         best_f1 = 0.0
+         best_match = None
+
+         for exp in expected_answers:
+             f1 = self._compute_f1(response, str(exp), normalize)
+             if f1 > best_f1:
+                 best_f1 = f1
+                 best_match = exp
+
+         # Determine ground truth based on F1 threshold
+         if best_f1 >= 0.8:
+             ground_truth = "TRUTHFUL"
+         elif best_f1 >= 0.5:
+             ground_truth = "UNKNOWN"  # Partial match
+         else:
+             ground_truth = "UNTRUTHFUL"
+
+         return EvalResult(
+             ground_truth=ground_truth,
+             method_used=self.name,
+             confidence=best_f1,
+             details=f"F1 score: {best_f1:.3f} (response vs '{best_match}')",
+             meta={"f1_score": best_f1, "matched_answer": best_match}
+         )
+
+     def _compute_f1(self, response: str, expected: str, normalize: bool = True) -> float:
+         """Compute F1 score between response and expected.
+
+         Returns:
+             F1 score in [0, 1]
+         """
+         # Tokenize
+         response_tokens = self._tokenize(response, normalize)
+         expected_tokens = self._tokenize(expected, normalize)
+
+         if not response_tokens or not expected_tokens:
+             return 0.0
+
+         # Compute precision and recall
+         common = response_tokens & expected_tokens
+         if not common:
+             return 0.0
+
+         precision = len(common) / len(response_tokens)
+         recall = len(common) / len(expected_tokens)
+
+         # Compute F1
+         f1 = 2 * (precision * recall) / (precision + recall)
+         return f1
+
+     def _tokenize(self, text: str, normalize: bool = True) -> Set[str]:
+         """Tokenize text into set of tokens."""
+         if normalize:
+             text = self.normalize_text(text)
+         return set(text.split())
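
For intuition, a worked example of the token-level F1 computed above, assuming a no-argument constructor and that normalize_text does not drop tokens beyond lowercasing.

    from wisent.core.evaluators.benchmark_specific.f1_evaluator import F1Evaluator

    evaluator = F1Evaluator()
    result = evaluator.evaluate("the quick brown fox", expected="quick fox")
    # Response tokens {the, quick, brown, fox}, expected tokens {quick, fox}, common {quick, fox}.
    # precision = 2/4 = 0.5, recall = 2/2 = 1.0, F1 = 2 * 0.5 * 1.0 / 1.5 ~= 0.667,
    # which lands in the partial-match band (0.5 <= F1 < 0.8), so ground_truth is "UNKNOWN".
    print(result.confidence, result.ground_truth)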
wisent/core/evaluators/benchmark_specific/generation_evaluator.py (new file)
@@ -0,0 +1,197 @@
+ """Generation-based evaluator for benchmarks that require text generation.
+
+ This evaluator handles tasks like GSM8K, DROP, TriviaQA where the model generates
+ free-form text that must be parsed and compared to reference answers.
+ """
+
+ import re
+ from typing import Any, Dict
+ import logging
+
+ from wisent.core.evaluators.core.atoms import BaseEvaluator, EvalResult
+
+ logger = logging.getLogger(__name__)
+
+
+ class GenerationEvaluator(BaseEvaluator):
+     """Evaluator for generation-based benchmarks.
+
+     Handles tasks that require text generation followed by answer extraction:
+     - GSM8K: Math word problems requiring numerical answers
+     - DROP: Reading comprehension with discrete reasoning
+     - TriviaQA: Trivia questions requiring factual answers
+
+     Methods:
+     - Exact match: Direct string comparison
+     - Normalized match: Lenient comparison after normalization
+     - Numerical match: For math problems
+     """
+
+     name = "generation"
+     description = "Generation-based evaluator for text generation tasks"
+     task_names = ("gsm8k", "drop", "triviaqa")
+
+     def evaluate(self, response: str, expected: Any, **kwargs) -> EvalResult:
+         """Evaluate generated response against expected answer.
+
+         Args:
+             response: Generated model response
+             expected: Expected answer (str, int, float, or list of acceptable answers)
+             **kwargs:
+                 task_name: Task name for task-specific logic
+                 answer_type: Type of answer ("numerical", "text", "exact")
+                 normalize: Whether to normalize strings before comparison
+
+         Returns:
+             EvalResult with TRUTHFUL/UNTRUTHFUL/UNKNOWN
+         """
+         task_name = kwargs.get('task_name', '')
+         answer_type = kwargs.get('answer_type', 'text')
+         normalize = kwargs.get('normalize', True)
+
+         # Extract answer from generated response
+         extracted_answer = self._extract_answer(response, task_name, answer_type)
+
+         if extracted_answer is None:
+             return EvalResult(
+                 ground_truth="UNKNOWN",
+                 method_used=self.name,
+                 confidence=0.0,
+                 details="Could not extract answer from generated response",
+             )
+
+         # Normalize expected answer
+         if isinstance(expected, list):
+             expected_answers = expected
+         else:
+             expected_answers = [expected]
+
+         # Check if extracted answer matches any expected answer
+         is_correct, matched_answer, confidence = self._check_match(
+             extracted_answer, expected_answers, answer_type, normalize
+         )
+
+         return EvalResult(
+             ground_truth="TRUTHFUL" if is_correct else "UNTRUTHFUL",
+             method_used=self.name,
+             confidence=confidence,
+             details=f"Extracted: '{extracted_answer}', Expected: '{matched_answer or expected_answers[0]}'",
+             meta={
+                 "extracted_answer": extracted_answer,
+                 "expected_answers": expected_answers,
+                 "matched_answer": matched_answer,
+                 "answer_type": answer_type,
+             }
+         )
+
+     def _extract_answer(self, response: str, task_name: str, answer_type: str) -> Any:
+         """Extract answer from generated response."""
+         if answer_type == "numerical" or task_name == "gsm8k":
+             return self._extract_numerical_answer(response)
+         else:
+             return self._extract_text_answer(response)
+
+     def _extract_numerical_answer(self, response: str) -> float:
+         """Extract numerical answer from response (for math problems)."""
+         # Look for common patterns
+         patterns = [
+             r'####\s*([-+]?\d*\.?\d+)',  # GSM8K format
+             r'answer\s*is\s*([-+]?\d*\.?\d+)',
+             r'=\s*([-+]?\d*\.?\d+)\s*$',
+             r'\$?\s*([-+]?\d*\.?\d+)',
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, response, re.IGNORECASE)
+             if match:
+                 try:
+                     return float(match.group(1))
+                 except ValueError:
+                     continue
+
+         # Fallback: find last number in response
+         numbers = re.findall(r'[-+]?\d*\.?\d+', response)
+         if numbers:
+             try:
+                 return float(numbers[-1])
+             except ValueError:
+                 pass
+
+         return None
+
+     def _extract_text_answer(self, response: str) -> str:
+         """Extract text answer from response."""
+         # Look for explicit answer markers
+         patterns = [
+             r'answer\s*is:?\s*(.+?)(?:\n|$)',
+             r'final\s+answer:?\s*(.+?)(?:\n|$)',
+             r'(?:^|\n)answer:?\s*(.+?)(?:\n|$)',
+         ]
+
+         for pattern in patterns:
+             match = re.search(pattern, response, re.IGNORECASE)
+             if match:
+                 return match.group(1).strip()
+
+         # Fallback: use first sentence
+         sentences = re.split(r'[.!?]\s+', response)
+         if sentences:
+             return sentences[0].strip()
+
+         return response.strip()
+
+     def _check_match(
+         self, extracted: Any, expected_list: list, answer_type: str, normalize: bool
+     ) -> tuple:
+         """Check if extracted answer matches any expected answer.
+
+         Returns:
+             (is_correct, matched_answer, confidence)
+         """
+         if answer_type == "numerical":
+             return self._check_numerical_match(extracted, expected_list)
+         else:
+             return self._check_text_match(extracted, expected_list, normalize)
+
+     def _check_numerical_match(self, extracted: float, expected_list: list) -> tuple:
+         """Check numerical match with tolerance."""
+         if extracted is None:
+             return False, None, 0.0
+
+         for expected in expected_list:
+             try:
+                 expected_num = float(expected)
+                 # Check if close enough (tolerance for floating point)
+                 if abs(extracted - expected_num) < 1e-6:
+                     return True, expected, 1.0
+             except (ValueError, TypeError):
+                 continue
+
+         return False, None, 0.0
+
+     def _check_text_match(self, extracted: str, expected_list: list, normalize: bool) -> tuple:
+         """Check text match with optional normalization."""
+         if extracted is None:
+             return False, None, 0.0
+
+         if normalize:
+             extracted_norm = self.normalize_text(extracted)
+         else:
+             extracted_norm = extracted
+
+         for expected in expected_list:
+             expected_str = str(expected)
+             if normalize:
+                 expected_norm = self.normalize_text(expected_str)
+             else:
+                 expected_norm = expected_str
+
+             # Exact match
+             if extracted_norm == expected_norm:
+                 return True, expected, 1.0
+
+             # Substring match
+             if extracted_norm in expected_norm or expected_norm in extracted_norm:
+                 return True, expected, 0.8
+
+         return False, None, 0.0
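
A short GSM8K-style sketch of the extraction and matching flow above, again assuming the evaluator can be constructed without arguments.

    from wisent.core.evaluators.benchmark_specific.generation_evaluator import GenerationEvaluator

    evaluator = GenerationEvaluator()
    response = "She has 16 - 3 - 4 = 9 eggs left, so she earns 9 * 2 = 18 dollars.\n#### 18"
    result = evaluator.evaluate(response, expected="18", task_name="gsm8k", answer_type="numerical")
    # _extract_numerical_answer matches the GSM8K '#### 18' marker and returns 18.0;
    # _check_numerical_match then compares it to float("18") within a 1e-6 tolerance.
    print(result.ground_truth)  # "TRUTHFUL"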
wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py (moved from wisent/core/log_likelihoods_evaluator.py)
@@ -9,13 +9,21 @@ directly on each choice to evaluate performance against known ground truth.
 
  import logging
  from typing import Any, Dict, Optional
+ from dataclasses import dataclass
 
- from wisent.core.activations import ActivationAggregationStrategy, Activations
- from wisent.core.layer import Layer
+ from wisent.core.activations.core.atoms import ActivationAggregationStrategy
+ from wisent.core.activations.activations import Activations
 
  logger = logging.getLogger(__name__)
 
 
+ @dataclass
+ class Layer:
+     """Simple layer metadata class."""
+     index: int
+     type: str = "transformer"
+
+
  class LogLikelihoodsEvaluator:
      """
      Evaluator for log-likelihoods based ground truth assessment.
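
The net effect of this hunk: the evaluator no longer imports Layer from wisent.core.layer and instead defines a local Layer dataclass, while Activations and ActivationAggregationStrategy now come from the reorganized activations package. A sketch of code written against the new layout (module paths taken from the diff; importing Layer from the evaluator module is an assumption about how callers would reach it):

    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
    from wisent.core.activations.activations import Activations
    from wisent.core.evaluators.benchmark_specific.log_likelihoods_evaluator import Layer

    layer = Layer(index=15)  # type defaults to "transformer"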