wisent 0.5.12__py3-none-any.whl → 0.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (227)
  1. wisent/__init__.py +1 -1
  2. wisent/core/activations/__init__.py +26 -0
  3. wisent/core/activations/activations.py +96 -0
  4. wisent/core/activations/activations_collector.py +71 -20
  5. wisent/core/activations/prompt_construction_strategy.py +47 -0
  6. wisent/core/agent/__init__.py +1 -18
  7. wisent/core/agent/budget.py +2 -2
  8. wisent/core/agent/device_benchmarks.py +1 -1
  9. wisent/core/agent/diagnose/__init__.py +1 -55
  10. wisent/core/agent/diagnose/classifier_marketplace.py +8 -8
  11. wisent/core/agent/diagnose/response_diagnostics.py +4 -4
  12. wisent/core/agent/diagnose/synthetic_classifier_option.py +1 -1
  13. wisent/core/agent/diagnose/tasks/task_manager.py +3 -3
  14. wisent/core/agent/diagnose.py +2 -1
  15. wisent/core/autonomous_agent.py +10 -2
  16. wisent/core/benchmark_extractors.py +293 -0
  17. wisent/core/bigcode_integration.py +20 -7
  18. wisent/core/branding.py +108 -0
  19. wisent/core/cli/__init__.py +15 -0
  20. wisent/core/cli/create_steering_vector.py +138 -0
  21. wisent/core/cli/evaluate_responses.py +715 -0
  22. wisent/core/cli/generate_pairs.py +128 -0
  23. wisent/core/cli/generate_pairs_from_task.py +119 -0
  24. wisent/core/cli/generate_responses.py +129 -0
  25. wisent/core/cli/generate_vector_from_synthetic.py +149 -0
  26. wisent/core/cli/generate_vector_from_task.py +147 -0
  27. wisent/core/cli/get_activations.py +191 -0
  28. wisent/core/cli/optimize_classification.py +339 -0
  29. wisent/core/cli/optimize_steering.py +364 -0
  30. wisent/core/cli/tasks.py +182 -0
  31. wisent/core/cli_logger.py +22 -0
  32. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +27 -1
  33. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +49 -1
  34. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_challenge.py +115 -0
  35. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arc_easy.py +115 -0
  36. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/arithmetic.py +111 -0
  37. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/asdiv.py +119 -0
  38. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/boolq.py +115 -0
  39. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/cb.py +114 -0
  40. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/copa.py +118 -0
  41. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/coqa.py +146 -0
  42. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/drop.py +129 -0
  43. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/gsm8k.py +119 -0
  44. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/headqa.py +112 -0
  45. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/hellaswag.py +113 -0
  46. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/livecodebench.py +367 -0
  47. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa.py +115 -0
  48. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/logiqa2.py +114 -0
  49. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mc-taco.py +113 -0
  50. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/medqa.py +112 -0
  51. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mrpc.py +111 -0
  52. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/multirc.py +114 -0
  53. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/mutual.py +113 -0
  54. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/openbookqa.py +115 -0
  55. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pawsx.py +111 -0
  56. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/piqa.py +113 -0
  57. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/prost.py +113 -0
  58. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/pubmedqa.py +112 -0
  59. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qa4mre.py +116 -0
  60. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qasper.py +115 -0
  61. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qnli.py +111 -0
  62. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/qqp.py +111 -0
  63. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/race.py +121 -0
  64. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/record.py +121 -0
  65. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/rte.py +111 -0
  66. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sciq.py +110 -0
  67. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/social_iqa.py +114 -0
  68. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/squad2.py +124 -0
  69. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/sst2.py +111 -0
  70. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/swag.py +112 -0
  71. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/triviaqa.py +127 -0
  72. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_gen.py +112 -0
  73. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc1.py +117 -0
  74. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/truthfulqa_mc2.py +117 -0
  75. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/webqs.py +127 -0
  76. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wic.py +119 -0
  77. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +1 -1
  78. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wnli.py +111 -0
  79. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/wsc.py +114 -0
  80. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xnli.py +112 -0
  81. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xstorycloze.py +114 -0
  82. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/xwinograd.py +114 -0
  83. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +1 -1
  84. wisent/core/data_loaders/__init__.py +235 -0
  85. wisent/core/data_loaders/loaders/lm_loader.py +2 -2
  86. wisent/core/data_loaders/loaders/task_interface_loader.py +300 -0
  87. wisent/{cli/data_loaders/data_loader_rotator.py → core/data_loaders/rotator.py} +1 -1
  88. wisent/core/download_full_benchmarks.py +79 -2
  89. wisent/core/evaluators/benchmark_specific/__init__.py +26 -0
  90. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/evaluator.py +17 -17
  91. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/cpp_sanitizer.py +2 -2
  92. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/java_sanitizer.py +2 -2
  93. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/python_sanitizer.py +2 -2
  94. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/__init__.py +3 -0
  95. wisent/core/evaluators/benchmark_specific/coding/providers/livecodebench/provider.py +305 -0
  96. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/runtime.py +36 -4
  97. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/entrypoint.py +2 -4
  98. wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/recipes.py +1 -1
  99. wisent/core/evaluators/benchmark_specific/coding/solution_generator.py +258 -0
  100. wisent/core/evaluators/benchmark_specific/exact_match_evaluator.py +79 -0
  101. wisent/core/evaluators/benchmark_specific/f1_evaluator.py +101 -0
  102. wisent/core/evaluators/benchmark_specific/generation_evaluator.py +197 -0
  103. wisent/core/{log_likelihoods_evaluator.py → evaluators/benchmark_specific/log_likelihoods_evaluator.py} +10 -2
  104. wisent/core/evaluators/benchmark_specific/perplexity_evaluator.py +140 -0
  105. wisent/core/evaluators/benchmark_specific/personalization_evaluator.py +250 -0
  106. wisent/{cli/evaluators/evaluator_rotator.py → core/evaluators/rotator.py} +4 -4
  107. wisent/core/lm_eval_harness_ground_truth.py +3 -2
  108. wisent/core/main.py +57 -0
  109. wisent/core/model_persistence.py +2 -2
  110. wisent/core/models/wisent_model.py +6 -6
  111. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +2 -2
  112. wisent/core/optuna/steering/steering_optimization.py +1 -1
  113. wisent/core/parser_arguments/__init__.py +10 -0
  114. wisent/core/parser_arguments/agent_parser.py +110 -0
  115. wisent/core/parser_arguments/configure_model_parser.py +7 -0
  116. wisent/core/parser_arguments/create_steering_vector_parser.py +59 -0
  117. wisent/core/parser_arguments/evaluate_parser.py +40 -0
  118. wisent/core/parser_arguments/evaluate_responses_parser.py +10 -0
  119. wisent/core/parser_arguments/full_optimize_parser.py +115 -0
  120. wisent/core/parser_arguments/generate_pairs_from_task_parser.py +33 -0
  121. wisent/core/parser_arguments/generate_pairs_parser.py +29 -0
  122. wisent/core/parser_arguments/generate_responses_parser.py +15 -0
  123. wisent/core/parser_arguments/generate_vector_from_synthetic_parser.py +127 -0
  124. wisent/core/parser_arguments/generate_vector_from_task_parser.py +127 -0
  125. wisent/core/parser_arguments/generate_vector_parser.py +90 -0
  126. wisent/core/parser_arguments/get_activations_parser.py +90 -0
  127. wisent/core/parser_arguments/main_parser.py +152 -0
  128. wisent/core/parser_arguments/model_config_parser.py +59 -0
  129. wisent/core/parser_arguments/monitor_parser.py +17 -0
  130. wisent/core/parser_arguments/multi_steer_parser.py +47 -0
  131. wisent/core/parser_arguments/optimize_classification_parser.py +67 -0
  132. wisent/core/parser_arguments/optimize_sample_size_parser.py +58 -0
  133. wisent/core/parser_arguments/optimize_steering_parser.py +147 -0
  134. wisent/core/parser_arguments/synthetic_parser.py +93 -0
  135. wisent/core/parser_arguments/tasks_parser.py +584 -0
  136. wisent/core/parser_arguments/test_nonsense_parser.py +26 -0
  137. wisent/core/parser_arguments/utils.py +111 -0
  138. wisent/core/prompts/core/prompt_formater.py +3 -3
  139. wisent/core/prompts/prompt_stratiegies/direct_completion.py +2 -0
  140. wisent/core/prompts/prompt_stratiegies/instruction_following.py +2 -0
  141. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +2 -0
  142. wisent/core/prompts/prompt_stratiegies/role_playing.py +2 -0
  143. wisent/{cli/steering_methods/steering_rotator.py → core/steering_methods/rotator.py} +4 -4
  144. wisent/core/steering_optimizer.py +45 -21
  145. wisent/{synthetic → core/synthetic}/cleaners/deduper_cleaner.py +3 -3
  146. wisent/{synthetic → core/synthetic}/cleaners/methods/base_dedupers.py +2 -2
  147. wisent/{synthetic → core/synthetic}/cleaners/methods/base_refusalers.py +1 -1
  148. wisent/{synthetic → core/synthetic}/cleaners/pairs_cleaner.py +5 -5
  149. wisent/{synthetic → core/synthetic}/cleaners/refusaler_cleaner.py +4 -4
  150. wisent/{synthetic → core/synthetic}/db_instructions/mini_dp.py +1 -1
  151. wisent/{synthetic → core/synthetic}/generators/diversities/methods/fast_diversity.py +1 -1
  152. wisent/{synthetic → core/synthetic}/generators/pairs_generator.py +38 -12
  153. wisent/core/tasks/livecodebench_task.py +4 -103
  154. wisent/core/timing_calibration.py +1 -1
  155. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/METADATA +3 -3
  156. wisent-0.5.14.dist-info/RECORD +294 -0
  157. wisent-0.5.14.dist-info/entry_points.txt +2 -0
  158. wisent/benchmarks/coding/providers/livecodebench/provider.py +0 -53
  159. wisent/classifiers/core/atoms.py +0 -747
  160. wisent/classifiers/models/logistic.py +0 -29
  161. wisent/classifiers/models/mlp.py +0 -47
  162. wisent/cli/classifiers/classifier_rotator.py +0 -137
  163. wisent/cli/cli_logger.py +0 -142
  164. wisent/cli/wisent_cli/commands/help_cmd.py +0 -52
  165. wisent/cli/wisent_cli/commands/listing.py +0 -154
  166. wisent/cli/wisent_cli/commands/train_cmd.py +0 -322
  167. wisent/cli/wisent_cli/main.py +0 -93
  168. wisent/cli/wisent_cli/shell.py +0 -80
  169. wisent/cli/wisent_cli/ui.py +0 -69
  170. wisent/cli/wisent_cli/util/aggregations.py +0 -43
  171. wisent/cli/wisent_cli/util/parsing.py +0 -126
  172. wisent/cli/wisent_cli/version.py +0 -4
  173. wisent/opti/methods/__init__.py +0 -0
  174. wisent/synthetic/__init__.py +0 -0
  175. wisent/synthetic/cleaners/__init__.py +0 -0
  176. wisent/synthetic/cleaners/core/__init__.py +0 -0
  177. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  178. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  179. wisent/synthetic/db_instructions/__init__.py +0 -0
  180. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  181. wisent/synthetic/generators/__init__.py +0 -0
  182. wisent/synthetic/generators/core/__init__.py +0 -0
  183. wisent/synthetic/generators/diversities/__init__.py +0 -0
  184. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  185. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  186. wisent-0.5.12.dist-info/RECORD +0 -220
  187. /wisent/{benchmarks → core/evaluators/benchmark_specific/coding}/__init__.py +0 -0
  188. /wisent/{benchmarks/coding → core/evaluators/benchmark_specific/coding/metrics}/__init__.py +0 -0
  189. /wisent/{benchmarks/coding/metrics → core/evaluators/benchmark_specific/coding/metrics/core}/__init__.py +0 -0
  190. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/core/atoms.py +0 -0
  191. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/metrics/passk.py +0 -0
  192. /wisent/{benchmarks/coding/metrics/core → core/evaluators/benchmark_specific/coding/output_sanitizer}/__init__.py +0 -0
  193. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/output_sanitizer/core}/__init__.py +0 -0
  194. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/core/atoms.py +0 -0
  195. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/output_sanitizer/utils.py +0 -0
  196. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/__init__.py +0 -0
  197. /wisent/{benchmarks/coding/output_sanitizer → core/evaluators/benchmark_specific/coding/providers}/core/__init__.py +0 -0
  198. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/providers/core/atoms.py +0 -0
  199. /wisent/{benchmarks/coding/providers/core → core/evaluators/benchmark_specific/coding/safe_docker}/__init__.py +0 -0
  200. /wisent/{benchmarks/coding/providers/livecodebench → core/evaluators/benchmark_specific/coding/safe_docker/core}/__init__.py +0 -0
  201. /wisent/{benchmarks → core/evaluators/benchmark_specific}/coding/safe_docker/core/atoms.py +0 -0
  202. /wisent/{benchmarks/coding/safe_docker → core/opti}/__init__.py +0 -0
  203. /wisent/{benchmarks/coding/safe_docker → core/opti}/core/__init__.py +0 -0
  204. /wisent/{opti → core/opti}/core/atoms.py +0 -0
  205. /wisent/{classifiers → core/opti/methods}/__init__.py +0 -0
  206. /wisent/{opti → core/opti}/methods/opti_classificator.py +0 -0
  207. /wisent/{opti → core/opti}/methods/opti_steering.py +0 -0
  208. /wisent/{classifiers/core → core/synthetic}/__init__.py +0 -0
  209. /wisent/{classifiers/models → core/synthetic/cleaners}/__init__.py +0 -0
  210. /wisent/{cli → core/synthetic/cleaners/core}/__init__.py +0 -0
  211. /wisent/{synthetic → core/synthetic}/cleaners/core/atoms.py +0 -0
  212. /wisent/{cli/classifiers → core/synthetic/cleaners/methods}/__init__.py +0 -0
  213. /wisent/{cli/data_loaders → core/synthetic/cleaners/methods/core}/__init__.py +0 -0
  214. /wisent/{synthetic → core/synthetic}/cleaners/methods/core/atoms.py +0 -0
  215. /wisent/{cli/evaluators → core/synthetic/db_instructions}/__init__.py +0 -0
  216. /wisent/{cli/steering_methods → core/synthetic/db_instructions/core}/__init__.py +0 -0
  217. /wisent/{synthetic → core/synthetic}/db_instructions/core/atoms.py +0 -0
  218. /wisent/{cli/wisent_cli → core/synthetic/generators}/__init__.py +0 -0
  219. /wisent/{cli/wisent_cli/commands → core/synthetic/generators/core}/__init__.py +0 -0
  220. /wisent/{synthetic → core/synthetic}/generators/core/atoms.py +0 -0
  221. /wisent/{cli/wisent_cli/util → core/synthetic/generators/diversities}/__init__.py +0 -0
  222. /wisent/{opti → core/synthetic/generators/diversities/core}/__init__.py +0 -0
  223. /wisent/{synthetic → core/synthetic}/generators/diversities/core/core.py +0 -0
  224. /wisent/{opti/core → core/synthetic/generators/diversities/methods}/__init__.py +0 -0
  225. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/WHEEL +0 -0
  226. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/licenses/LICENSE +0 -0
  227. {wisent-0.5.12.dist-info → wisent-0.5.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,26 @@
1
+ """Benchmark-specific evaluators for lm-eval tasks.
2
+
3
+ This module provides evaluation methods that match lm-eval's native approaches:
4
+ - Log likelihood evaluation for multiple-choice tasks
5
+ - Generation evaluation for text generation tasks
6
+ - Exact match evaluation for precise answer matching
7
+ - F1 evaluation for token-level comparison
8
+ - Perplexity evaluation for language modeling
9
+ - Personalization evaluation for personality trait manifestation
10
+ """
11
+
12
+ from .log_likelihoods_evaluator import LogLikelihoodsEvaluator
13
+ from .generation_evaluator import GenerationEvaluator
14
+ from .exact_match_evaluator import ExactMatchEvaluator
15
+ from .f1_evaluator import F1Evaluator
16
+ from .perplexity_evaluator import PerplexityEvaluator
17
+ from .personalization_evaluator import PersonalizationEvaluator
18
+
19
+ __all__ = [
20
+ 'LogLikelihoodsEvaluator',
21
+ 'GenerationEvaluator',
22
+ 'ExactMatchEvaluator',
23
+ 'F1Evaluator',
24
+ 'PerplexityEvaluator',
25
+ 'PersonalizationEvaluator',
26
+ ]
@@ -2,19 +2,19 @@ from __future__ import annotations
2
2
  from dataclasses import dataclass
3
3
  from typing import Callable, Iterable, Optional, TYPE_CHECKING
4
4
 
5
- from wisent.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
6
- from wisent.benchmarks.coding.safe_docker.recipes import RECIPE_REGISTRY
7
- from wisent.benchmarks.coding.metrics.core.atoms import SampleOutcome, Evaluator
5
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
6
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.recipes import RECIPE_REGISTRY
7
+ from wisent.core.evaluators.benchmark_specific.coding.metrics.core.atoms import SampleOutcome, Evaluator
8
8
 
9
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema
10
- from wisent.benchmarks.coding.output_sanitizer.python_sanitizer import PythonStandardizer
11
- from wisent.benchmarks.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
12
- from wisent.benchmarks.coding.output_sanitizer.java_sanitizer import JavaStandardizer
9
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema
10
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.python_sanitizer import PythonStandardizer
11
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
12
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.java_sanitizer import JavaStandardizer
13
13
 
14
14
  if TYPE_CHECKING:
15
- from wisent.benchmarks.coding.safe_docker.core.atoms import Result
16
- from wisent.benchmarks.coding.providers.core.atoms import Provider, CodingTask
17
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import CodeStandardizer
15
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
16
+ from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import Provider, CodingTask
17
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import CodeStandardizer
18
18
 
19
19
  RepairFn = Callable[[str, dict[str,str], str], dict[str,str]]
20
20
 
@@ -82,7 +82,7 @@ def _make_schema(task: CodingTask) -> TaskSchema:
82
82
  and allow_wrapper set appropriately.
83
83
 
84
84
  example:
85
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
85
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
86
86
  >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
87
87
  >>> schema = _make_schema(task)
88
88
  >>> schema.language
@@ -128,7 +128,7 @@ class CodingEvaluator(Evaluator):
128
128
  Feedback string summarizing the result, truncated to cfg.feedback_max_chars.
129
129
 
130
130
  examples:
131
- >>> from wisent.benchmarks.coding.safe_docker.core.atoms import Result
131
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
132
132
  >>> res = Result(status="timeout", stdout="", stderr="", elapsed=10.0)
133
133
  >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {}, cfg=EvaluatorConfig())
134
134
  >>> evaluator._feedback(res)
@@ -163,8 +163,8 @@ class CodingEvaluator(Evaluator):
163
163
  Result object containing the status, stdout, stderr, and elapsed time.
164
164
 
165
165
  examples:
166
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
167
- >>> from wisent.benchmarks.coding.safe_docker.core.atoms import Result
166
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
167
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result
168
168
  >>> task = CodingTask(language="python", files={}, options={})
169
169
  >>> files = {"solution.py": "def add(a,b): return a + b", "tests.py": "from solution import add\ndef test_ok(): assert add(1,2)==3"}
170
170
  >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {})
@@ -181,7 +181,7 @@ class CodingEvaluator(Evaluator):
181
181
  0.23
182
182
  """
183
183
  recipe = RECIPE_REGISTRY[task.language]
184
- job = recipe.make_job(files, **task.options,
184
+ job = recipe.make_job(**task.options,
185
185
  time_limit_s=self.cfg.time_limit_s,
186
186
  cpu_limit_s=self.cfg.cpu_limit_s,
187
187
  mem_limit_mb=self.cfg.mem_limit_mb)
@@ -201,7 +201,7 @@ class CodingEvaluator(Evaluator):
201
201
  The sanitized files if pre_sanitize is True and a sanitizer exists for the language; otherwise, the original files.
202
202
 
203
203
  examples:
204
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask
204
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask
205
205
  >>> task = CodingTask(language="python", files={}, options={"entry_point":"add","file_name":"my_solution.py"})
206
206
  >>> files = {"my_solution.py": "def add(a,b): return a - b # BUG"}
207
207
  >>> evaluator = CodingEvaluator(provider=None, model_fn=lambda x: {}, cfg=EvaluatorConfig(pre_sanitize=True))
@@ -234,7 +234,7 @@ class CodingEvaluator(Evaluator):
234
234
  SampleOutcome for each task, indicating pass/fail status and elapsed time.
235
235
 
236
236
  examples:
237
- >>> from wisent.benchmarks.coding.providers.core.atoms import CodingTask, Provider
237
+ >>> from wisent.core.evaluators.benchmark_specific.coding.providers.core.atoms import CodingTask, Provider
238
238
  >>> class DummyProvider:
239
239
  ... name = "dummy"
240
240
  ... def iter_tasks(self):
@@ -1,8 +1,8 @@
1
1
  from __future__ import annotations
2
2
  import re
3
3
  from typing import List
4
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
5
- from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
4
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
5
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
6
6
 
7
7
  FUNC_RE = re.compile(r"^\s*(?:template<[^>]+>\s*)?(?:[\w:\s*&<>,]+)\s+(\w+)\s*\(", re.MULTILINE)
8
8
  CLASS_RE = re.compile(r"^\s*class\s+(\w+)\s*[{:]", re.MULTILINE)
@@ -2,8 +2,8 @@
2
2
  from __future__ import annotations
3
3
  import re
4
4
  from typing import List
5
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
- from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
5
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace
7
7
 
8
8
  CLASS_RE = re.compile(r"\bclass\s+([A-Za-z_]\w*)")
9
9
  METHOD_RE = re.compile(r"(public\s+static\s+[\w\<\>\[\]]+\s+)(\w+)\s*\(")
@@ -2,8 +2,8 @@
2
2
  from __future__ import annotations
3
3
  import ast, re
4
4
  from typing import List
5
- from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
- from wisent.benchmarks.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
5
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.core.atoms import TaskSchema, NormalizeResult, CodeStandardizer
6
+ from wisent.core.evaluators.benchmark_specific.coding.output_sanitizer.utils import extract_code_block, normalize_whitespace, maybe_black
7
7
 
8
8
  class PythonStandardizer(CodeStandardizer):
9
9
  language = "python"
@@ -0,0 +1,3 @@
1
+ from .provider import LiveCodeBenchProvider
2
+
3
+ __all__ = ["LiveCodeBenchProvider"]
@@ -0,0 +1,305 @@
1
+ # coding/providers/livecodebench/provider.py
2
+ from __future__ import annotations
3
+ import json
4
+ from typing import Iterable, Optional
5
+ from ..core.atoms import CodingTask, Language
6
+
7
+
8
class LiveCodeBenchProvider:
    """
    LiveCodeBench provider: loads real coding problems from HuggingFace.

    Dataset: livecodebench/code_generation_lite
    Supports Python problems from LeetCode, AtCoder, and CodeForces.
    """
    name = "livecodebench"

    def __init__(
        self,
        language: Language = "python",
        release_version: str = "all",
        limit: Optional[int] = None,
        platform: Optional[str] = None,
    ):
        """
        Initialize LiveCodeBench provider.

        Arguments:
            language: Programming language (currently only "python" supported)
            release_version: Version to load ("release_v1", "release_v2", "all")
            limit: Maximum number of problems to load
            platform: Filter by platform ("leetcode", "codeforces", "atcoder")

        Raises:
            NotImplementedError: If a language other than "python" is requested.
        """
        self.language = language
        self.release_version = release_version
        self.limit = limit
        self.platform = platform

        if language != "python":
            raise NotImplementedError(
                f"LiveCodeBench currently only supports Python. Got: {language}"
            )

    def iter_tasks(self, split: str = "test") -> Iterable[CodingTask]:
        """
        Iterate over LiveCodeBench coding tasks.

        Arguments:
            split: Dataset split (only "test" is available for LiveCodeBench)

        Yields:
            CodingTask objects with solution file, test file, and options
        """
        from datasets import load_dataset

        # Load dataset from HuggingFace
        dataset = load_dataset("livecodebench/code_generation_lite", split=split)

        # Filter by version (date range); "all" keeps everything.
        if self.release_version == "release_v1":
            dataset = dataset.filter(
                lambda x: "2023-05-01" <= x["contest_date"] <= "2023-10-31"
            )
        elif self.release_version == "release_v2":
            dataset = dataset.filter(
                lambda x: "2023-11-01" <= x["contest_date"] <= "2024-04-30"
            )

        # Filter by platform if specified
        if self.platform:
            platform_lower = self.platform.lower()
            dataset = dataset.filter(
                lambda x: x["platform"].lower() == platform_lower
            )

        # Apply limit
        if self.limit:
            dataset = dataset.select(range(min(self.limit, len(dataset))))

        # Convert each problem to a CodingTask; unconvertible problems yield None
        # and are skipped.
        for idx, problem in enumerate(dataset):
            task = self._problem_to_task(problem, idx)
            if task:
                yield task

    def _problem_to_task(self, problem: dict, idx: int) -> Optional[CodingTask]:
        """
        Convert a LiveCodeBench problem to a CodingTask.

        Arguments:
            problem: Problem dictionary from HuggingFace dataset
            idx: Problem index

        Returns:
            CodingTask or None if conversion fails
        """
        try:
            platform = problem["platform"].lower()
            question_id = problem["question_id"]

            # Parse test cases (stored as a JSON-encoded string in the dataset)
            public_tests = json.loads(problem["public_test_cases"])
            if not public_tests:
                return None

            # Determine test type and generate appropriate test file
            test_type = public_tests[0].get("testtype", "stdin")
            if test_type == "functional":
                # LeetCode-style: function calls with arguments
                test_file = self._generate_functional_test(problem, public_tests)
            else:
                # stdin: CodeForces/AtCoder style
                test_file = self._generate_stdin_test(problem, public_tests)

            if not test_file:
                return None

            # Generate solution file template
            solution_file = self._generate_solution_template(problem)

            files = {
                "solution.py": solution_file,
                "tests.py": test_file,
            }
            options = {
                "problem_id": question_id,
                "platform": platform,
                "difficulty": problem.get("difficulty", "unknown"),
            }
            return CodingTask(
                language=self.language,
                files=files,
                options=options,
            )
        except Exception as e:
            # Deliberate best-effort: skip problematic problems rather than
            # aborting the whole iteration.
            import logging
            logging.warning(f"Failed to convert problem {idx}: {e}")
            return None

    def _generate_solution_template(self, problem: dict) -> str:
        """
        Generate a solution template from starter code or problem description.

        Arguments:
            problem: Problem dictionary

        Returns:
            Python solution template as string
        """
        starter_code = problem.get("starter_code", "").strip()

        if starter_code:
            # Use provided starter code
            return starter_code
        # Generate minimal template for stdin problems
        return """# Read input and solve the problem
import sys

def solve():
    # Read input from stdin
    lines = sys.stdin.read().strip().split('\\n')

    # TODO: Implement solution
    pass

if __name__ == "__main__":
    solve()
"""

    def _generate_functional_test(self, problem: dict, test_cases: list) -> str:
        """
        Generate test file for LeetCode-style functional tests.

        Arguments:
            problem: Problem dictionary
            test_cases: List of test case dictionaries

        Returns:
            Python test file content ("" if starter code is unusable)
        """
        starter_code = problem.get("starter_code", "").strip()
        if not starter_code:
            return ""

        # Extract class and method name from starter code
        import re
        class_match = re.search(r"class\s+(\w+)", starter_code)
        method_match = re.search(r"def\s+(\w+)\s*\(", starter_code)
        if not class_match or not method_match:
            return ""

        class_name = class_match.group(1)
        method_name = method_match.group(1)

        # Generate test file
        test_code = f"""from solution import {class_name}

def test_functional():
    solution = {class_name}()

"""

        # Hoisted out of the loop; narrow exceptions so Ctrl-C / SystemExit
        # are not swallowed by the literal-parsing fallbacks below.
        import ast
        for i, test in enumerate(test_cases):
            input_str = test.get("input", "")
            expected_output = test.get("output", "")

            # Parse input (typically a Python literal; first element may be the
            # actual argument list).
            try:
                parsed = ast.literal_eval(input_str)
                if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], list):
                    args = [parsed[0]]
                else:
                    args = [parsed]
            except (ValueError, SyntaxError, TypeError):
                # Fallback: use raw string
                args = [input_str]

            # Parse expected output
            try:
                expected = ast.literal_eval(expected_output)
            except (ValueError, SyntaxError, TypeError):
                expected = expected_output

            # Generate assertion
            args_str = ", ".join(repr(arg) for arg in args)
            test_code += f"    # Test case {i + 1}\n"
            test_code += f"    result = solution.{method_name}({args_str})\n"
            test_code += f"    assert result == {repr(expected)}, f\"Test {i + 1} failed: {{result}} != {repr(expected)}\"\n\n"

        test_code += "if __name__ == '__main__':\n"
        test_code += "    test_functional()\n"
        test_code += "    print('All tests passed!')\n"

        return test_code

    def _generate_stdin_test(self, problem: dict, test_cases: list) -> str:
        """
        Generate test file for stdin-based tests (CodeForces/AtCoder style).

        Arguments:
            problem: Problem dictionary
            test_cases: List of test case dictionaries

        Returns:
            Python test file content
        """
        # For stdin tests, we run the solution and compare output
        test_code = """import subprocess
import sys

def test_stdin():
    test_cases = [
"""

        for i, test in enumerate(test_cases):
            input_data = test.get("input", "")
            expected_output = test.get("output", "")
            test_code += f"        # Test case {i + 1}\n"
            test_code += f"        ({repr(input_data)}, {repr(expected_output)}),\n"

        test_code += """    ]

    for i, (input_data, expected_output) in enumerate(test_cases):
        # Run solution with input
        proc = subprocess.run(
            [sys.executable, "solution.py"],
            input=input_data,
            capture_output=True,
            text=True,
            timeout=5
        )

        actual_output = proc.stdout.strip()
        expected_output = expected_output.strip()

        assert actual_output == expected_output, (
            f"Test case {i + 1} failed:\\n"
            f"  Input: {input_data[:100]}\\n"
            f"  Expected: {expected_output[:200]}\\n"
            f"  Got: {actual_output[:200]}"
        )

    print(f'All {len(test_cases)} test(s) passed!')

if __name__ == '__main__':
    test_stdin()
"""

        return test_code
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
2
  import json, os, subprocess, tempfile
3
3
  from typing import TYPE_CHECKING
4
- from wisent.benchmarks.coding.safe_docker.core.atoms import Result, SandboxExecutor
4
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Result, SandboxExecutor
5
5
 
6
6
  if TYPE_CHECKING:
7
- from wisent.benchmarks.coding.safe_docker.core.atoms import Job
7
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job
8
8
 
9
9
  __all__ = ["DockerSandboxExecutor"]
10
10
 
@@ -31,6 +31,38 @@ class DockerSandboxExecutor(SandboxExecutor):
31
31
def __init__(self, image: str = DEFAULT_IMAGE, runtime: str | None = None):
    """Create an executor bound to *image*, failing fast if Docker is down.

    Arguments:
        image: Docker image used for sandboxed runs.
        runtime: Optional alternative container runtime name.
    """
    # Record configuration first, then verify the daemon is reachable so
    # misconfiguration surfaces at construction time rather than on run().
    self.image, self.runtime = image, runtime
    self._check_docker_available()
35
+
36
def _check_docker_available(self) -> None:
    """
    Check if Docker daemon is running and accessible.

    Runs ``docker info`` with a 5-second timeout and inspects the exit
    status; every failure mode (daemon down, CLI missing, daemon hung)
    is surfaced as a RuntimeError with an actionable message.

    Raises:
        RuntimeError: If Docker is not available or not running.
    """
    try:
        result = subprocess.run(
            ["docker", "info"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        # Non-zero exit: the CLI exists but the daemon refused/failed
        # (not running, or the user lacks permission on the socket).
        if result.returncode != 0:
            raise RuntimeError(
                "Docker daemon is not running. Please start Docker and try again.\n"
                f"Error: {result.stderr}"
            )
    except FileNotFoundError as exc:
        # The docker CLI itself is absent from PATH.
        # Chain the cause (PEP 3134 / bugbear B904) so the original
        # traceback is preserved for debugging.
        raise RuntimeError(
            "Docker command not found. Please install Docker:\n"
            " - macOS: https://docs.docker.com/desktop/install/mac-install/\n"
            " - Linux: https://docs.docker.com/engine/install/\n"
            " - Windows: https://docs.docker.com/desktop/install/windows-install/"
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            "Docker command timed out. Docker daemon may be unresponsive."
        ) from exc
34
66
 
35
67
  def run(self, files: dict[str, str], job: Job) -> Result:
36
68
  """
@@ -49,8 +81,8 @@ class DockerSandboxExecutor(SandboxExecutor):
49
81
  A Result object with the outcome of the execution.
50
82
 
51
83
  example (python add function)
52
- >>> from wisent.benchmarks.coding.safe_docker.core.atoms import Job, Result
53
- >>> from wisent.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
84
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, Result
85
+ >>> from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.runtime import DockerSandboxExecutor
54
86
  >>> job = Job(
55
87
  ... language="python",
56
88
  ... compile_argv=None,
@@ -1,12 +1,10 @@
1
1
  from __future__ import annotations
2
2
  import json, os, shutil, subprocess, sys, time, signal, resource
3
3
 
4
- from wisent.benchmarks.coding.safe_docker.core.atoms import Job
5
-
6
4
  JOB_FILE = "/job/job.json"
7
5
  WORKDIR = "/work"
8
6
 
9
- def set_limits(job: Job):
7
+ def set_limits(job):
10
8
  """
11
9
  Set resource limits for the sandboxed process.
12
10
 
@@ -25,7 +23,7 @@ def set_limits(job: Job):
25
23
  resource.setrlimit(resource.RLIMIT_CORE,(0,0))
26
24
  os.setsid()
27
25
 
28
- def run(argv: list[str], job: Job) -> tuple[int,str,str,float,str]:
26
+ def run(argv: list[str], job) -> tuple[int,str,str,float,str]:
29
27
  """
30
28
  Run a command in a subprocess with resource limits.
31
29
 
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
  from typing import Dict
3
- from wisent.benchmarks.coding.safe_docker.core.atoms import Job, LanguageRecipe
3
+ from wisent.core.evaluators.benchmark_specific.coding.safe_docker.core.atoms import Job, LanguageRecipe
4
4
 
5
5
  class PythonRecipe(LanguageRecipe):
6
6
  """