wisent 0.1.1__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (254) hide show
  1. {wisent-0.1.1 → wisent-0.5.2}/LICENSE +2 -2
  2. wisent-0.5.2/PKG-INFO +67 -0
  3. wisent-0.5.2/README.md +26 -0
  4. wisent-0.5.2/pyproject.toml +85 -0
  5. wisent-0.5.2/setup.py +51 -0
  6. wisent-0.5.2/wisent/__init__.py +1 -0
  7. wisent-0.5.2/wisent/benchmarks/__init__.py +0 -0
  8. wisent-0.5.2/wisent/benchmarks/coding/__init__.py +0 -0
  9. wisent-0.5.2/wisent/benchmarks/coding/metrics/__init__.py +0 -0
  10. wisent-0.5.2/wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  11. wisent-0.5.2/wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  12. wisent-0.5.2/wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  13. wisent-0.5.2/wisent/benchmarks/coding/metrics/passk.py +66 -0
  14. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  15. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  16. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  17. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  18. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  19. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  20. wisent-0.5.2/wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  21. wisent-0.5.2/wisent/benchmarks/coding/providers/__init__.py +18 -0
  22. wisent-0.5.2/wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  23. wisent-0.5.2/wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  24. wisent-0.5.2/wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  25. wisent-0.5.2/wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  26. wisent-0.5.2/wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  27. wisent-0.5.2/wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  28. wisent-0.5.2/wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  29. wisent-0.5.2/wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  30. wisent-0.5.2/wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  31. wisent-0.5.2/wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  32. wisent-0.5.2/wisent/classifiers/__init__.py +0 -0
  33. wisent-0.5.2/wisent/classifiers/core/__init__.py +0 -0
  34. wisent-0.5.2/wisent/classifiers/core/atoms.py +747 -0
  35. wisent-0.5.2/wisent/classifiers/models/__init__.py +0 -0
  36. wisent-0.5.2/wisent/classifiers/models/logistic.py +29 -0
  37. wisent-0.5.2/wisent/classifiers/models/mlp.py +47 -0
  38. wisent-0.5.2/wisent/cli/__init__.py +0 -0
  39. wisent-0.5.2/wisent/cli/classifiers/__init__.py +0 -0
  40. wisent-0.5.2/wisent/cli/classifiers/classifier_rotator.py +137 -0
  41. wisent-0.5.2/wisent/cli/cli_logger.py +142 -0
  42. wisent-0.5.2/wisent/cli/data_loaders/__init__.py +0 -0
  43. wisent-0.5.2/wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  44. wisent-0.5.2/wisent/cli/evaluators/__init__.py +0 -0
  45. wisent-0.5.2/wisent/cli/evaluators/evaluator_rotator.py +148 -0
  46. wisent-0.5.2/wisent/cli/steering_methods/__init__.py +0 -0
  47. wisent-0.5.2/wisent/cli/steering_methods/steering_rotator.py +110 -0
  48. wisent-0.5.2/wisent/cli/wisent_cli/__init__.py +0 -0
  49. wisent-0.5.2/wisent/cli/wisent_cli/commands/__init__.py +0 -0
  50. wisent-0.5.2/wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  51. wisent-0.5.2/wisent/cli/wisent_cli/commands/listing.py +154 -0
  52. wisent-0.5.2/wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  53. wisent-0.5.2/wisent/cli/wisent_cli/main.py +93 -0
  54. wisent-0.5.2/wisent/cli/wisent_cli/shell.py +80 -0
  55. wisent-0.5.2/wisent/cli/wisent_cli/ui.py +69 -0
  56. wisent-0.5.2/wisent/cli/wisent_cli/util/__init__.py +0 -0
  57. wisent-0.5.2/wisent/cli/wisent_cli/util/aggregations.py +43 -0
  58. wisent-0.5.2/wisent/cli/wisent_cli/util/parsing.py +126 -0
  59. wisent-0.5.2/wisent/cli/wisent_cli/version.py +4 -0
  60. wisent-0.5.2/wisent/core/__init__.py +27 -0
  61. wisent-0.5.2/wisent/core/activations/__init__.py +0 -0
  62. wisent-0.5.2/wisent/core/activations/activations_collector.py +338 -0
  63. wisent-0.5.2/wisent/core/activations/core/__init__.py +0 -0
  64. wisent-0.5.2/wisent/core/activations/core/atoms.py +216 -0
  65. wisent-0.5.2/wisent/core/agent/__init__.py +18 -0
  66. wisent-0.5.2/wisent/core/agent/budget.py +638 -0
  67. wisent-0.5.2/wisent/core/agent/device_benchmarks.py +685 -0
  68. wisent-0.5.2/wisent/core/agent/diagnose/__init__.py +55 -0
  69. wisent-0.5.2/wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  70. wisent-0.5.2/wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  71. wisent-0.5.2/wisent/core/agent/diagnose/create_classifier.py +1154 -0
  72. wisent-0.5.2/wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  73. wisent-0.5.2/wisent/core/agent/diagnose/select_classifiers.py +506 -0
  74. wisent-0.5.2/wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  75. wisent-0.5.2/wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  76. wisent-0.5.2/wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  77. wisent-0.5.2/wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  78. wisent-0.5.2/wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  79. wisent-0.5.2/wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  80. wisent-0.5.2/wisent/core/agent/diagnose.py +242 -0
  81. wisent-0.5.2/wisent/core/agent/steer.py +212 -0
  82. wisent-0.5.2/wisent/core/agent/timeout.py +134 -0
  83. wisent-0.5.2/wisent/core/autonomous_agent.py +1234 -0
  84. wisent-0.5.2/wisent/core/bigcode_integration.py +583 -0
  85. wisent-0.5.2/wisent/core/contrastive_pairs/__init__.py +15 -0
  86. wisent-0.5.2/wisent/core/contrastive_pairs/core/__init__.py +0 -0
  87. wisent-0.5.2/wisent/core/contrastive_pairs/core/atoms.py +45 -0
  88. wisent-0.5.2/wisent/core/contrastive_pairs/core/buliders.py +59 -0
  89. wisent-0.5.2/wisent/core/contrastive_pairs/core/pair.py +178 -0
  90. wisent-0.5.2/wisent/core/contrastive_pairs/core/response.py +152 -0
  91. wisent-0.5.2/wisent/core/contrastive_pairs/core/serialization.py +300 -0
  92. wisent-0.5.2/wisent/core/contrastive_pairs/core/set.py +133 -0
  93. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  94. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  95. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  96. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  97. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  98. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  99. wisent-0.5.2/wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  100. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  101. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  102. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  103. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  104. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  105. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  106. wisent-0.5.2/wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  107. wisent-0.5.2/wisent/core/data_loaders/__init__.py +0 -0
  108. wisent-0.5.2/wisent/core/data_loaders/core/__init__.py +0 -0
  109. wisent-0.5.2/wisent/core/data_loaders/core/atoms.py +98 -0
  110. wisent-0.5.2/wisent/core/data_loaders/loaders/__init__.py +0 -0
  111. wisent-0.5.2/wisent/core/data_loaders/loaders/custom.py +120 -0
  112. wisent-0.5.2/wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  113. wisent-0.5.2/wisent/core/detection_handling.py +257 -0
  114. wisent-0.5.2/wisent/core/download_full_benchmarks.py +1386 -0
  115. wisent-0.5.2/wisent/core/evaluators/__init__.py +0 -0
  116. wisent-0.5.2/wisent/core/evaluators/oracles/__init__.py +0 -0
  117. wisent-0.5.2/wisent/core/evaluators/oracles/interactive.py +73 -0
  118. wisent-0.5.2/wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  119. wisent-0.5.2/wisent/core/evaluators/oracles/user_specified.py +67 -0
  120. wisent-0.5.2/wisent/core/hyperparameter_optimizer.py +429 -0
  121. wisent-0.5.2/wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  122. wisent-0.5.2/wisent/core/log_likelihoods_evaluator.py +321 -0
  123. wisent-0.5.2/wisent/core/managed_cached_benchmarks.py +595 -0
  124. wisent-0.5.2/wisent/core/mixed_benchmark_sampler.py +364 -0
  125. wisent-0.5.2/wisent/core/model_config_manager.py +330 -0
  126. wisent-0.5.2/wisent/core/model_persistence.py +317 -0
  127. wisent-0.5.2/wisent/core/models/__init__.py +0 -0
  128. wisent-0.5.2/wisent/core/models/core/__init__.py +0 -0
  129. wisent-0.5.2/wisent/core/models/core/atoms.py +460 -0
  130. wisent-0.5.2/wisent/core/models/wisent_model.py +727 -0
  131. wisent-0.5.2/wisent/core/multi_steering.py +316 -0
  132. wisent-0.5.2/wisent/core/optuna/__init__.py +57 -0
  133. wisent-0.5.2/wisent/core/optuna/classifier/__init__.py +25 -0
  134. wisent-0.5.2/wisent/core/optuna/classifier/activation_generator.py +349 -0
  135. wisent-0.5.2/wisent/core/optuna/classifier/classifier_cache.py +509 -0
  136. wisent-0.5.2/wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  137. wisent-0.5.2/wisent/core/optuna/steering/__init__.py +0 -0
  138. wisent-0.5.2/wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  139. wisent-0.5.2/wisent/core/optuna/steering/data_utils.py +342 -0
  140. wisent-0.5.2/wisent/core/optuna/steering/metrics.py +474 -0
  141. wisent-0.5.2/wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  142. wisent-0.5.2/wisent/core/optuna/steering/steering_optimization.py +1111 -0
  143. wisent-0.5.2/wisent/core/parser.py +1668 -0
  144. wisent-0.5.2/wisent/core/prompts/__init__.py +0 -0
  145. wisent-0.5.2/wisent/core/prompts/core/__init__.py +0 -0
  146. wisent-0.5.2/wisent/core/prompts/core/atom.py +57 -0
  147. wisent-0.5.2/wisent/core/prompts/core/prompt_formater.py +157 -0
  148. wisent-0.5.2/wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  149. wisent-0.5.2/wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  150. wisent-0.5.2/wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  151. wisent-0.5.2/wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  152. wisent-0.5.2/wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  153. wisent-0.5.2/wisent/core/representation.py +5 -0
  154. wisent-0.5.2/wisent/core/sample_size_optimizer.py +648 -0
  155. wisent-0.5.2/wisent/core/sample_size_optimizer_v2.py +355 -0
  156. wisent-0.5.2/wisent/core/save_results.py +277 -0
  157. wisent-0.5.2/wisent/core/steering.py +652 -0
  158. wisent-0.5.2/wisent/core/steering_method.py +26 -0
  159. wisent-0.5.2/wisent/core/steering_methods/__init__.py +0 -0
  160. wisent-0.5.2/wisent/core/steering_methods/core/__init__.py +0 -0
  161. wisent-0.5.2/wisent/core/steering_methods/core/atoms.py +153 -0
  162. wisent-0.5.2/wisent/core/steering_methods/methods/__init__.py +0 -0
  163. wisent-0.5.2/wisent/core/steering_methods/methods/caa.py +44 -0
  164. wisent-0.5.2/wisent/core/steering_optimizer.py +1297 -0
  165. wisent-0.5.2/wisent/core/task_interface.py +132 -0
  166. wisent-0.5.2/wisent/core/task_selector.py +189 -0
  167. wisent-0.5.2/wisent/core/tasks/__init__.py +175 -0
  168. wisent-0.5.2/wisent/core/tasks/aime_task.py +141 -0
  169. wisent-0.5.2/wisent/core/tasks/file_task.py +211 -0
  170. wisent-0.5.2/wisent/core/tasks/hle_task.py +180 -0
  171. wisent-0.5.2/wisent/core/tasks/hmmt_task.py +119 -0
  172. wisent-0.5.2/wisent/core/tasks/livecodebench_task.py +201 -0
  173. wisent-0.5.2/wisent/core/tasks/livemathbench_task.py +158 -0
  174. wisent-0.5.2/wisent/core/tasks/lm_eval_task.py +455 -0
  175. wisent-0.5.2/wisent/core/tasks/math500_task.py +84 -0
  176. wisent-0.5.2/wisent/core/tasks/polymath_task.py +146 -0
  177. wisent-0.5.2/wisent/core/tasks/supergpqa_task.py +220 -0
  178. wisent-0.5.2/wisent/core/time_estimator.py +149 -0
  179. wisent-0.5.2/wisent/core/timing_calibration.py +174 -0
  180. wisent-0.5.2/wisent/core/tracking/__init__.py +54 -0
  181. wisent-0.5.2/wisent/core/tracking/latency.py +618 -0
  182. wisent-0.5.2/wisent/core/tracking/memory.py +359 -0
  183. wisent-0.5.2/wisent/core/trainers/__init__.py +0 -0
  184. wisent-0.5.2/wisent/core/trainers/core/__init__.py +11 -0
  185. wisent-0.5.2/wisent/core/trainers/core/atoms.py +45 -0
  186. wisent-0.5.2/wisent/core/trainers/steering_trainer.py +271 -0
  187. wisent-0.5.2/wisent/core/user_model_config.py +158 -0
  188. wisent-0.5.2/wisent/opti/__init__.py +0 -0
  189. wisent-0.5.2/wisent/opti/core/__init__.py +0 -0
  190. wisent-0.5.2/wisent/opti/core/atoms.py +175 -0
  191. wisent-0.5.2/wisent/opti/methods/__init__.py +0 -0
  192. wisent-0.5.2/wisent/opti/methods/opti_classificator.py +172 -0
  193. wisent-0.5.2/wisent/opti/methods/opti_steering.py +138 -0
  194. wisent-0.5.2/wisent/synthetic/__init__.py +0 -0
  195. wisent-0.5.2/wisent/synthetic/cleaners/__init__.py +0 -0
  196. wisent-0.5.2/wisent/synthetic/cleaners/core/__init__.py +0 -0
  197. wisent-0.5.2/wisent/synthetic/cleaners/core/atoms.py +58 -0
  198. wisent-0.5.2/wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  199. wisent-0.5.2/wisent/synthetic/cleaners/methods/__init__.py +0 -0
  200. wisent-0.5.2/wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  201. wisent-0.5.2/wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  202. wisent-0.5.2/wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  203. wisent-0.5.2/wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  204. wisent-0.5.2/wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  205. wisent-0.5.2/wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  206. wisent-0.5.2/wisent/synthetic/db_instructions/__init__.py +0 -0
  207. wisent-0.5.2/wisent/synthetic/db_instructions/core/__init__.py +0 -0
  208. wisent-0.5.2/wisent/synthetic/db_instructions/core/atoms.py +25 -0
  209. wisent-0.5.2/wisent/synthetic/db_instructions/mini_dp.py +37 -0
  210. wisent-0.5.2/wisent/synthetic/generators/__init__.py +0 -0
  211. wisent-0.5.2/wisent/synthetic/generators/core/__init__.py +0 -0
  212. wisent-0.5.2/wisent/synthetic/generators/core/atoms.py +73 -0
  213. wisent-0.5.2/wisent/synthetic/generators/diversities/__init__.py +0 -0
  214. wisent-0.5.2/wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  215. wisent-0.5.2/wisent/synthetic/generators/diversities/core/core.py +68 -0
  216. wisent-0.5.2/wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  217. wisent-0.5.2/wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  218. wisent-0.5.2/wisent/synthetic/generators/pairs_generator.py +179 -0
  219. wisent-0.5.2/wisent.egg-info/PKG-INFO +67 -0
  220. wisent-0.5.2/wisent.egg-info/SOURCES.txt +222 -0
  221. wisent-0.5.2/wisent.egg-info/requires.txt +12 -0
  222. wisent-0.1.1/PKG-INFO +0 -142
  223. wisent-0.1.1/README.md +0 -112
  224. wisent-0.1.1/pyproject.toml +0 -22
  225. wisent-0.1.1/setup.py +0 -31
  226. wisent-0.1.1/tests/test_activations.py +0 -350
  227. wisent-0.1.1/tests/test_client.py +0 -49
  228. wisent-0.1.1/tests/test_control_vector.py +0 -427
  229. wisent-0.1.1/tests/test_examples.py +0 -343
  230. wisent-0.1.1/tests/test_inference.py +0 -511
  231. wisent-0.1.1/wisent/__init__.py +0 -8
  232. wisent-0.1.1/wisent/activations/__init__.py +0 -9
  233. wisent-0.1.1/wisent/activations/client.py +0 -97
  234. wisent-0.1.1/wisent/activations/extractor.py +0 -251
  235. wisent-0.1.1/wisent/activations/models.py +0 -95
  236. wisent-0.1.1/wisent/client.py +0 -45
  237. wisent-0.1.1/wisent/control_vector/__init__.py +0 -9
  238. wisent-0.1.1/wisent/control_vector/client.py +0 -85
  239. wisent-0.1.1/wisent/control_vector/manager.py +0 -168
  240. wisent-0.1.1/wisent/control_vector/models.py +0 -70
  241. wisent-0.1.1/wisent/inference/__init__.py +0 -9
  242. wisent-0.1.1/wisent/inference/client.py +0 -103
  243. wisent-0.1.1/wisent/inference/inferencer.py +0 -250
  244. wisent-0.1.1/wisent/inference/models.py +0 -66
  245. wisent-0.1.1/wisent/utils/__init__.py +0 -3
  246. wisent-0.1.1/wisent/utils/auth.py +0 -30
  247. wisent-0.1.1/wisent/utils/http.py +0 -228
  248. wisent-0.1.1/wisent/version.py +0 -3
  249. wisent-0.1.1/wisent.egg-info/PKG-INFO +0 -142
  250. wisent-0.1.1/wisent.egg-info/SOURCES.txt +0 -32
  251. wisent-0.1.1/wisent.egg-info/requires.txt +0 -7
  252. {wisent-0.1.1 → wisent-0.5.2}/setup.cfg +0 -0
  253. {wisent-0.1.1 → wisent-0.5.2}/wisent.egg-info/dependency_links.txt +0 -0
  254. {wisent-0.1.1 → wisent-0.5.2}/wisent.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2024 Wisent Team
3
+ Copyright (c) 2023-2024 Wisent Team
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
18
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
19
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
20
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
21
+ SOFTWARE.
wisent-0.5.2/PKG-INFO ADDED
@@ -0,0 +1,67 @@
1
+ Metadata-Version: 2.4
2
+ Name: wisent
3
+ Version: 0.5.2
4
+ Summary: Monitor and guard against harmful content in language models
5
+ Home-page: https://github.com/yourusername/wisent-activation-guardrails
6
+ Author: Wisent Team
7
+ Author-email: your.email@example.com
8
+ Keywords: nlp,machine learning,language models,safety,guardrails,lm-evaluation-harness
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: torch>=1.9.0
19
+ Requires-Dist: transformers>=4.20.0
20
+ Requires-Dist: tqdm>=4.50.0
21
+ Requires-Dist: scikit-learn>=0.24.0
22
+ Requires-Dist: pandas>=1.2.0
23
+ Requires-Dist: numpy>=1.21.0
24
+ Requires-Dist: datasets>=2.0.0
25
+ Requires-Dist: sentence-transformers>=2.0.0
26
+ Requires-Dist: faiss-cpu>=1.7.0
27
+ Provides-Extra: harness
28
+ Requires-Dist: lm-eval==0.4.8; extra == "harness"
29
+ Dynamic: author
30
+ Dynamic: author-email
31
+ Dynamic: classifier
32
+ Dynamic: description
33
+ Dynamic: description-content-type
34
+ Dynamic: home-page
35
+ Dynamic: keywords
36
+ Dynamic: license-file
37
+ Dynamic: provides-extra
38
+ Dynamic: requires-dist
39
+ Dynamic: requires-python
40
+ Dynamic: summary
41
+
42
+ # Wisent-Guard
43
+
44
+ <p align="center">
45
+ <a href="https://github.com/wisent-ai/wisent-guard/stargazers">
46
+ <img src="https://img.shields.io/github/stars/wisent-ai/wisent-guard" alt="stars" />
47
+ </a>
48
+ <a href="https://pypi.org/project/wisent-guard">
49
+ <img src="https://static.pepy.tech/badge/wisent-guard" alt="PyPI - Downloads" />
50
+ </a>
51
+ <br />
52
+ </p>
53
+
54
+ <p align="center">
55
+ <img src="wisent-guard-logo.png" alt="Wisent Guard" width="200">
56
+ </p>
57
+
58
+ A Python package for latent space monitoring and guardrails. Delivered to you by the [Wisent](https://wisent.ai) team led by [Lukasz Bartoszcze](https://lukaszbartoszcze.com).
59
+
60
+ ## Overview
61
+
62
+ Wisent-Guard allows you to control your AI by identifying brain patterns corresponding to responses you don't like, like hallucinations or harmful outputs. We use contrastive pairs of representations to detect when a model might be generating harmful content or hallucinating. Learn more at https://www.wisent.ai/wisent-guard.
63
+
64
+
65
+ ## License
66
+
67
+ This project is licensed under the MIT License - see the LICENSE file for details.
wisent-0.5.2/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # Wisent-Guard
2
+
3
+ <p align="center">
4
+ <a href="https://github.com/wisent-ai/wisent-guard/stargazers">
5
+ <img src="https://img.shields.io/github/stars/wisent-ai/wisent-guard" alt="stars" />
6
+ </a>
7
+ <a href="https://pypi.org/project/wisent-guard">
8
+ <img src="https://static.pepy.tech/badge/wisent-guard" alt="PyPI - Downloads" />
9
+ </a>
10
+ <br />
11
+ </p>
12
+
13
+ <p align="center">
14
+ <img src="wisent-guard-logo.png" alt="Wisent Guard" width="200">
15
+ </p>
16
+
17
+ A Python package for latent space monitoring and guardrails. Delivered to you by the [Wisent](https://wisent.ai) team led by [Lukasz Bartoszcze](https://lukaszbartoszcze.com).
18
+
19
+ ## Overview
20
+
21
+ Wisent-Guard allows you to control your AI by identifying brain patterns corresponding to responses you don't like, like hallucinations or harmful outputs. We use contrastive pairs of representations to detect when a model might be generating harmful content or hallucinating. Learn more at https://www.wisent.ai/wisent-guard.
22
+
23
+
24
+ ## License
25
+
26
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,85 @@
1
+ [tool.pytest.ini_options]
2
+ testpaths = ["tests"]
3
+ python_files = ["test_*.py"]
4
+ python_classes = ["Test*"]
5
+ python_functions = ["test_*"]
6
+ addopts = ["-v", "--tb=short", "--ignore=evaluation", "-m", "not slow and not heavy and not docker and not bigcode_required"]
7
+ markers = [
8
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
9
+ "heavy: marks tests as very resource-intensive with large model downloads (deselect with '-m \"not heavy\"')",
10
+ "model_download: marks tests that download models from HuggingFace (deselect with '-m \"not model_download\"')",
11
+ "integration: marks tests as integration tests",
12
+ "unit: marks tests as unit tests",
13
+ "docker: marks tests for Docker-based execution",
14
+ "performance: marks tests for performance evaluation",
15
+ "cli: marks tests for CLI integration",
16
+ "sandbox_required: marks tests that require sandbox environment (includes coding tasks and trust_remote_code)",
17
+ "bigcode_required: marks tests that require bigcode-evaluation-harness installation",
18
+ ]
19
+ filterwarnings = [
20
+ "ignore::DeprecationWarning",
21
+ "ignore::PendingDeprecationWarning",
22
+ ]
23
+
24
+ [tool.ruff]
25
+ line-length = 120
26
+ target-version = "py39"
27
+ exclude = [
28
+ ".bzr",
29
+ ".direnv",
30
+ ".eggs",
31
+ ".git",
32
+ ".git-rewrite",
33
+ ".hg",
34
+ ".mypy_cache",
35
+ ".nox",
36
+ ".pants.d",
37
+ ".pytype",
38
+ ".ruff_cache",
39
+ ".svn",
40
+ ".tox",
41
+ ".venv",
42
+ "__pypackages__",
43
+ "_build",
44
+ "buck-out",
45
+ "build",
46
+ "dist",
47
+ "node_modules",
48
+ "venv",
49
+ "evaluation",
50
+ "bigcode-evaluation-harness",
51
+ ]
52
+
53
+ [tool.ruff.lint]
54
+ # Enable common rule sets
55
+ select = ["E4", "E7", "E9", "F", "I", "N", "UP", "B", "A", "C4", "ISC", "ICN", "PIE", "T20", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TCH", "ARG", "ERA", "PGH", "RUF"]
56
+ ignore = [
57
+ "PLR0913", # Too many arguments in function definition
58
+ "PLR0912", # Too many branches
59
+ "PLR0915", # Too many statements
60
+ "C901", # Function is too complex
61
+ "PLR2004", # Magic value used in comparison
62
+ "TRY003", # Avoid specifying long messages outside exception class
63
+ "PLW2901", # Redefined loop variable
64
+ "B008", # Do not perform function calls in argument defaults
65
+ "ARG002", # Unused method argument
66
+ "PIE800", # Unnecessary spread operator,
67
+ "T201" # print
68
+ ]
69
+ fixable = ["ALL"]
70
+ unfixable = []
71
+
72
+ [tool.ruff.format]
73
+ # Like Black, use double quotes for strings.
74
+ quote-style = "double"
75
+ # Like Black, indent with spaces, rather than tabs.
76
+ indent-style = "space"
77
+ # Like Black, respect magic trailing commas.
78
+ skip-magic-trailing-comma = false
79
+ # Like Black, automatically detect the appropriate line ending.
80
+ line-ending = "auto"
81
+
82
+ [tool.ruff.lint.isort]
83
+ known-first-party = ["wisent"]
84
+ force-single-line = false
85
+ combine-as-imports = true
wisent-0.5.2/setup.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+ from setuptools import setup, find_packages
3
+
4
+ # Read the contents of the README file
5
+ with open("README.md", encoding="utf-8") as f:
6
+ long_description = f.read()
7
+
8
+ # Read version from __init__.py
9
+ with open(os.path.join("wisent", "__init__.py"), encoding="utf-8") as f:
10
+ for line in f:
11
+ if line.startswith("__version__"):
12
+ version = line.split("=")[1].strip().strip('"').strip("'")
13
+ break
14
+
15
+ setup(
16
+ name="wisent",
17
+ version=version,
18
+ author="Wisent Team",
19
+ author_email="your.email@example.com", # Replace with your email
20
+ description="Monitor and guard against harmful content in language models",
21
+ long_description=long_description,
22
+ long_description_content_type="text/markdown",
23
+ url="https://github.com/yourusername/wisent-activation-guardrails", # Replace with your GitHub repo
24
+ packages=find_packages(exclude=["patches", "patches.*"]), # Exclude patches directory
25
+ classifiers=[
26
+ "Programming Language :: Python :: 3",
27
+ "License :: OSI Approved :: MIT License",
28
+ "Operating System :: OS Independent",
29
+ "Development Status :: 4 - Beta",
30
+ "Intended Audience :: Science/Research",
31
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
32
+ ],
33
+ python_requires=">=3.8",
34
+ install_requires=[
35
+ "torch>=1.9.0",
36
+ "transformers>=4.20.0",
37
+ "tqdm>=4.50.0",
38
+ "scikit-learn>=0.24.0",
39
+ "pandas>=1.2.0",
40
+ "numpy>=1.21.0",
41
+ "datasets>=2.0.0",
42
+ "sentence-transformers>=2.0.0",
43
+ "faiss-cpu>=1.7.0",
44
+ ],
45
+ extras_require={
46
+ "harness": [
47
+ "lm-eval==0.4.8",
48
+ ],
49
+ },
50
+ keywords="nlp, machine learning, language models, safety, guardrails, lm-evaluation-harness",
51
+ )
@@ -0,0 +1 @@
1
+ __version__ = "0.5.2"
File without changes
File without changes
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Iterable, Protocol
4
+
5
+ @dataclass(frozen=True)
6
+ class SampleOutcome:
7
+ """
8
+ Result of executing a single sample (possibly after self-repair).
9
+
10
+ attributes:
11
+ task_id:
12
+ The unique identifier for the task.
13
+ status:
14
+ One of "ok", "compile_error", "runtime_error", or "timeout".
15
+ passed:
16
+ True if the code passed all tests, False otherwise.
17
+ elapsed:
18
+ Time taken to execute the code in seconds.
19
+ """
20
+ task_id: str
21
+ status: str
22
+ passed: bool
23
+ elapsed: float
24
+
25
+ class Metric(Protocol):
26
+ """
27
+ Metric computes a score from an iterable of SampleOutcome.
28
+ """
29
+ def compute(self, outcomes: Iterable[SampleOutcome]) -> float: ...
30
+
31
+
32
+ class Evaluator(Protocol):
33
+ """
34
+ Runs tasks end-to-end (codegen + optional self-repair) and yields SampleOutcome.
35
+ """
36
+ def evaluate(self) -> Iterable[SampleOutcome]: ...
@@ -0,0 +1,275 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Iterable, Optional, TYPE_CHECKING
4
+
5
+ from wisent.benchmarks.coding.safe_docker.core.runtime import DockerSandboxExecutor
6
+ from wisent.benchmarks.coding.safe_docker.recipes import RECIPE_REGISTRY
7
+ from wisent.benchmarks.coding.metrics.core.atoms import SampleOutcome, Evaluator
8
+
9
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import TaskSchema
10
+ from wisent.benchmarks.coding.output_sanitizer.python_sanitizer import PythonStandardizer
11
+ from wisent.benchmarks.coding.output_sanitizer.cpp_sanitizer import CppStandardizer
12
+ from wisent.benchmarks.coding.output_sanitizer.java_sanitizer import JavaStandardizer
13
+
14
+ if TYPE_CHECKING:
15
+ from wisent.benchmarks.coding.safe_docker.core.atoms import Result
16
+ from wisent.benchmarks.coding.providers.core.atoms import Provider, CodingTask
17
+ from wisent.benchmarks.coding.output_sanitizer.core.atoms import CodeStandardizer
18
+
19
# Repair callback: (language, previous_files, feedback) -> repaired files.
RepairFn = Callable[[str, dict[str,str], str], dict[str,str]]
20
+
21
@dataclass
class EvaluatorConfig:
    """
    Tunable settings for CodingEvaluator.

    attributes:
        image:
            Docker image used for sandboxed code execution.
        runtime:
            Optional Docker runtime name (e.g. "runsc" for gVisor).
        feedback_max_chars:
            Cap on the feedback text handed to the repair function.
        self_repair:
            When True, a failed sample gets exactly one repair turn driven
            by the execution feedback.
        time_limit_s:
            Wall-clock limit per code execution, in seconds.
        cpu_limit_s:
            CPU-time limit per code execution, in seconds.
        mem_limit_mb:
            Memory ceiling per code execution, in megabytes.
        pre_sanitize:
            When True, LLM output is run through a language sanitizer
            before execution.
    """
    image: str = "coding/sandbox:polyglot-1.0"
    runtime: Optional[str] = None
    feedback_max_chars: int = 2000
    self_repair: bool = True
    time_limit_s: int = 8
    cpu_limit_s: int = 3
    mem_limit_mb: int = 768
    pre_sanitize: bool = True
52
+
53
# Task language -> shared sanitizer instance.  Languages missing from this
# mapping are executed without pre-sanitization (see _maybe_sanitize).
_SANITIZERS = {
    "python": PythonStandardizer(),
    "cpp": CppStandardizer(),
    "java": JavaStandardizer(),
}
58
+
59
+ def _default_filename(lang: str) -> str:
60
+ """
61
+ Returns the default source file name for a given programming language.
62
+
63
+ arguments:
64
+ lang:
65
+ Programming language ("python", "cpp", or "java").
66
+
67
+ returns:
68
+ Default filename as a string.
69
+ """
70
+ return {"python":"solution.py","cpp":"solution.cpp","java":"Solution.java"}[lang]
71
+
72
def _make_schema(task: CodingTask) -> TaskSchema:
    """
    Build a TaskSchema for *task*, falling back to sensible defaults.

    arguments:
        task:
            CodingTask whose ``options`` may override "entry_point"
            (default "solve"), "file_name" (default per language), and
            "java_class" (default "Solution").

    returns:
        TaskSchema for the task's language with ``prefer_rename`` and
        ``allow_wrapper`` both enabled.
    """
    opts = task.options
    return TaskSchema(
        language=task.language,
        file_name=str(opts.get("file_name", _default_filename(task.language))),
        entry_point=str(opts.get("entry_point", "solve")),
        java_class=str(opts.get("java_class", "Solution")),
        prefer_rename=True,
        allow_wrapper=True,
    )
106
+
107
class CodingEvaluator(Evaluator):
    """
    Evaluator for coding tasks with an optional single self-repair turn.

    Pipeline per task: generate files via ``model_fn``, merge with the
    task's own files, optionally sanitize, and execute in the Docker
    sandbox.  On failure — when self-repair is enabled and a repair
    function is available — feedback is built from the failed run, the
    repaired files are executed once more, and that second run decides
    the outcome.
    """

    def __init__(self, provider: Provider, model_fn: Callable[[CodingTask], dict[str,str]],
                 repair_fn: Optional[RepairFn] = None, cfg: Optional[EvaluatorConfig] = None):
        """
        arguments:
            provider:
                Source of tasks; must expose ``name`` and ``iter_tasks()``.
            model_fn:
                Maps a CodingTask to generated files ({filename: source}).
            repair_fn:
                Optional (language, prev_files, feedback) -> repaired files.
            cfg:
                Evaluator configuration; a fresh EvaluatorConfig() is
                created when omitted.
        """
        self.provider = provider
        self.model_fn = model_fn
        self.repair_fn = repair_fn
        # A literal `cfg: EvaluatorConfig = EvaluatorConfig()` default would be
        # evaluated once at def time and shared by every instance (mutable
        # default argument); use None as the sentinel instead.
        self.cfg = cfg if cfg is not None else EvaluatorConfig()
        self.exec = DockerSandboxExecutor(image=self.cfg.image, runtime=self.cfg.runtime)

    def _feedback(self, res: "Result") -> str:
        """
        Summarize a failed Result as feedback text for the repair turn.

        arguments:
            res:
                Result carrying status, stdout, stderr, and elapsed time.

        returns:
            Feedback string truncated to ``cfg.feedback_max_chars``.
            Timeouts report the elapsed time only; compile errors are
            prefixed with "Compilation failed:"; every other failure
            (runtime errors, failing tests) with "Tests failed:".

        examples:
            >>> res = Result(status="timeout", stdout="", stderr="", elapsed=10.0)
            >>> evaluator._feedback(res)
            'Timeout after 10.00s.'
        """
        if res.status == "timeout":
            return f"Timeout after {res.elapsed:.2f}s."
        body = (res.stdout or "") + ("\n" + res.stderr if res.stderr else "")
        if res.status == "compile_error":
            prefix = "Compilation failed:\n"
        else:
            prefix = "Tests failed:\n"
        return (prefix + body)[: self.cfg.feedback_max_chars]

    def _run_once(self, task: CodingTask, files: dict[str,str]) -> Result:
        """
        Execute one sandbox job for *task* with the given files.

        arguments:
            task:
                The coding task; its language selects the execution recipe
                and its options are forwarded to the recipe.
            files:
                Mapping of filename -> source to run in the sandbox.

        returns:
            Result with status, stdout, stderr, and elapsed time.
        """
        recipe = RECIPE_REGISTRY[task.language]
        job = recipe.make_job(files, **task.options,
                              time_limit_s=self.cfg.time_limit_s,
                              cpu_limit_s=self.cfg.cpu_limit_s,
                              mem_limit_mb=self.cfg.mem_limit_mb)
        return self.exec.run(files, job)

    def _maybe_sanitize(self, task: CodingTask, files: dict[str,str]) -> dict[str,str]:
        """
        Normalize the generated solution file when pre-sanitization is on.

        arguments:
            task:
                Coding task providing the language and schema options.
            files:
                Generated files; the raw source is taken from the schema's
                file_name entry, or from a "__raw__" fallback key.

        returns:
            Files with the solution entry replaced by its sanitized form;
            the input is returned unchanged when sanitization is disabled,
            no sanitizer exists for the language, or no raw source is found.
        """
        if not self.cfg.pre_sanitize:
            return files
        schema = _make_schema(task)
        # .get() may return None for languages without a registered sanitizer.
        sanitizer: Optional["CodeStandardizer"] = _SANITIZERS.get(task.language)
        if sanitizer is None:
            return files

        raw = files.get(schema.file_name) or files.get("__raw__")
        if not raw:
            return files

        out = sanitizer.normalize(raw, schema)
        # Fall back to the raw source if the sanitizer did not emit the file.
        return {**files, schema.file_name: out.files.get(schema.file_name, raw)}

    def evaluate(self) -> Iterable[SampleOutcome]:
        """
        Run every task from the provider, yielding one SampleOutcome each.

        A task passes when its first run succeeds, or — with self-repair
        enabled and a repair_fn supplied — when its single repair run
        succeeds.  When a repair turn happens, the reported status is the
        repair run's status and the elapsed time is the sum of both runs.

        yields:
            SampleOutcome with task_id "<provider.name>:<index>".
        """
        for idx, task in enumerate(self.provider.iter_tasks()):
            task_id = f"{self.provider.name}:{idx}"

            files0 = self.model_fn(task)
            files0 = {**task.files, **files0}
            files0 = self._maybe_sanitize(task, files0)

            r0 = self._run_once(task, files0)
            if r0.status == "ok":
                yield SampleOutcome(task_id=task_id, status=r0.status, passed=True, elapsed=r0.elapsed)
                continue

            if not self.cfg.self_repair or self.repair_fn is None:
                yield SampleOutcome(task_id=task_id, status=r0.status, passed=False, elapsed=r0.elapsed)
                continue

            fb = self._feedback(r0)
            files1 = self.repair_fn(task.language, files0, fb)
            files1 = {**task.files, **files1}
            files1 = self._maybe_sanitize(task, files1)

            r1 = self._run_once(task, files1)
            # The first run already failed here, so only the repair run
            # decides the outcome.
            passed = r1.status == "ok"
            yield SampleOutcome(task_id=task_id, status=r1.status, passed=passed,
                                elapsed=(r0.elapsed + r1.elapsed))
@@ -0,0 +1,66 @@
1
+ # coding/metrics/passk.py
2
+ from __future__ import annotations
3
+ from typing import Iterable
4
+ import math
5
+ from collections import defaultdict
6
+ from .core.atoms import SampleOutcome, Metric
7
+
8
class PassAtK(Metric):
    """
    Exact Pass@k for code generation.

    For each task with n samples of which c pass, the probability that at
    least one of k uniformly drawn samples passes is
    1 - C(n - c, k) / C(n, k); the metric is the mean of that value over
    all tasks.
    """

    def __init__(self, k: int = 1):
        """
        arguments:
            k:
                Number of samples drawn per task; must be >= 1.

        raises:
            ValueError: if k < 1.
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        self.k = k

    def compute(self, outcomes: Iterable[SampleOutcome]) -> float:
        """
        Average Pass@k across the task_ids present in *outcomes*.

        arguments:
            outcomes:
                Iterable of SampleOutcome objects; only ``task_id`` and
                ``passed`` are consulted.

        returns:
            Mean Pass@k across tasks, or 0.0 when there are no outcomes.

        intuition:
            Per task, Pass@k is 1 minus the chance that all k picks land
            on the n - c failing samples.
        """
        # task_id -> [n (total samples), c (passing samples)]
        per_task = defaultdict(lambda: [0, 0])
        for outcome in outcomes:
            counts = per_task[outcome.task_id]
            counts[0] += 1
            if outcome.passed:
                counts[1] += 1

        if not per_task:
            return 0.0

        total = 0.0
        for n, c in per_task.values():
            if c == 0:
                continue  # no passing sample -> this task contributes 0.0
            k = min(self.k, n)  # cannot draw more samples than exist
            # math.comb(n - c, k) is 0 when k > n - c, yielding score 1.0;
            # with c > 0 and k <= n, math.comb(n, k) is always positive.
            total += 1.0 - math.comb(n - c, k) / math.comb(n, k)
        return total / len(per_task)
@@ -0,0 +1,27 @@
1
+ # coding/llm_sanitizer/core/atoms.py
2
+ from __future__ import annotations
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Protocol, Literal, Optional
5
+
6
# Closed set of languages the sanitizer layer supports.
Language = Literal["python", "cpp", "java"]
7
+
8
@dataclass(frozen=True)
class TaskSchema:
    """
    Contract describing what the sandbox expects for one task.

    attributes:
        language:
            Target language of the task.
        file_name:
            Expected source file name, e.g. "solution.py",
            "solution.cpp", or "Solution.java".
        entry_point:
            Function/method name the tests will call (e.g. "add", "solve").
        java_class:
            Expected public class name; only meaningful for Java.
        allow_wrapper:
            Hint: a thin wrapper may be synthesized instead of renaming.
        prefer_rename:
            Hint: if True and safe, rename a single top-level function to
            entry_point.
    """
    language: Language
    file_name: str
    entry_point: str
    java_class: str = "Solution"
    allow_wrapper: bool = True
    prefer_rename: bool = False
18
+
19
@dataclass(frozen=True)
class NormalizeResult:
    """
    Outcome of one standardization pass.

    attributes:
        files:
            Mapping of filename -> normalized source.
        notes:
            Human-readable log of what was done.
        ok:
            True when the result is believed valid / parseable.
    """
    files: Dict[str, str]
    notes: str
    ok: bool
24
+
25
class CodeStandardizer(Protocol):
    """
    Structural interface for per-language code standardizers.

    Implementations expose the language they handle and turn raw model
    output into a NormalizeResult satisfying a TaskSchema.
    """
    language: Language

    def normalize(self, raw: str, schema: TaskSchema) -> NormalizeResult:
        ...