wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of wisent might be problematic; see the registry listing for details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.2.dist-info/METADATA +67 -0
- wisent-0.5.2.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
wisent/core/bigcode_integration.py
@@ -0,0 +1,583 @@
"""
BigCode Evaluation Harness integration for Wisent Guard.

This module provides integration with bigcode-evaluation-harness for code generation benchmarks.
"""

import json
import logging
import os
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


class BigCodeTaskLoader:
    """Loads and manages BigCode evaluation tasks."""

    # Mapping of our task names to BigCode task names
    TASK_MAPPING = {
        # === DIRECT MATCHES ===
        "humaneval": "humaneval",
        "mbpp": "mbpp",
        "conala": "conala",
        "concode": "concode",
        "mercury": "mercury",
        # === CORRECTED MAPPINGS ===
        "humaneval_plus": "humanevalplus",
        "instructhumaneval": "instruct-humaneval",
        "mbpp_plus": "mbppplus",
        "apps": "apps-introductory",
        "ds1000": "ds1000-all-completion",
        # === MULTI-LANGUAGE TASKS ===
        "multiple_py": "multiple-py",
        "multiple_js": "multiple-js",
        "multiple_java": "multiple-java",
        "multiple_cpp": "multiple-cljcpp",
        "multiple_rs": "multiple-rs",
        "multiple_go": "multiple-go",
        # === CODE-TO-TEXT TASKS ===
        "codexglue_code_to_text_python": "codexglue_code_to_text-python",
        "codexglue_code_to_text_go": "codexglue_code_to_text-go",
        "codexglue_code_to_text_java": "codexglue_code_to_text-java",
        "codexglue_code_to_text_javascript": "codexglue_code_to_text-javascript",
        "codexglue_code_to_text_php": "codexglue_code_to_text-php",
        "codexglue_code_to_text_ruby": "codexglue_code_to_text-ruby",
        # === FIXED PROBLEMATIC MAPPINGS ===
        "recode": "perturbed-humaneval-natgen-num_seeds_1",
        "humanevalpack": None,  # ❌ REMOVED - no simple mapping exists, only complex variants
    }

    def __init__(self):
        """Initialize BigCode task loader."""
        self._bigcode_available = self._check_bigcode_available()
        self._task_cache = {}

    def _check_bigcode_available(self) -> bool:
        """Check if bigcode-evaluation-harness is available."""
        try:
            import bigcode_eval

            return True
        except ImportError:
            logger.warning("bigcode-evaluation-harness not installed")
            return False

    def is_bigcode_task(self, task_name: str) -> bool:
        """Check if a task is a BigCode task."""
        return task_name in self.TASK_MAPPING

    def load_task(self, task_name: str, limit: Optional[int] = None) -> "BigCodeTask":
        """
        Load a BigCode task.

        Args:
            task_name: Name of the task (our naming convention)
            limit: Optional limit on number of samples

        Returns:
            BigCodeTask object
        """
        if not self._bigcode_available:
            raise ImportError("bigcode-evaluation-harness not installed. Run: pip install bigcode-evaluation-harness")

        if task_name not in self.TASK_MAPPING:
            raise ValueError(f"Unknown BigCode task: {task_name}")

        bigcode_task_name = self.TASK_MAPPING[task_name]

        # Handle removed tasks with None mapping
        if bigcode_task_name is None:
            raise ValueError(f"Task '{task_name}' has been removed - no BigCode mapping available")

        # Check cache
        cache_key = f"{task_name}:{limit}"
        if cache_key in self._task_cache:
            return self._task_cache[cache_key]

        # Create task object
        task = BigCodeTask(task_name, bigcode_task_name, limit)
        self._task_cache[cache_key] = task

        return task


class BigCodeTask:
    """Represents a BigCode evaluation task."""

    def __init__(self, task_name: str, bigcode_task_name: str, limit: Optional[int] = None):
        """
        Initialize BigCode task.

        Args:
            task_name: Our task name
            bigcode_task_name: BigCode's task name
            limit: Optional limit on samples
        """
        self.task_name = task_name
        self.bigcode_task_name = bigcode_task_name
        self.limit = limit
        self._limit = limit  # Store as private attribute too
        self._data = None
        self._task_obj = None
        self._load_data()

    def _load_data(self):
        """Load task data from BigCode."""
        try:
            # Import BigCode modules
            from bigcode_eval.tasks import get_task

            # Get the task
            task = get_task(self.bigcode_task_name)
            self._task_obj = task

            # Get dataset - BigCode uses get_dataset() method
            dataset = task.get_dataset()

            # Convert to list if needed
            if hasattr(dataset, "__iter__"):
                dataset = list(dataset)

            # Apply limit if specified
            if self.limit:
                dataset = dataset[: self.limit]

            self._data = dataset

        except Exception as e:
            logger.error(f"Failed to load BigCode task {self.bigcode_task_name}: {e}")
            # Fallback to loading from files if available
            self._load_from_files()

    # Methods to match lm-eval interface
    def has_validation_docs(self) -> bool:
        """Check if task has validation documents."""
        return False  # BigCode tasks don't have separate validation sets

    def has_test_docs(self) -> bool:
        """Check if task has test documents."""
        return True  # All samples are considered test docs

    def test_docs(self) -> List[Dict[str, Any]]:
        """Get test documents."""
        return self.get_samples()

    def validation_docs(self) -> List[Dict[str, Any]]:
        """Get validation documents."""
        return []  # No separate validation set

    def doc_to_text(self, doc: Dict[str, Any]) -> str:
        """Convert document to text prompt."""
        # Handle different BigCode formats
        if "prompt" in doc:
            return doc["prompt"]
        if "text" in doc:
            return doc["text"]
        if "question" in doc:
            return doc["question"]
        if "problem" in doc:
            return doc["problem"]
        # Fallback - try to use task object if available
        if self._task_obj and hasattr(self._task_obj, "get_prompt"):
            return self._task_obj.get_prompt(doc)
        return str(doc)

    def _load_from_files(self):
        """Load task data from local files as fallback."""
        # Try to load from standard locations
        data_paths = [
            f"~/.cache/bigcode_eval/{self.bigcode_task_name}",
            f"data/{self.bigcode_task_name}",
            f"bigcode_eval/tasks/{self.bigcode_task_name}",
        ]

        for path in data_paths:
            expanded_path = os.path.expanduser(path)
            if os.path.exists(expanded_path):
                self._load_from_path(expanded_path)
                return

        # If no data found, raise error
        raise ValueError(f"No data found for task {self.task_name}. Please provide valid benchmark data.")

    def _load_from_path(self, path: str):
        """Load data from a specific path."""
        data = []

        # Look for JSON/JSONL files
        for file in Path(path).glob("*.json*"):
            with open(file) as f:
                if file.suffix == ".jsonl":
                    for line in f:
                        data.append(json.loads(line))
                else:
                    file_data = json.load(f)
                    if isinstance(file_data, list):
                        data.extend(file_data)
                    else:
                        data.append(file_data)

        if self.limit:
            data = data[: self.limit]

        self._data = data

    def get_samples(self) -> List[Dict[str, Any]]:
        """Get all samples from the task."""
        return self._data if self._data else []

    def __len__(self):
        """Get number of samples."""
        return len(self._data) if self._data else 0

    def __iter__(self):
        """Iterate over samples."""
        return iter(self.get_samples())


class BigCodeEvaluator:
    """Evaluates model outputs on BigCode benchmarks."""

    def __init__(self, docker_executor=None):
        """
        Initialize evaluator.

        Args:
            docker_executor: Optional Docker executor for secure code execution
        """
        self.docker_executor = docker_executor

    def evaluate(self, task: BigCodeTask, generations: List[str], k_values: List[int] = [1, 10, 100]) -> Dict[str, Any]:
        """
        Evaluate generations on a BigCode task.

        Args:
            task: BigCodeTask object
            generations: List of generated code solutions
            k_values: k values for pass@k metric

        Returns:
            Evaluation results dict
        """
        results = {
            "task": task.task_name,
            "num_samples": len(task),
            "num_generations": len(generations),
            "pass_at_k": {},
        }

        # For code generation tasks, we need to execute and test
        if self._is_code_execution_task(task.task_name):
            results["execution_results"] = self._evaluate_code_execution(task, generations)

            # Calculate pass@k
            for k in k_values:
                if k <= len(generations):
                    pass_rate = self._calculate_pass_at_k(results["execution_results"], k)
                    results["pass_at_k"][f"pass@{k}"] = pass_rate

        else:
            # For non-execution tasks (e.g., code-to-text), use BLEU or other metrics
            results["bleu_scores"] = self._evaluate_text_generation(task, generations)

        return results

    def _is_code_execution_task(self, task_name: str) -> bool:
        """Check if task requires code execution."""
        non_execution_tasks = {
            "codexglue_code_to_text",
            "codexglue_code_to_text_python",
            "codexglue_code_to_text_go",
            "codexglue_code_to_text_ruby",
            "codexglue_code_to_text_java",
            "codexglue_code_to_text_javascript",
            "codexglue_code_to_text_php",
        }
        return task_name not in non_execution_tasks

    def _evaluate_code_execution(self, task: BigCodeTask, generations: List[str]) -> List[Dict]:
        """Evaluate code by executing it."""
        results = []

        for i, sample in enumerate(task.get_samples()):
            sample_results = []

            for j, generation in enumerate(generations[i] if i < len(generations) else []):
                result = self._execute_and_test(sample, generation, task.task_name)
                sample_results.append(result)

            results.append({"sample_id": i, "results": sample_results})

        return results

    def _execute_and_test(self, sample: Dict, generation: str, task_name: str) -> Dict:
        """Execute generated code and run tests."""
        if self.docker_executor:
            # Use Docker for secure execution
            return self._execute_in_docker(sample, generation, task_name)
        # Fallback to subprocess (less secure)
        return self._execute_in_subprocess(sample, generation, task_name)

    def _execute_in_docker(self, sample: Dict, generation: str, task_name: str) -> Dict:
        """Execute code in Docker container."""
        # TODO: Implement Docker execution
        logger.warning("Docker execution not yet implemented, using subprocess")
        return self._execute_in_subprocess(sample, generation, task_name)

    def _execute_in_subprocess(self, sample: Dict, generation: str, task_name: str) -> Dict:
        """Execute code in subprocess (less secure)."""
        result = {"passed": False, "error": None, "output": None}

        try:
            # Create test script
            test_script = self._create_test_script(sample, generation, task_name)

            # Write to temp file
            with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
                f.write(test_script)
                temp_path = f.name

            try:
                # Execute
                proc = subprocess.run([sys.executable, temp_path], capture_output=True, text=True, timeout=10)

                if proc.returncode == 0:
                    result["passed"] = True
                    result["output"] = proc.stdout
                    logger.debug(f"✅ Code execution PASSED. Output: {proc.stdout[:200]}")
                else:
                    result["error"] = proc.stderr or proc.stdout
                    logger.debug(f"❌ Code execution FAILED. Error: {result['error'][:500]}")

            finally:
                # Clean up
                os.unlink(temp_path)

        except subprocess.TimeoutExpired:
            result["error"] = "Timeout"
        except Exception as e:
            result["error"] = str(e)

        return result

    def _create_test_script(self, sample: Dict, generation: str, task_name: str) -> str:
        """Create a test script for the sample."""
        if "humaneval" in task_name:
            script = self._create_humaneval_test_script(sample, generation)
        elif "mbpp" in task_name:
            script = self._create_mbpp_test_script(sample, generation)
        elif "apps" in task_name:
            script = self._create_apps_test_script(sample, generation)
        else:
            # Default format
            script = self._create_humaneval_test_script(sample, generation)

        logger.debug(f"📝 Test script for {task_name}:\n{script}\n")
        return script

    def _create_humaneval_test_script(self, sample: Dict, generation: str) -> str:
        """Create test script for HumanEval format."""
        entry_point = sample.get("entry_point", "solution")
        test_code = sample.get("test", "")
        prompt = sample.get("prompt", "")

        # The prompt contains the function signature, and generation should be the function body
        # We need to combine them properly
        script = f"""
{prompt}{generation}

{test_code}

if __name__ == "__main__":
    check({entry_point})
    print("All tests passed!")
"""
        return script

    def _create_mbpp_test_script(self, sample: Dict, generation: str) -> str:
        """Create test script for MBPP format."""
        test_imports = sample.get("test_imports", [])
        test_list = sample.get("test_list", [])

        # Fix function name mismatch before creating test script
        fixed_generation = self._fix_function_name_mismatch(generation, test_list)

        imports = "\n".join(test_imports)
        tests = "\n    ".join(test_list)

        script = f"""
{imports}

{fixed_generation}

if __name__ == "__main__":
    {tests}
    print("All tests passed!")
"""
        return script

    def _create_apps_test_script(self, sample: Dict, generation: str) -> str:
        """Create test script for APPS format."""
        # APPS has input/output pairs
        io_data = json.loads(sample.get("input_output", "{}"))
        inputs = io_data.get("inputs", [])
        outputs = io_data.get("outputs", [])

        tests = []
        for inp, out in zip(inputs, outputs):
            tests.append(f"assert str(solution({inp})) == '{out}'")

        test_code = "\n    ".join(tests)

        script = f"""
{generation}

if __name__ == "__main__":
    {test_code}
    print("All tests passed!")
"""
        return script

    def _fix_function_name_mismatch(self, code: str, test_list: List[str]) -> str:
        """
        Fix function name mismatches between generated code and test cases.

        Uses wrapper function approach for robustness across different code structures.

        Args:
            code: Generated code that may have wrong function name
            test_list: List of test assertions that specify expected function name

        Returns:
            Fixed code with wrapper function if needed
        """
        import re

        if not test_list or not code.strip():
            return code

        # Extract expected function name from test assertions
        expected_name = None
        # Built-in functions to skip when looking for the target function
        builtin_functions = {
            "set",
            "len",
            "str",
            "int",
            "float",
            "list",
            "tuple",
            "dict",
            "sum",
            "max",
            "min",
            "abs",
            "round",
            "sorted",
            "reversed",
        }

        for test in test_list:
            # Find all function calls in assert statements
            function_calls = re.findall(r"(\w+)\s*\(", test)

            for func_name in function_calls:
                # Skip built-in functions and common test functions
                if func_name not in builtin_functions and func_name not in {
                    "assert",
                    "assertEqual",
                    "assertTrue",
                    "assertFalse",
                }:
                    expected_name = func_name
                    break

            if expected_name:
                break

        if not expected_name:
            return code  # No function name found in tests

        # Extract actual function name from generated code
        actual_name = None
        func_match = re.search(r"def\s+(\w+)\s*\(", code)
        if func_match:
            actual_name = func_match.group(1)

        if not actual_name:
            return code  # No function definition found

        if actual_name == expected_name:
            return code  # Names already match

        logger.debug(f"🔧 Function name mismatch detected: {actual_name} → {expected_name}")
        logger.debug("   Adding wrapper function for compatibility")

        # Add wrapper function to bridge the name gap
        wrapper = f"""
# Wrapper function for test compatibility
def {expected_name}(*args, **kwargs):
    return {actual_name}(*args, **kwargs)
"""

        return code + wrapper

    def _calculate_pass_at_k(self, execution_results: List[Dict], k: int) -> float:
        """Calculate pass@k metric."""
        total_passed = 0
        total_samples = len(execution_results)

        for result in execution_results:
            sample_results = result["results"][:k]
            if any(r["passed"] for r in sample_results):
                total_passed += 1

        return total_passed / total_samples if total_samples > 0 else 0.0

    def _evaluate_text_generation(self, task: BigCodeTask, generations: List[str]) -> List[float]:
        """Evaluate text generation tasks (e.g., code-to-text)."""
        # TODO: Implement BLEU scoring
        logger.warning("Text generation evaluation not yet implemented")
        return [0.0] * len(generations)


# Main interface for BigCode integration
_loader = None
_evaluator = None


def get_bigcode_loader() -> BigCodeTaskLoader:
    """Get the global BigCode task loader."""
    global _loader
    if _loader is None:
        _loader = BigCodeTaskLoader()
    return _loader


def get_bigcode_evaluator(docker_executor=None) -> BigCodeEvaluator:
    """Get the global BigCode evaluator."""
    global _evaluator
    if _evaluator is None:
        _evaluator = BigCodeEvaluator(docker_executor)
    return _evaluator


def is_bigcode_task(task_name: str) -> bool:
    """Check if a task is from BigCode."""
    return get_bigcode_loader().is_bigcode_task(task_name)


def load_bigcode_task(task_name: str, limit: Optional[int] = None) -> BigCodeTask:
    """Load a BigCode task."""
    return get_bigcode_loader().load_task(task_name, limit)


def evaluate_bigcode_task(task: BigCodeTask, generations: List[str], docker_executor=None) -> Dict[str, Any]:
    """Evaluate generations on a BigCode task."""
    evaluator = get_bigcode_evaluator(docker_executor)
    return evaluator.evaluate(task, generations)
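For orientation, here is a minimal usage sketch of the module-level helpers added above. The sketch is not part of the diff; the task name and the candidate completions are illustrative assumptions. Note that although evaluate() annotates generations as List[str], _evaluate_code_execution indexes generations[i] and iterates over it, so callers effectively pass one list of candidate solutions per task sample:

# Hypothetical caller code (not part of the wisent package); assumes
# bigcode-evaluation-harness is installed and the "mbpp" mapping above resolves.
from wisent.core.bigcode_integration import (
    evaluate_bigcode_task,
    is_bigcode_task,
    load_bigcode_task,
)

if is_bigcode_task("mbpp"):
    task = load_bigcode_task("mbpp", limit=2)  # first two MBPP problems
    # One inner list of candidate completions per problem (illustrative strings).
    generations = [
        ["def similar_elements(a, b):\n    return tuple(set(a) & set(b))"],
        ["def is_not_prime(n):\n    return any(n % i == 0 for i in range(2, n))"],
    ]
    results = evaluate_bigcode_task(task, generations)
    print(results["pass_at_k"])  # e.g. {"pass@1": ...}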
wisent/core/contrastive_pairs/__init__.py
@@ -0,0 +1,15 @@
"""Public interface for contrastive pair utilities."""

from .core.pair import ContrastivePair
from .core.set import ContrastivePairSet
from .core.buliders import from_phrase_pairs
from .diagnostics import DiagnosticsConfig, DiagnosticsReport, run_all_diagnostics

__all__ = [
    "ContrastivePair",
    "ContrastivePairSet",
    "from_phrase_pairs",
    "DiagnosticsConfig",
    "DiagnosticsReport",
    "run_all_diagnostics",
]
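The re-exports above define the package's public surface. A short consumer-side sketch (not part of the diff) of how these names are intended to be imported:

# Hypothetical consumer code; only names re-exported by the new __init__ are used.
from wisent.core.contrastive_pairs import (
    ContrastivePair,
    ContrastivePairSet,
    from_phrase_pairs,  # re-exported from .core.buliders (module name as spelled in the package)
    DiagnosticsConfig,
    run_all_diagnostics,
)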
wisent/core/contrastive_pairs/core/atoms.py
@@ -0,0 +1,45 @@
from __future__ import annotations

from abc import ABC, abstractmethod

from typing import Iterable, TYPE_CHECKING

if TYPE_CHECKING:
    from wisent.core.activations.core.atoms import LayerActivations

__all__ = [
    "AtomResponse",
    "AtomContrastivePair",
    "AtomContrastivePairSet",
]

class AtomResponse(ABC):
    """Abstract base for a single model response."""
    model_response: str
    layers_activations: LayerActivations | None = None
    label: str | None = None


class AtomContrastivePair(ABC):
    """Abstract base for a (prompt, positive, negative) trio."""
    prompt: str
    positive_response: AtomResponse
    negative_response: AtomResponse
    label: str | None
    trait_description: str | None


class AtomContrastivePairSet(ABC):
    """Abstract base for a named collection of pairs."""
    name: str
    pairs: list[AtomContrastivePair]
    task_type: str | None

    @abstractmethod
    def add(self, pair: AtomContrastivePair) -> None: ...

    @abstractmethod
    def extend(self, pairs: Iterable[AtomContrastivePair]) -> None: ...

    def __len__(self) -> int:
        return len(self.pairs)
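These atoms only declare the interface; the concrete types live in core/response.py, core/pair.py, and core/set.py elsewhere in this release. As a rough illustration of the contract (not part of the diff; the class name and dataclass usage are assumptions), a minimal concrete pair set could look like:

# Hypothetical concrete subclass, illustrating the abstract contract above.
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Iterable


@dataclass
class SimplePairSet(AtomContrastivePairSet):
    name: str
    pairs: list[AtomContrastivePair] = field(default_factory=list)
    task_type: str | None = None

    def add(self, pair: AtomContrastivePair) -> None:
        self.pairs.append(pair)

    def extend(self, pairs: Iterable[AtomContrastivePair]) -> None:
        self.pairs.extend(pairs)


# len(SimplePairSet(name="demo")) == 0, via the inherited __len__.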
|