wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/agent/diagnose/agent_classifier_decision.py
@@ -0,0 +1,641 @@
from typing import List, Dict, Any, Optional, Set
from dataclasses import dataclass
import re
import asyncio
import time
import sys
import os

# Add the lm-harness-integration path for benchmark selection
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'lm-harness-integration'))

from .classifier_marketplace import ClassifierMarketplace, ClassifierListing, ClassifierCreationEstimate
from ..budget import get_budget_manager, track_task_performance, ResourceType

@dataclass
class TaskAnalysis:
    """Analysis of what classifiers might be needed for a task."""
    prompt_content: str
    relevant_benchmarks: List[Dict[str, Any]] = None  # Selected benchmarks for training and steering

@dataclass
class ClassifierDecision:
    """A decision about whether to use an existing classifier or create a new one."""
    benchmark_name: str
    action: str  # "use_existing", "create_new", "skip"
    selected_classifier: Optional[ClassifierListing] = None
    creation_estimate: Optional[ClassifierCreationEstimate] = None
    reasoning: str = ""
    confidence: float = 0.0

@dataclass
class SingleClassifierDecision:
    """Decision about creating one combined classifier from multiple benchmarks."""
    benchmark_names: List[str]
    action: str  # "use_existing", "create_new", "skip"
    selected_classifier: Optional[ClassifierListing] = None
    creation_estimate: Optional[ClassifierCreationEstimate] = None
    reasoning: str = ""
    confidence: float = 0.0

@dataclass
class ClassifierParams:
    """Model-determined classifier parameters."""
    optimal_layer: int  # 8-20: Based on semantic complexity needed
    classification_threshold: float  # 0.1-0.9: Based on quality strictness required
    training_samples: int  # 10-50: Based on complexity and time constraints
    classifier_type: str  # logistic/svm/neural: Based on data characteristics
    reasoning: str = ""
    model_name: str = "unknown"  # Model name for matching existing classifiers

    # Additional classifier configuration parameters
    aggregation_method: str = "last_token"  # last_token/mean/max for activation aggregation
    token_aggregation: str = "average"  # average/final/first/max/min for token score aggregation
    num_epochs: int = 50
    batch_size: int = 32
    learning_rate: float = 0.001
    early_stopping_patience: int = 10
    hidden_dim: int = 128

@dataclass
class SteeringParams:
    """Model-determined steering parameters."""
    steering_method: str  # CAA/HPR/DAC/BiPO/KSteering: Best fit for prompt type
    initial_strength: float  # 0.1-2.0: How aggressive to start
    increment: float  # 0.1-0.5: How much to increase per failed attempt
    maximum_strength: float  # 0.5-3.0: Upper limit to prevent over-steering
    method_specific_params: Dict[str, Any] = None  # Beta values, thresholds, etc.
    reasoning: str = ""

@dataclass
class QualityResult:
    """Result of quality evaluation."""
    score: float  # Classifier prediction score
    acceptable: bool  # Model judgment if quality is acceptable
    reasoning: str = ""

@dataclass
class QualityControlledResponse:
    """Final response with complete metadata."""
    response_text: str
    final_quality_score: float
    attempts_needed: int
    classifier_params_used: ClassifierParams
    steering_params_used: Optional[SteeringParams] = None
    quality_progression: List[float] = None  # Quality scores for each attempt
    total_time_seconds: float = 0.0

class AgentClassifierDecisionSystem:
    """
    Intelligent system that helps the agent make autonomous decisions about
    which classifiers to use based on task analysis and cost-benefit considerations.
    """

    def __init__(self, marketplace: ClassifierMarketplace):
        self.marketplace = marketplace
        self.decision_history: List[ClassifierDecision] = []

    def analyze_task_requirements(self, prompt: str, context: str = "",
                                  priority: str = "all", fast_only: bool = False,
                                  time_budget_minutes: float = 5.0, max_benchmarks: int = 1) -> TaskAnalysis:
        """
        Analyze a task/prompt to select relevant benchmarks for training and steering.

        Args:
            prompt: The prompt or task to analyze
            context: Additional context about the task
            priority: Priority level for benchmark selection
            fast_only: Only use fast benchmarks
            time_budget_minutes: Time budget for benchmark selection
            max_benchmarks: Maximum number of benchmarks to select
            prefer_fast: Prefer fast benchmarks

        Returns:
            Analysis with relevant benchmarks for direct use
        """
        print(f"š Analyzing task requirements for prompt...")

        # Get relevant benchmarks for the prompt using priority-aware selection
        existing_model = getattr(self.marketplace, 'model', None)
        relevant_benchmarks = self._get_relevant_benchmarks_for_prompt(
            prompt,
            existing_model=existing_model,
            priority=priority,
            fast_only=fast_only,
            time_budget_minutes=time_budget_minutes,
            max_benchmarks=max_benchmarks
        )
        print(f" š Found {len(relevant_benchmarks)} relevant benchmarks")

        return TaskAnalysis(
            prompt_content=prompt,
            relevant_benchmarks=relevant_benchmarks
        )

    def _get_relevant_benchmarks_for_prompt(self, prompt: str, existing_model=None,
                                            priority: str = "all", fast_only: bool = False,
                                            time_budget_minutes: float = 5.0, max_benchmarks: int = 1) -> List[Dict[str, Any]]:
        """Get relevant benchmarks for the prompt using the intelligent selection system with priority awareness."""
        try:
            # Import the benchmark selection function from the correct location
            sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'lm-harness-integration'))
            from populate_tasks import get_relevant_benchmarks_for_prompt

            # Use priority-aware selection with provided parameters
            relevant_benchmarks = get_relevant_benchmarks_for_prompt(
                prompt=prompt,
                max_benchmarks=max_benchmarks,
                existing_model=existing_model,
                priority=priority,
                fast_only=fast_only,
                time_budget_minutes=time_budget_minutes
            )

            return relevant_benchmarks
        except Exception as e:
            print(f" ⚠️ Failed to get relevant benchmarks: {e}")
            # Fallback to basic high-priority benchmarks
            return [
                {'benchmark': 'mmlu', 'explanation': 'General knowledge benchmark', 'relevance_score': 1, 'priority': 'high', 'loading_time': 9.5},
                {'benchmark': 'truthfulqa_mc1', 'explanation': 'Truthfulness benchmark', 'relevance_score': 2, 'priority': 'high', 'loading_time': 11.2},
                {'benchmark': 'hellaswag', 'explanation': 'Commonsense reasoning benchmark', 'relevance_score': 3, 'priority': 'high', 'loading_time': 12.8}
            ]

    async def create_single_quality_classifier(self,
                                               task_analysis: TaskAnalysis,
                                               classifier_params: 'ClassifierParams',
                                               quality_threshold: float = 0.3,
                                               time_budget_minutes: float = 10.0) -> SingleClassifierDecision:
        """
        Create a single classifier trained on one benchmark.

        Args:
            task_analysis: Analysis with relevant benchmarks
            classifier_params: Model-determined classifier parameters
            quality_threshold: Minimum quality score to accept existing classifiers
            time_budget_minutes: Maximum time budget for creating new classifiers

        Returns:
            Single classifier decision for the selected benchmark
        """
        print(f"š Creating single quality classifier from {len(task_analysis.relevant_benchmarks)} benchmark(s)...")

        # Extract benchmark names (should be just one now)
        benchmark_names = [b['benchmark'] for b in task_analysis.relevant_benchmarks]

        if not benchmark_names:
            return SingleClassifierDecision(
                benchmark_names=[],
                action="skip",
                reasoning="No benchmarks selected for classifier training",
                confidence=0.0
            )

        # Use first (and should be only) benchmark
        benchmark_name = benchmark_names[0]
        print(f" š Using benchmark: {benchmark_name}")

        # Set up budget manager
        budget_manager = get_budget_manager()
        budget_manager.set_time_budget(time_budget_minutes)

        # Look for existing classifier for this exact model/layer/benchmark combination
        available_classifiers = self.marketplace.discover_available_classifiers()
        model_name = classifier_params.model_name if hasattr(classifier_params, 'model_name') else "unknown"
        layer = classifier_params.optimal_layer

        # Create specific classifier identifier
        classifier_id = f"{model_name}_{benchmark_name}_layer_{layer}"

        print(f" š Checking for existing classifier: {classifier_id}")

        # Find existing classifier with exact match
        existing_classifier = None
        for classifier in available_classifiers:
            # Check if classifier matches our exact requirements
            if (benchmark_name.lower() in classifier.path.lower() and
                str(layer) in classifier.path and
                classifier.layer == layer):
                existing_classifier = classifier
                print(f" ✅ Found existing classifier: {classifier.path}")
                break

        # Decision logic for single benchmark classifier
        if existing_classifier and existing_classifier.quality_score >= quality_threshold:
            return SingleClassifierDecision(
                benchmark_names=[benchmark_name],
                action="use_existing",
                selected_classifier=existing_classifier,
                reasoning=f"Found existing classifier for {benchmark_name} at layer {layer} with quality {existing_classifier.quality_score:.2f}",
                confidence=existing_classifier.quality_score
            )

        # Get creation estimate for single benchmark classifier
        creation_estimate = self.marketplace.get_creation_estimate(benchmark_name)

        # Check if we can afford to create new classifier
        training_time_seconds = creation_estimate.estimated_training_time_minutes * 60
        time_budget = budget_manager.get_budget(ResourceType.TIME)

        if time_budget.can_afford(training_time_seconds):
            return SingleClassifierDecision(
                benchmark_names=[benchmark_name],
                action="create_new",
                creation_estimate=creation_estimate,
                reasoning=f"Creating new classifier for {benchmark_name} at layer {layer}",
                confidence=creation_estimate.confidence
            )
        else:
            return SingleClassifierDecision(
                benchmark_names=[benchmark_name],
                action="skip",
                reasoning=f"Insufficient time budget for creation (need {creation_estimate.estimated_training_time_minutes:.1f}min)",
                confidence=0.0
            )

    async def execute_single_classifier_decision(self, decision: SingleClassifierDecision, classifier_params: 'ClassifierParams') -> Optional[Any]:
        """
        Execute the single classifier decision to create or use the benchmark classifier.

        Args:
            decision: The single classifier decision to execute
            classifier_params: Model-determined classifier parameters

        Returns:
            The trained classifier instance or None if skipped
        """
        if decision.action == "skip":
            print(f" ℹ️ Skipping classifier creation: {decision.reasoning}")
            return None

        elif decision.action == "use_existing":
            print(f" 📦 Using existing classifier: {decision.selected_classifier.path}")
            print(f" Quality: {decision.selected_classifier.quality_score:.3f}")
            print(f" Layer: {decision.selected_classifier.layer}")
            return decision.selected_classifier

        elif decision.action == "create_new":
            benchmark_name = decision.benchmark_names[0] if decision.benchmark_names else "unknown"
            print(f" 🏗️ Creating new classifier for benchmark: {benchmark_name}")
            start_time = time.time()
            try:
                # Create classifier using single benchmark training data
                new_classifier = await self._create_single_benchmark_classifier(
                    benchmark_name=benchmark_name,
                    classifier_params=classifier_params
                )

                creation_time = time.time() - start_time
                print(f" ✅ Classifier created successfully in {creation_time:.1f}s")
                return new_classifier

            except Exception as e:
                print(f" ❌ Failed to create classifier: {e}")
                return None

        return None

    async def _create_single_benchmark_classifier(self, benchmark_name: str, classifier_params: 'ClassifierParams') -> Optional[Any]:
        """
        Create a classifier for a single benchmark.

        Args:
            benchmark_name: Name of the benchmark to use for training
            classifier_params: Model-determined classifier parameters

        Returns:
            The trained classifier instance or None if failed
        """
        from .create_classifier import ClassifierCreator
        from ...training_config import TrainingConfig

        try:
            # Create training config
            config = TrainingConfig(
                issue_type=f"quality_{benchmark_name}",
                layer=classifier_params.optimal_layer,
                classifier_type=classifier_params.classifier_type,
                threshold=classifier_params.classification_threshold,
                training_samples=classifier_params.training_samples,
                model_name=self.marketplace.model.name if self.marketplace.model else "unknown"
            )

            # Create classifier creator
            creator = ClassifierCreator(self.marketplace.model)

            # Create classifier using benchmark-specific training data
            result = await creator.create_classifier_for_issue_with_benchmarks(
                issue_type=f"quality_{benchmark_name}",
                relevant_benchmarks=[benchmark_name],
                layer=classifier_params.optimal_layer,
                num_samples=classifier_params.training_samples,
                config=config
            )

            return result.classifier if result else None

        except Exception as e:
            print(f" ❌ Error in single benchmark classifier creation: {e}")
            raise

    async def _create_combined_classifier(self, benchmark_names: List[str], classifier_params: 'ClassifierParams'):
        """
        Create a classifier using combined training data from multiple benchmarks.

        Args:
            benchmark_names: List of benchmark names to combine
            classifier_params: Model-determined parameters for classifier creation

        Returns:
            Trained classifier instance
        """
        from .create_classifier import ClassifierCreator

        try:
            # Initialize classifier creator
            creator = ClassifierCreator(self.marketplace.model)

            # Create classifier using combined benchmark training data
            print(f" š Loading combined training data from benchmarks: {benchmark_names}")
            classifier = await creator.create_combined_benchmark_classifier(
                benchmark_names=benchmark_names,
                classifier_params=classifier_params
            )

            return classifier

        except Exception as e:
            print(f" ❌ Error in combined classifier creation: {e}")
            raise

    async def make_classifier_decisions(self,
                                        task_analysis: TaskAnalysis,
                                        quality_threshold: float = 0.3,
                                        time_budget_minutes: float = 10.0,
                                        max_classifiers: int = None) -> List[ClassifierDecision]:
        """
        Make decisions about which benchmark-specific classifiers to create or use.

        Args:
            task_analysis: Analysis with relevant benchmarks
            quality_threshold: Minimum quality score to accept existing classifiers
            time_budget_minutes: Maximum time budget for creating new classifiers
            max_classifiers: Maximum number of classifiers to use (None = no limit)

        Returns:
            List of classifier decisions for each benchmark
        """
        # Set up budget manager
        budget_manager = get_budget_manager()
        budget_manager.set_time_budget(time_budget_minutes)

        # Discover available classifiers
        await asyncio.sleep(0)  # Make this async-compatible
        available_classifiers = self.marketplace.discover_available_classifiers()

        decisions = []
        classifier_count = 0

        # Create one classifier per relevant benchmark
        for benchmark_info in task_analysis.relevant_benchmarks:
            if max_classifiers and classifier_count >= max_classifiers:
                print(f" ℹ️ Reached maximum classifier limit ({max_classifiers})")
                break

            benchmark_name = benchmark_info['benchmark']
            print(f"\n š Analyzing classifier for benchmark: {benchmark_name}")

            # Look for existing benchmark-specific classifier
            existing_options = [c for c in available_classifiers if benchmark_name.lower() in c.path.lower()]
            best_existing = max(existing_options, key=lambda x: x.quality_score) if existing_options else None

            # Get creation estimate for this benchmark
            creation_estimate = self.marketplace.get_creation_estimate(benchmark_name)

            # Make decision based on multiple factors
            decision = self._evaluate_benchmark_classifier_options(
                benchmark_name=benchmark_name,
                best_existing=best_existing,
                creation_estimate=creation_estimate,
                quality_threshold=quality_threshold,
                budget_manager=budget_manager
            )

            decisions.append(decision)

            # Update budget and count
            if decision.action == "create_new":
                training_time_seconds = creation_estimate.estimated_training_time_minutes * 60
                budget_manager.get_budget(ResourceType.TIME).spend(training_time_seconds)
                classifier_count += 1
                remaining_minutes = budget_manager.get_budget(ResourceType.TIME).remaining_budget / 60
                print(f" ⏱️ Remaining time budget: {remaining_minutes:.1f} minutes")
            elif decision.action == "use_existing":
                classifier_count += 1

            print(f" ✅ Decision: {decision.action} - {decision.reasoning}")

        # Store decisions in history
        self.decision_history.extend(decisions)

        return decisions

    def _evaluate_benchmark_classifier_options(self,
                                               benchmark_name: str,
                                               best_existing: Optional[ClassifierListing],
                                               creation_estimate: ClassifierCreationEstimate,
                                               quality_threshold: float,
                                               budget_manager) -> ClassifierDecision:
        """Evaluate whether to use existing, create new, or skip a benchmark-specific classifier."""

        # Factor 1: Existing classifier quality
        existing_quality = best_existing.quality_score if best_existing else 0.0

        # Factor 2: Time constraints
        time_budget = budget_manager.get_budget(ResourceType.TIME)
        training_time_seconds = creation_estimate.estimated_training_time_minutes * 60
        can_afford_creation = time_budget.can_afford(training_time_seconds)

        # Factor 3: Expected benefit vs cost
        creation_benefit = creation_estimate.estimated_quality_score
        existing_benefit = existing_quality

        # Decision logic
        if best_existing and existing_quality >= quality_threshold:
            if existing_quality >= creation_benefit or not can_afford_creation:
                return ClassifierDecision(
                    benchmark_name=benchmark_name,
                    action="use_existing",
                    selected_classifier=best_existing,
                    reasoning=f"Existing classifier quality {existing_quality:.2f} meets threshold",
                    confidence=existing_quality
                )

        if can_afford_creation and creation_benefit > existing_benefit:
            return ClassifierDecision(
                benchmark_name=benchmark_name,
                action="create_new",
                creation_estimate=creation_estimate,
                reasoning=f"Creating new classifier (est. quality {creation_benefit:.2f} > existing {existing_benefit:.2f})",
                confidence=creation_estimate.confidence
            )

        if best_existing:
            return ClassifierDecision(
                benchmark_name=benchmark_name,
                action="use_existing",
                selected_classifier=best_existing,
                reasoning=f"Using existing despite low quality - time/budget constraints",
                confidence=existing_quality * 0.7  # Penalty for low quality
            )

        return ClassifierDecision(
            benchmark_name=benchmark_name,
            action="skip",
            reasoning="No suitable existing classifier and cannot create new within budget",
            confidence=0.0
        )

    async def execute_decisions(self, decisions: List[ClassifierDecision]) -> List[Dict[str, Any]]:
        """
        Execute the classifier decisions and return the final classifier configs.

        Args:
            decisions: List of decisions to execute

        Returns:
            List of classifier configurations ready for use
        """
        classifier_configs = []

        for decision in decisions:
            if decision.action == "skip":
                continue

            elif decision.action == "use_existing":
                config = decision.selected_classifier.to_config()
                classifier_configs.append(config)
                print(f" š Using existing {decision.issue_type} classifier: {config['path']}")

            elif decision.action == "create_new":
                print(f" 🏗️ Creating new classifier for benchmark: {decision.benchmark_name}...")
                start_time = time.time()
                try:
                    # Create benchmark-specific classifier
                    new_classifier = await self._create_classifier_for_benchmark(
                        benchmark_name=decision.benchmark_name
                    )

                    end_time = time.time()

                    # Track performance for future budget estimates
                    track_task_performance(
                        task_name=f"classifier_training_{decision.benchmark_name}",
                        start_time=start_time,
                        end_time=end_time
                    )

                    config = new_classifier.to_config()
                    config['benchmark'] = decision.benchmark_name
                    classifier_configs.append(config)
                    print(f" ✅ Created: {config['path']} (took {end_time - start_time:.1f}s)")
                except Exception as e:
                    print(f" ❌ Failed to create {decision.benchmark_name} classifier: {e}")
                    continue

        return classifier_configs

    async def _create_classifier_for_benchmark(self, benchmark_name: str):
        """
        Create a classifier trained specifically on a benchmark dataset.

        Args:
            benchmark_name: Name of the benchmark to train on

        Returns:
            Trained classifier instance
        """
        from .create_classifier import ClassifierCreator

        try:
            # Initialize classifier creator
            creator = ClassifierCreator(self.marketplace.model)

            # Create classifier using benchmark-specific training data
            print(f" š Loading training data from benchmark: {benchmark_name}")
            classifier = await creator.create_classifier_for_issue_with_benchmarks(
                issue_type=benchmark_name,  # Use benchmark name as issue type
                relevant_benchmarks=[benchmark_name],
                num_samples=50
            )

            return classifier

        except Exception as e:
            print(f" ⚠️ Benchmark-based creation failed: {e}")
            raise e

    def get_decision_summary(self) -> str:
        """Get a summary of recent classifier decisions."""
        if not self.decision_history:
            return "No classifier decisions made yet."

        recent_decisions = self.decision_history[-10:]  # Last 10 decisions

        summary = "\n🤖 Recent Classifier Decisions\n"
        summary += "=" * 40 + "\n"

        action_counts = {}
        for decision in recent_decisions:
            action_counts[decision.action] = action_counts.get(decision.action, 0) + 1

        summary += f"Actions taken: {dict(action_counts)}\n\n"

        for decision in recent_decisions[-5:]:  # Show last 5
            summary += f"• {decision.benchmark_name}: {decision.action}\n"
            summary += f" Reasoning: {decision.reasoning}\n"
            summary += f" Confidence: {decision.confidence:.2f}\n\n"

        return summary

    async def smart_classifier_selection(self,
                                         prompt: str,
                                         context: str = "",
                                         quality_threshold: float = 0.3,
                                         time_budget_minutes: float = 10.0,
                                         max_classifiers: int = None) -> List[Dict[str, Any]]:
        """
        One-stop method for intelligent classifier selection.

        Args:
            prompt: The task/prompt to analyze
            context: Additional context
            quality_threshold: Minimum quality for existing classifiers
            time_budget_minutes: Time budget for creating new classifiers
            max_classifiers: Maximum number of classifiers to use

        Returns:
            List of classifier configurations ready for use
        """
        print(f"🧠 Smart classifier selection for task...")

        # Step 1: Analyze task requirements
        task_analysis = self.analyze_task_requirements(prompt, context)

        # Step 2: Make decisions about classifiers
        decisions = await self.make_classifier_decisions(
            task_analysis=task_analysis,
            quality_threshold=quality_threshold,
            time_budget_minutes=time_budget_minutes,
            max_classifiers=max_classifiers
        )

        # Step 3: Execute decisions
        classifier_configs = await self.execute_decisions(decisions)

        print(f"🎯 Selected {len(classifier_configs)} classifiers for the task")
        return classifier_configs