devsquad 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devsquad-3.6.0.dist-info/METADATA +944 -0
- devsquad-3.6.0.dist-info/RECORD +95 -0
- devsquad-3.6.0.dist-info/WHEEL +5 -0
- devsquad-3.6.0.dist-info/entry_points.txt +2 -0
- devsquad-3.6.0.dist-info/licenses/LICENSE +21 -0
- devsquad-3.6.0.dist-info/top_level.txt +2 -0
- scripts/__init__.py +0 -0
- scripts/ai_semantic_matcher.py +512 -0
- scripts/alert_manager.py +505 -0
- scripts/api/__init__.py +43 -0
- scripts/api/models.py +386 -0
- scripts/api/routes/__init__.py +20 -0
- scripts/api/routes/dispatch.py +348 -0
- scripts/api/routes/lifecycle.py +330 -0
- scripts/api/routes/metrics_gates.py +347 -0
- scripts/api_server.py +318 -0
- scripts/auth.py +451 -0
- scripts/cli/__init__.py +1 -0
- scripts/cli/cli_visual.py +642 -0
- scripts/cli.py +1094 -0
- scripts/collaboration/__init__.py +212 -0
- scripts/collaboration/_version.py +1 -0
- scripts/collaboration/agent_briefing.py +656 -0
- scripts/collaboration/ai_semantic_matcher.py +260 -0
- scripts/collaboration/anchor_checker.py +281 -0
- scripts/collaboration/anti_rationalization.py +470 -0
- scripts/collaboration/async_integration_example.py +255 -0
- scripts/collaboration/batch_scheduler.py +149 -0
- scripts/collaboration/checkpoint_manager.py +561 -0
- scripts/collaboration/ci_feedback_adapter.py +351 -0
- scripts/collaboration/code_map_generator.py +247 -0
- scripts/collaboration/concern_pack_loader.py +352 -0
- scripts/collaboration/confidence_score.py +496 -0
- scripts/collaboration/config_loader.py +188 -0
- scripts/collaboration/consensus.py +244 -0
- scripts/collaboration/context_compressor.py +533 -0
- scripts/collaboration/coordinator.py +668 -0
- scripts/collaboration/dispatcher.py +1636 -0
- scripts/collaboration/dual_layer_context.py +128 -0
- scripts/collaboration/enhanced_worker.py +539 -0
- scripts/collaboration/feature_usage_tracker.py +206 -0
- scripts/collaboration/five_axis_consensus.py +334 -0
- scripts/collaboration/input_validator.py +401 -0
- scripts/collaboration/integration_example.py +287 -0
- scripts/collaboration/intent_workflow_mapper.py +350 -0
- scripts/collaboration/language_parsers.py +269 -0
- scripts/collaboration/lifecycle_protocol.py +1446 -0
- scripts/collaboration/llm_backend.py +453 -0
- scripts/collaboration/llm_cache.py +448 -0
- scripts/collaboration/llm_cache_async.py +347 -0
- scripts/collaboration/llm_retry.py +387 -0
- scripts/collaboration/llm_retry_async.py +389 -0
- scripts/collaboration/mce_adapter.py +597 -0
- scripts/collaboration/memory_bridge.py +1607 -0
- scripts/collaboration/models.py +537 -0
- scripts/collaboration/null_providers.py +297 -0
- scripts/collaboration/operation_classifier.py +289 -0
- scripts/collaboration/output_slicer.py +225 -0
- scripts/collaboration/performance_monitor.py +462 -0
- scripts/collaboration/permission_guard.py +865 -0
- scripts/collaboration/prompt_assembler.py +756 -0
- scripts/collaboration/prompt_variant_generator.py +483 -0
- scripts/collaboration/protocols.py +267 -0
- scripts/collaboration/report_formatter.py +352 -0
- scripts/collaboration/retrospective.py +279 -0
- scripts/collaboration/role_matcher.py +92 -0
- scripts/collaboration/role_template_market.py +352 -0
- scripts/collaboration/rule_collector.py +678 -0
- scripts/collaboration/scratchpad.py +346 -0
- scripts/collaboration/skill_registry.py +151 -0
- scripts/collaboration/skillifier.py +878 -0
- scripts/collaboration/standardized_role_template.py +317 -0
- scripts/collaboration/task_completion_checker.py +237 -0
- scripts/collaboration/test_quality_guard.py +695 -0
- scripts/collaboration/unified_gate_engine.py +598 -0
- scripts/collaboration/usage_tracker.py +309 -0
- scripts/collaboration/user_friendly_error.py +176 -0
- scripts/collaboration/verification_gate.py +312 -0
- scripts/collaboration/warmup_manager.py +635 -0
- scripts/collaboration/worker.py +513 -0
- scripts/collaboration/workflow_engine.py +684 -0
- scripts/dashboard.py +1088 -0
- scripts/generate_benchmark_report.py +786 -0
- scripts/history_manager.py +604 -0
- scripts/mcp_server.py +289 -0
- skills/__init__.py +32 -0
- skills/dispatch/handler.py +52 -0
- skills/intent/handler.py +59 -0
- skills/registry.py +67 -0
- skills/retrospective/__init__.py +0 -0
- skills/retrospective/handler.py +125 -0
- skills/review/handler.py +356 -0
- skills/security/handler.py +454 -0
- skills/test/__init__.py +0 -0
- skills/test/handler.py +78 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Confidence Score System
|
|
5
|
+
|
|
6
|
+
Provides confidence scoring for LLM responses to help agents make better decisions.
|
|
7
|
+
|
|
8
|
+
Features:
|
|
9
|
+
- Multi-factor confidence calculation
|
|
10
|
+
- Response quality assessment
|
|
11
|
+
- Uncertainty detection
|
|
12
|
+
- Threshold-based decision making
|
|
13
|
+
- Historical confidence tracking
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
from scripts.collaboration.confidence_score import ConfidenceScorer
|
|
17
|
+
|
|
18
|
+
scorer = ConfidenceScorer()
|
|
19
|
+
|
|
20
|
+
# Calculate confidence for a response
|
|
21
|
+
score = scorer.calculate_confidence(
|
|
22
|
+
prompt="Design a REST API",
|
|
23
|
+
response="Here's the API design...",
|
|
24
|
+
metadata={"model": "gpt-4", "temperature": 0.7}
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Check if confidence meets threshold
|
|
28
|
+
if score.is_confident(threshold=0.7):
|
|
29
|
+
# Proceed with high confidence
|
|
30
|
+
pass
|
|
31
|
+
else:
|
|
32
|
+
# Request human review or retry
|
|
33
|
+
pass
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
import re
|
|
37
|
+
import logging
|
|
38
|
+
from typing import Dict, Any, List, Optional, Tuple
|
|
39
|
+
from dataclasses import dataclass, field
|
|
40
|
+
from datetime import datetime
|
|
41
|
+
from enum import Enum
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ConfidenceLevel(Enum):
    """Named confidence bands, ordered from most to least confident.

    The numeric cut-off noted beside each member is the lower bound of the
    overall score that maps to that band.
    """

    VERY_HIGH = "very_high"  # overall score >= 0.9
    HIGH = "high"            # overall score >= 0.7
    MEDIUM = "medium"        # overall score >= 0.5
    LOW = "low"              # overall score >= 0.3
    VERY_LOW = "very_low"    # overall score <  0.3
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class ConfidenceScore:
    """Outcome of scoring a single response.

    Holds the weighted overall score, the band it falls into, the individual
    factor scores and the human-readable notes produced while scoring.
    """

    # Weighted overall score, 0.0 to 1.0
    overall_score: float
    # Band the overall score falls into
    level: ConfidenceLevel
    # Score of each individual factor, keyed by factor name
    factors: Dict[str, float]
    # Human-readable explanation of how the score came about
    reasoning: List[str]
    # Caller-supplied metadata; a fresh dict per instance when omitted
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Creation time as a POSIX timestamp, taken when the instance is built
    timestamp: float = field(default_factory=lambda: datetime.now().timestamp())

    def is_confident(self, threshold: float = 0.7) -> bool:
        """Return True when the overall score is at or above *threshold*."""
        meets_threshold = self.overall_score >= threshold
        return meets_threshold

    def to_dict(self) -> Dict[str, Any]:
        """Return the score as a plain dict, with the level given by its string value."""
        payload = dict(
            overall_score=self.overall_score,
            level=self.level.value,
            factors=self.factors,
            reasoning=self.reasoning,
            metadata=self.metadata,
            timestamp=self.timestamp,
        )
        return payload
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class ConfidenceScorer:
    """
    Confidence Scorer for LLM Responses

    Calculates confidence based on multiple factors:
    1. Response completeness
    2. Uncertainty indicators
    3. Specificity and detail
    4. Consistency
    5. Model metadata (temperature, model quality)

    Every factor is a text/metadata heuristic that yields a score between
    0.0 and 1.0. ``calculate_confidence`` combines the factors as a weighted
    sum and appends each result to ``history``, which is capped at the
    ``_max_history`` (1000) most recent scores.
    """

    # Uncertainty phrases that indicate low confidence
    UNCERTAINTY_PHRASES = [
        "i think", "maybe", "perhaps", "possibly", "might be",
        "could be", "not sure", "uncertain", "unclear", "ambiguous",
        "i'm not certain", "i don't know", "hard to say",
        "it depends", "it's possible", "it seems", "appears to be"
    ]

    # Hedging words that reduce confidence
    HEDGING_WORDS = [
        "probably", "likely", "unlikely", "somewhat", "fairly",
        "relatively", "generally", "typically", "usually", "often"
    ]

    def __init__(
        self,
        weights: Optional[Dict[str, float]] = None,
        min_response_length: int = 50
    ):
        """
        Initialize confidence scorer

        Args:
            weights: Custom weight per factor name. When omitted (or empty) the
                built-in weights are used: completeness 0.25, certainty 0.25,
                specificity 0.20, consistency 0.15, model_quality 0.15.
            min_response_length: Minimum expected response length in characters
        """
        self.weights = weights or {
            "completeness": 0.25,
            "certainty": 0.25,
            "specificity": 0.20,
            "consistency": 0.15,
            "model_quality": 0.15
        }
        self.min_response_length = min_response_length
        self._max_history = 1000

        self.history: List[ConfidenceScore] = []

    def calculate_confidence(
        self,
        prompt: str,
        response: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> ConfidenceScore:
        """
        Calculate confidence score for a response

        Args:
            prompt: Original prompt
            response: LLM response
            metadata: Additional metadata (model, temperature, token_count)

        Returns:
            ConfidenceScore object; it is also appended to ``self.history``

        Raises:
            KeyError: If ``self.weights`` names a factor this scorer does not compute
        """
        metadata = metadata or {}
        factors = {}
        reasoning = []

        # 1. Completeness score
        completeness, comp_reason = self._calculate_completeness(prompt, response)
        factors["completeness"] = completeness
        reasoning.extend(comp_reason)

        # 2. Certainty score (inverse of uncertainty)
        certainty, cert_reason = self._calculate_certainty(response)
        factors["certainty"] = certainty
        reasoning.extend(cert_reason)

        # 3. Specificity score
        specificity, spec_reason = self._calculate_specificity(response)
        factors["specificity"] = specificity
        reasoning.extend(spec_reason)

        # 4. Consistency score
        consistency, cons_reason = self._calculate_consistency(response)
        factors["consistency"] = consistency
        reasoning.extend(cons_reason)

        # 5. Model quality score
        model_quality, model_reason = self._calculate_model_quality(metadata)
        factors["model_quality"] = model_quality
        reasoning.extend(model_reason)

        # Calculate weighted overall score
        overall_score = sum(
            factors[factor] * self.weights[factor]
            for factor in self.weights
        )
        # Custom weights need not sum to 1.0; keep the result inside the
        # documented 0.0-1.0 range so thresholds and levels stay meaningful.
        overall_score = min(max(overall_score, 0.0), 1.0)

        # Determine confidence level
        level = self._determine_level(overall_score)

        # Create confidence score object
        score = ConfidenceScore(
            overall_score=overall_score,
            level=level,
            factors=factors,
            reasoning=reasoning,
            metadata=metadata
        )

        # Add to history, keeping only the most recent entries
        self.history.append(score)

        if len(self.history) > self._max_history:
            self.history = self.history[-self._max_history:]

        return score

    def _calculate_completeness(
        self,
        prompt: str,
        response: str
    ) -> Tuple[float, List[str]]:
        """Calculate response completeness.

        Scores the response length against ``min_response_length`` and then
        applies penalties for ellipses and truncation markers. ``prompt`` is
        accepted for interface symmetry but is not used by the heuristic.

        Returns:
            (score, reasoning) where score is in 0.0-0.9
        """
        reasoning = []
        score = 0.0

        # Check response length
        if len(response) < self.min_response_length:
            score = 0.3
            reasoning.append(f"Response too short ({len(response)} chars)")
        elif len(response) < self.min_response_length * 2:
            score = 0.6
            reasoning.append("Response length adequate")
        else:
            score = 0.9
            reasoning.append("Response length good")

        # Check for incomplete sentences
        if response.endswith("...") or response.count("...") > 2:
            score *= 0.7
            reasoning.append("Response appears incomplete (ellipsis)")

        # Check for truncation indicators
        truncation_indicators = ["[truncated]", "[continued]", "...and more"]
        if any(indicator in response.lower() for indicator in truncation_indicators):
            score *= 0.5
            reasoning.append("Response appears truncated")

        return min(score, 1.0), reasoning

    def _calculate_certainty(self, response: str) -> Tuple[float, List[str]]:
        """Calculate response certainty (inverse of uncertainty).

        Each distinct uncertainty phrase found counts as 1 indicator and each
        distinct hedging word as 0.5; the total selects one of four score tiers.

        Returns:
            (score, reasoning) where score is one of 1.0, 0.8, 0.5, 0.2
        """
        reasoning = []
        response_lower = response.lower()

        # Count uncertainty phrases
        uncertainty_count = sum(
            1 for phrase in self.UNCERTAINTY_PHRASES
            if phrase in response_lower
        )

        # Count hedging words. Word boundaries (rather than surrounding
        # spaces) so that "probably," or "likely." at the end of a sentence or
        # line are counted, while "likely" is still not found inside "unlikely".
        hedging_count = sum(
            1 for word in self.HEDGING_WORDS
            if re.search(r"\b" + re.escape(word) + r"\b", response_lower)
        )

        # Calculate certainty score
        total_uncertainty = uncertainty_count + (hedging_count * 0.5)

        if total_uncertainty == 0:
            score = 1.0
            reasoning.append("No uncertainty indicators found")
        elif total_uncertainty <= 2:
            score = 0.8
            reasoning.append(f"Minor uncertainty ({int(total_uncertainty)} indicators)")
        elif total_uncertainty <= 5:
            score = 0.5
            reasoning.append(f"Moderate uncertainty ({int(total_uncertainty)} indicators)")
        else:
            score = 0.2
            reasoning.append(f"High uncertainty ({int(total_uncertainty)} indicators)")

        return score, reasoning

    def _calculate_specificity(self, response: str) -> Tuple[float, List[str]]:
        """Calculate response specificity and detail level.

        Looks for four kinds of concrete detail (digits, code spans, example
        markers, list items), maps how many kinds are present to a base score,
        and applies a penalty when vague words occur more than five times.

        Returns:
            (score, reasoning)
        """
        reasoning = []

        # Check for specific details (numbers, code, examples)
        has_numbers = bool(re.search(r'\d+', response))
        has_code = bool(re.search(r'```|`[^`]+`', response))
        # Whole-word match so "like" is not found inside "unlikely"/"likely".
        # "e.g." is handled separately: a trailing \b cannot follow its final dot.
        has_examples = bool(re.search(
            r'\b(?:for example|such as|like)\b|\be\.g\.',
            response,
            re.IGNORECASE
        ))
        has_lists = bool(re.search(r'^\s*[-*\d]+\.?\s', response, re.MULTILINE))

        specificity_indicators = sum([has_numbers, has_code, has_examples, has_lists])

        if specificity_indicators >= 3:
            score = 1.0
            reasoning.append("High specificity (numbers, code, examples, lists)")
        elif specificity_indicators == 2:
            score = 0.7
            reasoning.append("Good specificity (some concrete details)")
        elif specificity_indicators == 1:
            score = 0.5
            reasoning.append("Moderate specificity (limited details)")
        else:
            score = 0.3
            reasoning.append("Low specificity (mostly abstract)")

        # Check for vague language. Count every whole-word occurrence: counting
        # each phrase at most once would cap the tally at the number of phrases
        # and make the "> 5" threshold reachable only if all of them appeared.
        vague_phrases = ["something", "somehow", "various", "several", "many", "some"]
        vague_pattern = r"\b(?:" + "|".join(re.escape(p) for p in vague_phrases) + r")\b"
        vague_count = len(re.findall(vague_pattern, response, re.IGNORECASE))

        if vague_count > 5:
            score *= 0.7
            reasoning.append(f"Contains vague language ({vague_count} instances)")

        return score, reasoning

    def _calculate_consistency(self, response: str) -> Tuple[float, List[str]]:
        """Calculate internal consistency of response.

        Starts at 1.0, multiplies by 0.6 for every contradiction pattern that
        matches, and subtracts 10% per distinct self-correction phrase found.

        Returns:
            (score, reasoning) where score is in 0.0-1.0
        """
        reasoning = []
        score = 1.0

        # Check for contradictions. The keywords are anchored on word
        # boundaries; unanchored, "no" would match inside "know"/"not"/"now"
        # and nearly any text containing "true" or "correct" would be flagged.
        contradiction_patterns = [
            (r"\b(yes|true|correct)\b.*?\b(no|false|incorrect)\b", "yes/no contradiction"),
            (r"\b(always|never)\b.*?\b(sometimes|occasionally)\b", "absolute/conditional contradiction"),
            (r"\b(should|must)\b.*?\b(should not|must not)\b", "directive contradiction")
        ]

        for pattern, description in contradiction_patterns:
            if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
                score *= 0.6
                reasoning.append(f"Potential contradiction: {description}")

        # Check for self-corrections
        correction_phrases = ["actually", "correction", "i mean", "rather", "instead"]
        response_lower = response.lower()
        correction_count = sum(
            1 for phrase in correction_phrases
            if phrase in response_lower
        )

        if correction_count > 0:
            score *= (1.0 - (correction_count * 0.1))
            reasoning.append(f"Contains self-corrections ({correction_count})")

        if score == 1.0:
            reasoning.append("No consistency issues detected")

        return max(score, 0.0), reasoning

    def _calculate_model_quality(
        self,
        metadata: Dict[str, Any]
    ) -> Tuple[float, List[str]]:
        """Calculate score based on model metadata.

        Reads the optional ``model``, ``temperature`` and ``token_count`` keys.
        Missing, ``None`` or non-numeric values are treated as "not provided"
        instead of raising.

        Returns:
            (score, reasoning) where score is capped at 1.0
        """
        reasoning = []
        score = 0.7  # Default score

        # Model quality tiers. `or ""` covers an explicit None value, and
        # str() covers non-string identifiers.
        model = str(metadata.get("model") or "").lower()
        if "gpt-4" in model or "claude-3" in model:
            score = 0.95
            reasoning.append("High-quality model (GPT-4/Claude-3)")
        elif "gpt-3.5" in model or "claude-2" in model:
            score = 0.8
            reasoning.append("Good quality model (GPT-3.5/Claude-2)")
        elif model:
            score = 0.6
            reasoning.append(f"Standard model ({model})")
        else:
            reasoning.append("Model unknown (default score)")

        # Temperature adjustment (only when a numeric temperature was supplied)
        temperature = metadata.get("temperature")
        if isinstance(temperature, (int, float)):
            if temperature <= 0.3:
                score *= 1.1  # More deterministic = higher confidence
                reasoning.append(f"Low temperature ({temperature}) increases confidence")
            elif temperature >= 0.9:
                score *= 0.9  # More creative = lower confidence
                reasoning.append(f"High temperature ({temperature}) reduces confidence")

        # Token count (longer responses may indicate more thought)
        token_count = metadata.get("token_count") or 0
        if isinstance(token_count, (int, float)) and token_count > 1000:
            score *= 1.05
            reasoning.append("Detailed response (high token count)")

        return min(score, 1.0), reasoning

    def _determine_level(self, score: float) -> ConfidenceLevel:
        """Determine confidence level from score"""
        if score >= 0.9:
            return ConfidenceLevel.VERY_HIGH
        elif score >= 0.7:
            return ConfidenceLevel.HIGH
        elif score >= 0.5:
            return ConfidenceLevel.MEDIUM
        elif score >= 0.3:
            return ConfidenceLevel.LOW
        else:
            return ConfidenceLevel.VERY_LOW

    def get_average_confidence(self, limit: Optional[int] = None) -> float:
        """Get average confidence from recent history.

        Args:
            limit: Number of most recent scores to average; ``None`` or 0
                averages the whole history.

        Returns:
            Mean overall score, or 0.0 when the history is empty
        """
        if not self.history:
            return 0.0

        recent = self.history[-limit:] if limit else self.history
        return sum(score.overall_score for score in recent) / len(recent)

    def get_confidence_trend(self, window: int = 10) -> str:
        """Get confidence trend (improving/declining/stable).

        Compares the mean score of the older half of the last ``window``
        entries with the mean of the newer half.

        Returns:
            "improving" / "declining" when the means differ by more than 0.1,
            "stable" otherwise, or "insufficient_data" when fewer than
            ``window`` scores exist or the window cannot be split in two.
        """
        # A window below 2 leaves one half empty and would divide by zero.
        if window < 2 or len(self.history) < window:
            return "insufficient_data"

        recent = self.history[-window:]
        first_half = recent[:window//2]
        second_half = recent[window//2:]

        first_avg = sum(s.overall_score for s in first_half) / len(first_half)
        second_avg = sum(s.overall_score for s in second_half) / len(second_half)

        diff = second_avg - first_avg

        if diff > 0.1:
            return "improving"
        elif diff < -0.1:
            return "declining"
        else:
            return "stable"

    def export_stats(self) -> Dict[str, Any]:
        """Export confidence statistics computed over the recorded history."""
        if not self.history:
            return {
                "total_scores": 0,
                "average_confidence": 0.0,
                "trend": "no_data"
            }

        return {
            "total_scores": len(self.history),
            "average_confidence": self.get_average_confidence(),
            "recent_average": self.get_average_confidence(limit=10),
            "trend": self.get_confidence_trend(),
            "level_distribution": self._get_level_distribution(),
            "factor_averages": self._get_factor_averages()
        }

    def _get_level_distribution(self) -> Dict[str, int]:
        """Get distribution of confidence levels (every level present, zero-filled)"""
        distribution = {level.value: 0 for level in ConfidenceLevel}
        for score in self.history:
            distribution[score.level.value] += 1
        return distribution

    def _get_factor_averages(self) -> Dict[str, float]:
        """Get average scores for each factor across the recorded history"""
        if not self.history:
            return {}

        factor_sums = {}
        for score in self.history:
            for factor, value in score.factors.items():
                factor_sums[factor] = factor_sums.get(factor, 0) + value

        return {
            factor: total / len(self.history)
            for factor, total in factor_sums.items()
        }
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
# Global scorer instance
|
|
464
|
+
_scorer_instance: Optional[ConfidenceScorer] = None
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
def get_confidence_scorer(
    weights: Optional[Dict[str, float]] = None,
    min_response_length: int = 50
) -> ConfidenceScorer:
    """Return the shared module-level scorer, building it on first use.

    The arguments are only consulted when the shared instance does not exist
    yet; once it has been created, later calls return that same instance and
    the arguments passed to them have no effect.
    """
    global _scorer_instance

    # Fast path: the shared scorer has already been built.
    if _scorer_instance is not None:
        return _scorer_instance

    _scorer_instance = ConfidenceScorer(
        weights=weights,
        min_response_length=min_response_length,
    )
    return _scorer_instance
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def reset_scorer() -> None:
    """Drop the shared scorer so the next lookup builds a fresh one (for testing)."""
    global _scorer_instance
    _scorer_instance = None
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
__version__ = "1.0.0"
|
|
490
|
+
__all__ = [
|
|
491
|
+
"ConfidenceScorer",
|
|
492
|
+
"ConfidenceScore",
|
|
493
|
+
"ConfidenceLevel",
|
|
494
|
+
"get_confidence_scorer",
|
|
495
|
+
"reset_scorer",
|
|
496
|
+
]
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Any, Optional
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import yaml
|
|
10
|
+
except ImportError:
|
|
11
|
+
yaml = None
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
DEFAULT_CONFIG_PATHS = [
|
|
16
|
+
Path.home() / ".devsquad.yaml",
|
|
17
|
+
Path.home() / ".devsquad" / "config.yaml",
|
|
18
|
+
Path(".devsquad.yaml"),
|
|
19
|
+
Path("devsquad.yaml"),
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class DevSquadConfig:
    """Flat container for DevSquad settings together with their built-in defaults."""

    # LLM backend selection and connection
    backend: str = "mock"
    base_url: Optional[str] = None
    model: Optional[str] = None
    timeout: int = 120

    # Task and role limits, validation
    max_roles: int = 10
    max_task_length: int = 10000
    min_task_length: int = 5
    strict_validation: bool = False
    output_format: str = "structured"

    # Checkpoints, workflows and cache
    checkpoint_enabled: bool = True
    checkpoint_dir: str = "./checkpoints"
    workflow_enabled: bool = False
    workflow_dir: str = "./workflows"
    cache_enabled: bool = True
    cache_dir: str = "./data/llm_cache"

    # Logging
    log_level: str = "WARNING"

    def to_dict(self) -> Dict[str, Any]:
        """Return every setting as a plain dict, in declaration order."""
        setting_names = (
            'backend',
            'base_url',
            'model',
            'timeout',
            'max_roles',
            'max_task_length',
            'min_task_length',
            'strict_validation',
            'output_format',
            'checkpoint_enabled',
            'checkpoint_dir',
            'workflow_enabled',
            'workflow_dir',
            'cache_enabled',
            'cache_dir',
            'log_level',
        )
        return {name: getattr(self, name) for name in setting_names}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ConfigManager:
    """
    Configuration manager for DevSquad.

    Loads config from (in order of priority):
    1. Environment variables (highest)
    2. ~/.devsquad.yaml or ./devsquad.yaml
    3. Built-in defaults (lowest)

    Loading and saving are best-effort: problems are logged as warnings and
    the affected values keep their previous setting.
    """

    # Environment variable -> config attribute. A plain string stores the raw
    # value; a (attribute, converter) tuple runs the converter on it first.
    ENV_MAP = {
        "DEVSQUAD_LLM_BACKEND": "backend",
        "DEVSQUAD_BACKEND": "backend",
        "DEVSQUAD_BASE_URL": "base_url",
        "DEVSQUAD_MODEL": "model",
        "DEVSQUAD_TIMEOUT": ("timeout", int),
        "DEVSQUAD_MAX_ROLES": ("max_roles", int),
        "DEVSQUAD_OUTPUT_FORMAT": "output_format",
        "DEVSQUAD_STRICT": ("strict_validation", lambda v: v.lower() in ("true", "1", "yes")),
        "DEVSQUAD_STRICT_VALIDATION": ("strict_validation", lambda v: v.lower() in ("true", "1", "yes")),
        "DEVSQUAD_LOG_LEVEL": "log_level",
        "DEVSQUAD_CHECKPOINT_DIR": "checkpoint_dir",
        "DEVSQUAD_CACHE_DIR": "cache_dir",
        "DEVSQUAD_CHECKPOINT_ENABLED": ("checkpoint_enabled", lambda v: v.lower() in ("true", "1", "yes")),
        "DEVSQUAD_CACHE_ENABLED": ("cache_enabled", lambda v: v.lower() in ("true", "1", "yes")),
    }

    def __init__(self, config_path: Optional[str] = None):
        """Create the manager and load the configuration immediately.

        Args:
            config_path: Explicit config file to use. When omitted, the first
                existing entry of ``DEFAULT_CONFIG_PATHS`` is used, if any.
        """
        self.config = DevSquadConfig()
        self._config_path = None

        if config_path:
            self._config_path = Path(config_path)
        else:
            for path in DEFAULT_CONFIG_PATHS:
                if path.exists():
                    self._config_path = path
                    break

        self._load()

    def _load(self):
        """Apply the config file (when it exists), then the environment on top."""
        if self._config_path and self._config_path.exists():
            self._load_from_file(self._config_path)

        self._load_from_env()

    def _is_config_field(self, name: str) -> bool:
        """Return True when *name* is a declared field of the config object.

        ``hasattr`` alone is also true for methods and dunder attributes, so a
        key such as ``to_dict`` would let a caller overwrite the method.
        """
        # Local import: the module-level import only brings in dataclass/field.
        from dataclasses import fields, is_dataclass

        if not is_dataclass(self.config):
            # Non-dataclass config object: fall back to the attribute check.
            return hasattr(self.config, name)
        return any(f.name == name for f in fields(self.config))

    def _load_from_file(self, path: Path):
        """Merge settings from the YAML file at *path* into ``self.config``.

        Settings are read from the top-level ``devsquad`` mapping when present,
        otherwise from the top level itself. Keys may use ``-`` or ``_``; keys
        that are not config fields are ignored. Values are coerced to the type
        of the current value for bool, int and float fields.
        """
        try:
            if yaml is None:
                logger.warning("pyyaml not installed, skipping config file %s", path)
                return
            with open(path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f) or {}

            if not isinstance(data, dict):
                logger.warning(
                    "Config file %s must contain a mapping, got %s",
                    path, type(data).__name__
                )
                return

            devsquad_data = data.get('devsquad', data)
            if devsquad_data is None:
                # `devsquad:` with nothing under it - nothing to apply.
                devsquad_data = {}
            if not isinstance(devsquad_data, dict):
                logger.warning(
                    "'devsquad' section in %s must be a mapping, got %s",
                    path, type(devsquad_data).__name__
                )
                return

            for key, value in devsquad_data.items():
                # str(): YAML keys are not guaranteed to be strings.
                key_mapped = str(key).replace('-', '_')
                if not self._is_config_field(key_mapped):
                    continue

                current = getattr(self.config, key_mapped)
                # bool is checked before int because bool is a subclass of int.
                if isinstance(current, bool) and not isinstance(value, bool):
                    value = str(value).lower() in ('true', '1', 'yes')
                elif isinstance(current, int) and not isinstance(value, int):
                    try:
                        value = int(value)
                    except (ValueError, TypeError):
                        logger.warning("Invalid int value for %s: %s", key, value)
                        continue
                elif isinstance(current, float) and not isinstance(value, (int, float)):
                    try:
                        value = float(value)
                    except (ValueError, TypeError):
                        logger.warning("Invalid float value for %s: %s", key, value)
                        continue
                setattr(self.config, key_mapped, value)

            logger.info("Config loaded from %s", path)
        except Exception as e:
            # Best-effort: an unreadable or malformed file must not prevent
            # startup; settings applied before the failure are kept.
            logger.warning("Failed to load config from %s: %s", path, e)

    def _load_from_env(self):
        """Override settings from the environment variables listed in ``ENV_MAP``."""
        for env_key, mapping in self.ENV_MAP.items():
            env_value = os.environ.get(env_key)
            if env_value is None:
                continue

            if isinstance(mapping, tuple):
                attr_name, converter = mapping
                try:
                    setattr(self.config, attr_name, converter(env_value))
                except (ValueError, TypeError):
                    logger.warning("Invalid env value for %s: %s", env_key, env_value)
            else:
                setattr(self.config, mapping, env_value)

    def get(self, key: str, default: Any = None) -> Any:
        """Return the config attribute *key*, or *default* when it does not exist."""
        return getattr(self.config, key, default)

    def set(self, key: str, value: Any):
        """Set config field *key* to *value*; unknown keys are logged and ignored."""
        if self._is_config_field(key):
            setattr(self.config, key, value)
        else:
            logger.warning("Unknown config key: %s", key)

    def save(self, path: Optional[str] = None):
        """Write the current configuration as YAML under a ``devsquad`` key.

        Args:
            path: Destination file. Defaults to the file the config was loaded
                from, or ``~/.devsquad.yaml`` when there is none.

        Failures (including creating the parent directory) are logged as
        warnings rather than raised.
        """
        save_path = Path(path) if path else (self._config_path or Path.home() / ".devsquad.yaml")

        # Check for pyyaml first so no directories are created for a save that
        # cannot happen.
        if yaml is None:
            logger.warning("pyyaml not installed, cannot save config to %s", save_path)
            return

        try:
            save_path.parent.mkdir(parents=True, exist_ok=True)
            data = {'devsquad': self.config.to_dict()}
            with open(save_path, 'w', encoding='utf-8') as f:
                yaml.dump(data, f, default_flow_style=False, allow_unicode=True)
            logger.info("Config saved to %s", save_path)
        except Exception as e:
            logger.warning("Failed to save config: %s", e)

    @property
    def config_path(self) -> Optional[str]:
        """Path of the config file in use, as a string, or None when there is none."""
        return str(self._config_path) if self._config_path else None

    def to_dict(self) -> Dict[str, Any]:
        """Return the current configuration as a plain dict."""
        return self.config.to_dict()
|