ctrlcode-0.1.0-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- ctrlcode/__init__.py +8 -0
- ctrlcode/agents/__init__.py +29 -0
- ctrlcode/agents/cleanup.py +388 -0
- ctrlcode/agents/communication.py +439 -0
- ctrlcode/agents/observability.py +421 -0
- ctrlcode/agents/react_loop.py +297 -0
- ctrlcode/agents/registry.py +211 -0
- ctrlcode/agents/result_parser.py +242 -0
- ctrlcode/agents/workflow.py +723 -0
- ctrlcode/analysis/__init__.py +28 -0
- ctrlcode/analysis/ast_diff.py +163 -0
- ctrlcode/analysis/bug_detector.py +149 -0
- ctrlcode/analysis/code_graphs.py +329 -0
- ctrlcode/analysis/semantic.py +205 -0
- ctrlcode/analysis/static.py +183 -0
- ctrlcode/analysis/synthesizer.py +281 -0
- ctrlcode/analysis/tests.py +189 -0
- ctrlcode/cleanup/__init__.py +16 -0
- ctrlcode/cleanup/auto_merge.py +350 -0
- ctrlcode/cleanup/doc_gardening.py +388 -0
- ctrlcode/cleanup/pr_automation.py +330 -0
- ctrlcode/cleanup/scheduler.py +356 -0
- ctrlcode/config.py +380 -0
- ctrlcode/embeddings/__init__.py +6 -0
- ctrlcode/embeddings/embedder.py +192 -0
- ctrlcode/embeddings/vector_store.py +213 -0
- ctrlcode/fuzzing/__init__.py +24 -0
- ctrlcode/fuzzing/analyzer.py +280 -0
- ctrlcode/fuzzing/budget.py +112 -0
- ctrlcode/fuzzing/context.py +665 -0
- ctrlcode/fuzzing/context_fuzzer.py +506 -0
- ctrlcode/fuzzing/derived_orchestrator.py +732 -0
- ctrlcode/fuzzing/oracle_adapter.py +135 -0
- ctrlcode/linters/__init__.py +11 -0
- ctrlcode/linters/hand_rolled_utils.py +221 -0
- ctrlcode/linters/yolo_parsing.py +217 -0
- ctrlcode/metrics/__init__.py +6 -0
- ctrlcode/metrics/dashboard.py +283 -0
- ctrlcode/metrics/tech_debt.py +663 -0
- ctrlcode/paths.py +68 -0
- ctrlcode/permissions.py +179 -0
- ctrlcode/providers/__init__.py +15 -0
- ctrlcode/providers/anthropic.py +138 -0
- ctrlcode/providers/base.py +77 -0
- ctrlcode/providers/openai.py +197 -0
- ctrlcode/providers/parallel.py +104 -0
- ctrlcode/server.py +871 -0
- ctrlcode/session/__init__.py +6 -0
- ctrlcode/session/baseline.py +57 -0
- ctrlcode/session/manager.py +967 -0
- ctrlcode/skills/__init__.py +10 -0
- ctrlcode/skills/builtin/commit.toml +29 -0
- ctrlcode/skills/builtin/docs.toml +25 -0
- ctrlcode/skills/builtin/refactor.toml +33 -0
- ctrlcode/skills/builtin/review.toml +28 -0
- ctrlcode/skills/builtin/test.toml +28 -0
- ctrlcode/skills/loader.py +111 -0
- ctrlcode/skills/registry.py +139 -0
- ctrlcode/storage/__init__.py +19 -0
- ctrlcode/storage/history_db.py +708 -0
- ctrlcode/tools/__init__.py +220 -0
- ctrlcode/tools/bash.py +112 -0
- ctrlcode/tools/browser.py +352 -0
- ctrlcode/tools/executor.py +153 -0
- ctrlcode/tools/explore.py +486 -0
- ctrlcode/tools/mcp.py +108 -0
- ctrlcode/tools/observability.py +561 -0
- ctrlcode/tools/registry.py +193 -0
- ctrlcode/tools/todo.py +291 -0
- ctrlcode/tools/update.py +266 -0
- ctrlcode/tools/webfetch.py +147 -0
- ctrlcode-0.1.0.dist-info/METADATA +93 -0
- ctrlcode-0.1.0.dist-info/RECORD +75 -0
- ctrlcode-0.1.0.dist-info/WHEEL +4 -0
- ctrlcode-0.1.0.dist-info/entry_points.txt +3 -0
ctrlcode/analysis/semantic.py
@@ -0,0 +1,205 @@
"""Semantic analysis using LLM to compare code behavior."""

from dataclasses import dataclass

from ..providers.base import Provider


@dataclass
class SemanticDiff:
    """Result of semantic comparison."""

    variant_id: str
    agreement_score: float  # 0.0 to 1.0
    behavioral_differences: list[str]
    correctness_assessment: str  # "correct", "incorrect", "uncertain"
    edge_cases: list[str]
    improvements: list[str]
    concerns: list[str]

    def __post_init__(self):
        if not self.behavioral_differences:
            self.behavioral_differences = []
        if not self.edge_cases:
            self.edge_cases = []
        if not self.improvements:
            self.improvements = []
        if not self.concerns:
            self.concerns = []


class SemanticAnalyzer:
    """Analyzes code semantics using LLM."""

    def __init__(self, provider: Provider):
        """
        Initialize semantic analyzer.

        Args:
            provider: LLM provider for analysis
        """
        self.provider = provider

    async def compare(
        self,
        baseline: str,
        variant: str,
        variant_id: str,
        user_request: str,
    ) -> SemanticDiff:
        """
        Compare semantic behavior of baseline and variant.

        Args:
            baseline: Baseline code
            variant: Variant code
            variant_id: Identifier for variant
            user_request: Original user request

        Returns:
            SemanticDiff with behavioral comparison
        """
        prompt = self._build_comparison_prompt(
            baseline=baseline,
            variant=variant,
            user_request=user_request
        )

        # Get analysis from LLM
        messages = [{"role": "user", "content": prompt}]
        response = await self.provider.generate(messages, temperature=0.3)

        # Parse response
        analysis_text = response.get("text", "")
        return self._parse_analysis(analysis_text, variant_id)

    def _build_comparison_prompt(
        self,
        baseline: str,
        variant: str,
        user_request: str
    ) -> str:
        """Build prompt for semantic comparison."""
        return f"""Compare these two code implementations for the request: "{user_request}"

Baseline:
```python
{baseline}
```

Variant:
```python
{variant}
```

Analyze the following:

1. **Behavioral Equivalence**: Do they produce the same behavior? Score 0-10.

2. **Differences**: What are the semantic differences?
   - Different approach?
   - Different edge case handling?
   - Different error handling?

3. **Correctness**: Which implementation is more correct for the request?
   - State "correct", "incorrect", or "uncertain"
   - Explain why

4. **Edge Cases**: What edge cases does each handle (or not handle)?

5. **Improvements**: What does the variant do better than baseline?

6. **Concerns**: Any issues, bugs, or problems in the variant?

Format your response as:
SCORE: <0-10>
DIFFERENCES:
- <difference 1>
- <difference 2>
CORRECTNESS: <correct|incorrect|uncertain>
REASON: <explanation>
EDGE_CASES:
- <edge case 1>
- <edge case 2>
IMPROVEMENTS:
- <improvement 1>
- <improvement 2>
CONCERNS:
- <concern 1>
- <concern 2>
"""

    def _parse_analysis(self, text: str, variant_id: str) -> SemanticDiff:
        """
        Parse LLM analysis response.

        Args:
            text: Analysis text from LLM
            variant_id: Variant identifier

        Returns:
            SemanticDiff with parsed results
        """
        lines = text.strip().split("\n")

        # Extract sections
        score = 5.0  # Default
        differences = []
        correctness = "uncertain"
        edge_cases = []
        improvements = []
        concerns = []

        current_section = None

        for line in lines:
            line = line.strip()

            if line.startswith("SCORE:"):
                try:
                    score_text = line.split(":", 1)[1].strip()
                    score = float(score_text.split()[0]) / 10.0  # Normalize to 0-1
                except (ValueError, IndexError):
                    pass

            elif line.startswith("DIFFERENCES:"):
                current_section = "differences"

            elif line.startswith("CORRECTNESS:"):
                parts = line.split(":", 1)
                if len(parts) > 1:
                    correctness = parts[1].strip().lower()
                current_section = None

            elif line.startswith("EDGE_CASES:"):
                current_section = "edge_cases"

            elif line.startswith("IMPROVEMENTS:"):
                current_section = "improvements"

            elif line.startswith("CONCERNS:"):
                current_section = "concerns"

            elif line.startswith("REASON:"):
                current_section = None

            elif line.startswith("-") and current_section:
                item = line.lstrip("- ").strip()
                if current_section == "differences":
                    differences.append(item)
                elif current_section == "edge_cases":
                    edge_cases.append(item)
                elif current_section == "improvements":
                    improvements.append(item)
                elif current_section == "concerns":
                    concerns.append(item)

        return SemanticDiff(
            variant_id=variant_id,
            agreement_score=score,
            behavioral_differences=differences,
            correctness_assessment=correctness,
            edge_cases=edge_cases,
            improvements=improvements,
            concerns=concerns,
        )
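The compare() flow above relies only on the provider exposing an async generate() that returns a dict with a "text" key, so it can be exercised without a real LLM. The following is a minimal sketch, not part of the package, assuming only that duck-typed contract and a canned response in the SCORE/DIFFERENCES format the parser expects:

import asyncio

from ctrlcode.analysis.semantic import SemanticAnalyzer


class StubProvider:
    """Hypothetical stand-in for a real provider; implements only generate()."""

    async def generate(self, messages, temperature=0.3):
        # Canned analysis in the format _parse_analysis() looks for.
        return {
            "text": (
                "SCORE: 8\n"
                "DIFFERENCES:\n- variant adds input validation\n"
                "CORRECTNESS: correct\n"
                "REASON: both satisfy the request\n"
                "CONCERNS:\n- none noted\n"
            )
        }


async def main():
    analyzer = SemanticAnalyzer(provider=StubProvider())
    diff = await analyzer.compare(
        baseline="def add(a, b):\n    return a + b",
        variant="def add(a, b):\n    return int(a) + int(b)",
        variant_id="variant-1",
        user_request="write an add function",
    )
    # SCORE: 8 is normalized to 0.8; the correctness label is lower-cased.
    print(diff.agreement_score, diff.correctness_assessment)


asyncio.run(main())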
ctrlcode/analysis/static.py
@@ -0,0 +1,183 @@
"""Static analysis for code quality metrics."""

import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path


@dataclass
class StaticAnalysisResult:
    """Result of static analysis."""

    variant_id: str
    type_errors: int = 0
    lint_issues: int = 0
    complexity_score: float = 0.0
    maintainability_index: float = 0.0
    quality_score: float = 0.0  # Overall 0-1 score


class StaticAnalyzer:
    """Performs static analysis on code."""

    def analyze(self, code: str, variant_id: str) -> StaticAnalysisResult:
        """
        Run static analysis on code.

        Args:
            code: Code to analyze
            variant_id: Identifier for variant

        Returns:
            StaticAnalysisResult with quality metrics
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tmppath = Path(tmpdir)
            code_file = tmppath / "code.py"
            code_file.write_text(code)

            # Run analyses
            type_errors = self._run_mypy(code_file)
            lint_issues = self._run_ruff(code_file)
            complexity = self._calculate_complexity(code)

            # Calculate overall quality score
            quality = self._calculate_quality(
                type_errors=type_errors,
                lint_issues=lint_issues,
                complexity=complexity,
            )

            return StaticAnalysisResult(
                variant_id=variant_id,
                type_errors=type_errors,
                lint_issues=lint_issues,
                complexity_score=complexity,
                quality_score=quality,
            )

    def _run_mypy(self, file_path: Path) -> int:
        """
        Run mypy type checker.

        Args:
            file_path: Path to code file

        Returns:
            Number of type errors
        """
        try:
            result = subprocess.run(
                ["mypy", str(file_path), "--ignore-missing-imports"],
                capture_output=True,
                text=True,
                timeout=10,
            )

            # Count error lines
            errors = 0
            for line in result.stdout.split("\n"):
                if "error:" in line.lower():
                    errors += 1

            return errors

        except (subprocess.TimeoutExpired, FileNotFoundError):
            return 0  # If mypy unavailable, assume no errors

    def _run_ruff(self, file_path: Path) -> int:
        """
        Run ruff linter.

        Args:
            file_path: Path to code file

        Returns:
            Number of lint issues
        """
        try:
            result = subprocess.run(
                ["ruff", "check", str(file_path)],
                capture_output=True,
                text=True,
                timeout=10,
            )

            # Count issue lines
            issues = 0
            for line in result.stdout.split("\n"):
                if line.strip() and not line.startswith("Found"):
                    issues += 1

            return issues

        except (subprocess.TimeoutExpired, FileNotFoundError):
            return 0  # If ruff unavailable, assume no issues

    def _calculate_complexity(self, code: str) -> float:
        """
        Calculate code complexity.

        Uses simple heuristics:
        - Lines of code
        - Nesting depth
        - Number of branches

        Args:
            code: Code to analyze

        Returns:
            Complexity score (lower is better)
        """
        lines = [line for line in code.split("\n") if line.strip() and not line.strip().startswith("#")]
        loc = len(lines)

        # Calculate max nesting depth
        max_depth = 0

        for line in lines:
            # Count leading spaces
            stripped = line.lstrip()
            if stripped:
                indent = len(line) - len(stripped)
                depth = indent // 4  # Assuming 4-space indents
                max_depth = max(max_depth, depth)

        # Count branches (if, for, while, try)
        branches = sum(
            1 for line in lines
            if any(line.strip().startswith(kw) for kw in ["if ", "elif ", "for ", "while ", "try:", "except"])
        )

        # Complexity = weighted sum
        complexity = (loc * 0.1) + (max_depth * 2) + (branches * 1.5)

        return complexity

    def _calculate_quality(
        self,
        type_errors: int,
        lint_issues: int,
        complexity: float,
    ) -> float:
        """
        Calculate overall quality score.

        Args:
            type_errors: Number of type errors
            lint_issues: Number of lint issues
            complexity: Complexity score

        Returns:
            Quality score between 0.0 and 1.0 (higher is better)
        """
        # Penalties
        type_penalty = min(type_errors * 0.1, 0.4)  # Max 40% penalty
        lint_penalty = min(lint_issues * 0.05, 0.3)  # Max 30% penalty
        complexity_penalty = min(complexity / 50, 0.3)  # Max 30% penalty

        # Start with perfect score and subtract penalties
        score = 1.0 - (type_penalty + lint_penalty + complexity_penalty)

        return max(0.0, min(1.0, score))
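StaticAnalyzer takes no constructor arguments and degrades gracefully when mypy or ruff are not installed (both counters fall back to 0). A minimal usage sketch, not part of the package:

from ctrlcode.analysis.static import StaticAnalyzer

snippet = """
def mean(values: list[float]) -> float:
    if not values:
        return 0.0
    return sum(values) / len(values)
"""

analyzer = StaticAnalyzer()
result = analyzer.analyze(snippet, variant_id="variant-1")

# quality_score starts at 1.0 and loses up to 0.4 for type errors,
# 0.3 for lint issues, and 0.3 for complexity.
print(result.type_errors, result.lint_issues, f"{result.quality_score:.2f}")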
ctrlcode/analysis/synthesizer.py
@@ -0,0 +1,281 @@
"""Feedback synthesis from differential analysis."""

from dataclasses import dataclass

from .ast_diff import ASTDiff
from .semantic import SemanticDiff
from .tests import TestResult
from .static import StaticAnalysisResult


@dataclass
class AnalysisResult:
    """Complete analysis from all methods."""

    structural_diffs: list[ASTDiff]
    semantic_diffs: list[SemanticDiff]
    test_results: list[TestResult]
    static_results: list[StaticAnalysisResult]
    elapsed_time: float


@dataclass
class Feedback:
    """Synthesized feedback for improvement."""

    improvement_prompt: str
    quality_score: float
    consensus: list[str]
    discrepancies: list[str]
    changes_made: list[str]


class FeedbackSynthesizer:
    """Synthesizes analysis results into actionable feedback."""

    def __init__(self, weights: dict[str, float] | None = None):
        """
        Initialize feedback synthesizer.

        Args:
            weights: Quality scoring weights for each analysis type
        """
        self.weights = weights or {
            "tests": 0.4,
            "static": 0.3,
            "semantic": 0.3,
        }

    def synthesize(
        self,
        analysis: AnalysisResult,
        iteration: int,
        user_request: str,
    ) -> Feedback:
        """
        Generate improvement feedback from analysis.

        Args:
            analysis: Complete analysis results
            iteration: Current iteration number
            user_request: Original user request

        Returns:
            Feedback with improvement prompt and scores
        """
        # Extract consensus patterns
        consensus = self._find_consensus(analysis)

        # Extract discrepancies
        discrepancies = self._find_discrepancies(analysis)

        # Calculate quality score
        quality_score = self._calculate_quality(analysis)

        # Build improvement prompt
        improvement_prompt = self._build_improvement_prompt(
            consensus=consensus,
            discrepancies=discrepancies,
            analysis=analysis,
            iteration=iteration,
            user_request=user_request,
        )

        return Feedback(
            improvement_prompt=improvement_prompt,
            quality_score=quality_score,
            consensus=consensus,
            discrepancies=discrepancies,
            changes_made=[],  # Will be populated after improvement
        )

    def _find_consensus(self, analysis: AnalysisResult) -> list[str]:
        """
        Find patterns agreed upon by all variants.

        Args:
            analysis: Analysis results

        Returns:
            List of consensus patterns
        """
        consensus = []

        # Check semantic agreement
        if analysis.semantic_diffs:
            high_agreement = [
                d for d in analysis.semantic_diffs
                if d.agreement_score > 0.8
            ]
            if len(high_agreement) == len(analysis.semantic_diffs):
                consensus.append("All variants show high semantic agreement")

        # Check structural similarity
        if analysis.structural_diffs:
            similar = [
                d for d in analysis.structural_diffs
                if d.structural_similarity > 0.8
            ]
            if len(similar) == len(analysis.structural_diffs):
                consensus.append("All variants have similar structure")

        # Check test passing
        if analysis.test_results:
            all_passed = all(t.passed for t in analysis.test_results)
            if all_passed:
                consensus.append("All variants pass tests")

        # Check common improvements
        improvements: dict[str, int] = {}
        for sem_diff in analysis.semantic_diffs:
            for imp in sem_diff.improvements:
                improvements[imp] = improvements.get(imp, 0) + 1

        # Improvements mentioned by multiple variants
        common_improvements = [
            imp for imp, count in improvements.items()
            if count >= len(analysis.semantic_diffs) // 2
        ]
        consensus.extend(common_improvements)

        return consensus

    def _find_discrepancies(self, analysis: AnalysisResult) -> list[str]:
        """
        Find significant differences between variants.

        Args:
            analysis: Analysis results

        Returns:
            List of discrepancy descriptions
        """
        discrepancies = []

        # Test failures
        if analysis.test_results:
            failed = [t for t in analysis.test_results if not t.passed]
            if failed:
                discrepancies.append(
                    f"{len(failed)}/{len(analysis.test_results)} variants failed tests"
                )

        # Semantic disagreements
        if analysis.semantic_diffs:
            low_agreement = [
                d for d in analysis.semantic_diffs
                if d.agreement_score < 0.5
            ]
            if low_agreement:
                discrepancies.append(
                    f"{len(low_agreement)} variants have low semantic agreement"
                )

        # Collect concerns
        for sem_diff in analysis.semantic_diffs:
            for concern in sem_diff.concerns:
                if concern not in discrepancies:
                    discrepancies.append(f"Concern: {concern}")

        # Structural differences
        if analysis.structural_diffs:
            syntax_errors = [d for d in analysis.structural_diffs if d.has_syntax_error]
            if syntax_errors:
                discrepancies.append(f"{len(syntax_errors)} variants have syntax errors")

        # Static analysis issues
        if analysis.static_results:
            high_errors = [s for s in analysis.static_results if s.type_errors > 3]
            if high_errors:
                discrepancies.append(
                    f"{len(high_errors)} variants have multiple type errors"
                )

            high_complexity = [s for s in analysis.static_results if s.complexity_score > 20]
            if high_complexity:
                discrepancies.append(
                    f"{len(high_complexity)} variants have high complexity"
                )

        return discrepancies

    def _calculate_quality(self, analysis: AnalysisResult) -> float:
        """
        Calculate aggregate quality score.

        Args:
            analysis: Analysis results

        Returns:
            Quality score between 0.0 and 1.0
        """
        score = 0.0

        # Test passing rate
        if analysis.test_results:
            passing_rate = sum(
                1 for t in analysis.test_results if t.passed
            ) / len(analysis.test_results)
            score += passing_rate * self.weights["tests"]

        # Static analysis score
        if analysis.static_results:
            avg_static = sum(
                s.quality_score for s in analysis.static_results
            ) / len(analysis.static_results)
            score += avg_static * self.weights["static"]

        # Semantic coherence
        if analysis.semantic_diffs:
            avg_semantic = sum(
                d.agreement_score for d in analysis.semantic_diffs
            ) / len(analysis.semantic_diffs)
            score += avg_semantic * self.weights["semantic"]

        return min(1.0, max(0.0, score))

    def _build_improvement_prompt(
        self,
        consensus: list[str],
        discrepancies: list[str],
        analysis: AnalysisResult,
        iteration: int,
        user_request: str,
    ) -> str:
        """Build prompt for generating improved version."""
        prompt = f"""Based on fuzzing iteration {iteration}, improve the code for: "{user_request}"

**Consensus (keep these patterns):**
{chr(10).join(f"- {c}" for c in consensus) if consensus else "- No strong consensus"}

**Issues to resolve:**
{chr(10).join(f"- {d}" for d in discrepancies) if discrepancies else "- No major issues"}

**Quality metrics:**
"""

        # Add test results
        if analysis.test_results:
            pass_rate = sum(1 for t in analysis.test_results if t.passed) / len(analysis.test_results)
            prompt += f"\n- Test pass rate: {pass_rate:.1%}"

        # Add static analysis
        if analysis.static_results:
            avg_quality = sum(s.quality_score for s in analysis.static_results) / len(analysis.static_results)
            prompt += f"\n- Static analysis quality: {avg_quality:.1%}"

        # Add complexity
        if analysis.static_results:
            avg_complexity = sum(s.complexity_score for s in analysis.static_results) / len(analysis.static_results)
            prompt += f"\n- Average complexity: {avg_complexity:.1f}"

        prompt += "\n\n**Task:**\nGenerate an improved version that:\n"
        prompt += "1. Maintains consensus elements\n"
        prompt += "2. Resolves identified issues\n"
        prompt += "3. Improves quality metrics\n"
        prompt += "4. Handles edge cases properly\n"
        prompt += "\n**IMPORTANT:** Respond with ONLY the improved output directly. "
        prompt += "Do NOT include explanations, meta-commentary, or descriptions of changes. "
        prompt += "Just provide the final improved result as you would respond to the original request.\n"

        return prompt
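Because synthesize() tolerates an empty list for any analysis type, the synthesizer can be exercised with only the dataclasses shown in this diff. A minimal sketch, not part of the package; structural_diffs and test_results are left empty because the ASTDiff and TestResult definitions are not shown here:

from ctrlcode.analysis.semantic import SemanticDiff
from ctrlcode.analysis.static import StaticAnalysisResult
from ctrlcode.analysis.synthesizer import AnalysisResult, FeedbackSynthesizer

analysis = AnalysisResult(
    structural_diffs=[],  # ASTDiff instances omitted in this sketch
    semantic_diffs=[
        SemanticDiff(
            variant_id="variant-1",
            agreement_score=0.9,
            behavioral_differences=[],
            correctness_assessment="correct",
            edge_cases=[],
            improvements=["clearer error handling"],
            concerns=[],
        )
    ],
    test_results=[],  # TestResult instances omitted in this sketch
    static_results=[StaticAnalysisResult(variant_id="variant-1", quality_score=0.85)],
    elapsed_time=1.2,
)

feedback = FeedbackSynthesizer().synthesize(
    analysis, iteration=1, user_request="write an add function"
)

# Empty test_results means the 0.4 "tests" weight contributes nothing,
# so the score here is 0.85 * 0.3 + 0.9 * 0.3.
print(f"{feedback.quality_score:.2f}")
print(feedback.improvement_prompt[:120])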