devsquad 3.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- devsquad-3.6.0.dist-info/METADATA +944 -0
- devsquad-3.6.0.dist-info/RECORD +95 -0
- devsquad-3.6.0.dist-info/WHEEL +5 -0
- devsquad-3.6.0.dist-info/entry_points.txt +2 -0
- devsquad-3.6.0.dist-info/licenses/LICENSE +21 -0
- devsquad-3.6.0.dist-info/top_level.txt +2 -0
- scripts/__init__.py +0 -0
- scripts/ai_semantic_matcher.py +512 -0
- scripts/alert_manager.py +505 -0
- scripts/api/__init__.py +43 -0
- scripts/api/models.py +386 -0
- scripts/api/routes/__init__.py +20 -0
- scripts/api/routes/dispatch.py +348 -0
- scripts/api/routes/lifecycle.py +330 -0
- scripts/api/routes/metrics_gates.py +347 -0
- scripts/api_server.py +318 -0
- scripts/auth.py +451 -0
- scripts/cli/__init__.py +1 -0
- scripts/cli/cli_visual.py +642 -0
- scripts/cli.py +1094 -0
- scripts/collaboration/__init__.py +212 -0
- scripts/collaboration/_version.py +1 -0
- scripts/collaboration/agent_briefing.py +656 -0
- scripts/collaboration/ai_semantic_matcher.py +260 -0
- scripts/collaboration/anchor_checker.py +281 -0
- scripts/collaboration/anti_rationalization.py +470 -0
- scripts/collaboration/async_integration_example.py +255 -0
- scripts/collaboration/batch_scheduler.py +149 -0
- scripts/collaboration/checkpoint_manager.py +561 -0
- scripts/collaboration/ci_feedback_adapter.py +351 -0
- scripts/collaboration/code_map_generator.py +247 -0
- scripts/collaboration/concern_pack_loader.py +352 -0
- scripts/collaboration/confidence_score.py +496 -0
- scripts/collaboration/config_loader.py +188 -0
- scripts/collaboration/consensus.py +244 -0
- scripts/collaboration/context_compressor.py +533 -0
- scripts/collaboration/coordinator.py +668 -0
- scripts/collaboration/dispatcher.py +1636 -0
- scripts/collaboration/dual_layer_context.py +128 -0
- scripts/collaboration/enhanced_worker.py +539 -0
- scripts/collaboration/feature_usage_tracker.py +206 -0
- scripts/collaboration/five_axis_consensus.py +334 -0
- scripts/collaboration/input_validator.py +401 -0
- scripts/collaboration/integration_example.py +287 -0
- scripts/collaboration/intent_workflow_mapper.py +350 -0
- scripts/collaboration/language_parsers.py +269 -0
- scripts/collaboration/lifecycle_protocol.py +1446 -0
- scripts/collaboration/llm_backend.py +453 -0
- scripts/collaboration/llm_cache.py +448 -0
- scripts/collaboration/llm_cache_async.py +347 -0
- scripts/collaboration/llm_retry.py +387 -0
- scripts/collaboration/llm_retry_async.py +389 -0
- scripts/collaboration/mce_adapter.py +597 -0
- scripts/collaboration/memory_bridge.py +1607 -0
- scripts/collaboration/models.py +537 -0
- scripts/collaboration/null_providers.py +297 -0
- scripts/collaboration/operation_classifier.py +289 -0
- scripts/collaboration/output_slicer.py +225 -0
- scripts/collaboration/performance_monitor.py +462 -0
- scripts/collaboration/permission_guard.py +865 -0
- scripts/collaboration/prompt_assembler.py +756 -0
- scripts/collaboration/prompt_variant_generator.py +483 -0
- scripts/collaboration/protocols.py +267 -0
- scripts/collaboration/report_formatter.py +352 -0
- scripts/collaboration/retrospective.py +279 -0
- scripts/collaboration/role_matcher.py +92 -0
- scripts/collaboration/role_template_market.py +352 -0
- scripts/collaboration/rule_collector.py +678 -0
- scripts/collaboration/scratchpad.py +346 -0
- scripts/collaboration/skill_registry.py +151 -0
- scripts/collaboration/skillifier.py +878 -0
- scripts/collaboration/standardized_role_template.py +317 -0
- scripts/collaboration/task_completion_checker.py +237 -0
- scripts/collaboration/test_quality_guard.py +695 -0
- scripts/collaboration/unified_gate_engine.py +598 -0
- scripts/collaboration/usage_tracker.py +309 -0
- scripts/collaboration/user_friendly_error.py +176 -0
- scripts/collaboration/verification_gate.py +312 -0
- scripts/collaboration/warmup_manager.py +635 -0
- scripts/collaboration/worker.py +513 -0
- scripts/collaboration/workflow_engine.py +684 -0
- scripts/dashboard.py +1088 -0
- scripts/generate_benchmark_report.py +786 -0
- scripts/history_manager.py +604 -0
- scripts/mcp_server.py +289 -0
- skills/__init__.py +32 -0
- skills/dispatch/handler.py +52 -0
- skills/intent/handler.py +59 -0
- skills/registry.py +67 -0
- skills/retrospective/__init__.py +0 -0
- skills/retrospective/handler.py +125 -0
- skills/review/handler.py +356 -0
- skills/security/handler.py +454 -0
- skills/test/__init__.py +0 -0
- skills/test/handler.py +78 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
VerificationGate - Hardened verification requirements for TaskCompletionChecker
|
|
5
|
+
|
|
6
|
+
Enforces Agent Skills' principle: "Seems right" is NEVER sufficient.
|
|
7
|
+
Every completion claim must have supporting evidence.
|
|
8
|
+
|
|
9
|
+
Integration point: Called by TaskCompletionChecker.check_dispatch_result()
|
|
10
|
+
to validate Worker output quality before accepting completion claims.
|
|
11
|
+
|
|
12
|
+
Spec reference: SPEC_V35_Agent_Skills_Quality_Framework.md Section 6.2
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class RedFlag:
    """
    Warning signal describing one way a Worker's output can look suspect.

    Attributes:
        id: Identifier of the flag.
        severity: Severity label; one of "critical", "warning" or "info".
        description: Human-readable explanation of the problem.
        detection: Predicate invoked with the object under evaluation; a
            truthy return value means the flag is triggered.
    """

    # Identity and classification.
    id: str
    severity: str  # "critical" / "warning" / "info"

    # Human-readable text and the predicate that decides whether it applies.
    description: str
    detection: Callable[[Any], bool]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class EvidenceItem:
    """
    Description of one piece of proof a Worker should attach to its result.

    Attributes:
        key: Name under which the evidence is expected to be stored.
        required: True when the evidence is demanded regardless of role.
        required_for: Optional list of role ids for which the evidence is
            demanded; None when there is no role-specific requirement.
        description: Human-readable explanation of what the evidence is.
        format_hint: Example of what acceptable evidence looks like.
    """

    key: str

    # Who has to provide it.
    required: bool = False
    required_for: Optional[List[str]] = None

    # Guidance shown to humans / Workers.
    description: str = ""
    format_hint: str = ""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class CompletionContext:
    """
    Snapshot of the facts about one Worker result that a gate evaluates.

    Only ``role_id`` is mandatory; every other field has a neutral default
    (False, 0, 0.0 or an empty dict).
    """

    # Role that produced the result.
    role_id: str

    # What kind of change was made.
    has_code_changes: bool = False
    has_test_changes: bool = False
    is_bug_fix: bool = False
    has_repro_test: bool = False

    # Test execution facts.
    test_run_count: int = 0
    all_passed: bool = False
    tests_skipped: int = 0
    coverage_delta: float = 0.0

    # Size of the output and whether it was sliced.
    output_lines: int = 0
    was_sliced: bool = False

    # Completion claim and the evidence backing it, keyed by evidence name.
    claims_complete: bool = False
    evidence: Dict[str, Any] = field(default_factory=dict)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class GateResult:
    """
    Outcome of evaluating a VerificationGate against a CompletionContext.

    Attributes:
        passed: Whether the gate accepted the completion claim.
        red_flags: Red flags that were triggered (empty by default).
        missing_evidence: Evidence items that were expected but absent
            (empty by default).
        verdict: Verdict label; defaults to "APPROVE".
    """

    passed: bool

    # Findings; each instance gets its own fresh list.
    red_flags: List[RedFlag] = field(default_factory=list)
    missing_evidence: List[EvidenceItem] = field(default_factory=list)

    verdict: str = "APPROVE"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class VerificationGate:
    """
    Hardened verification requirements for TaskCompletionChecker.

    Enforces mandatory evidence requirements and detects Red Flags that
    indicate problems with Worker output quality. A CompletionContext is
    evaluated against two class-level tables:

    - RED_FLAGS: predicates that signal a likely quality problem.
    - MANDATORY_EVIDENCE: evidence keys a Worker must supply, either always
      (``required=True``) or only for the roles listed in ``required_for``.

    Verdicts in strict mode:

    - "REJECT": a critical flag fired or always-required evidence is missing.
    - "CONDITIONAL": only non-critical findings (warning flags or missing
      role-specific evidence).
    - "APPROVE": no findings at all.

    In non-strict mode findings are still logged and returned, but they never
    block: the verdict is always "APPROVE".

    Design borrowed from Agent Skills (addyosmani/agent-skills):
    - Every skill ends with mandatory evidence checklist
    - Red Flags provide early warning signals
    - "Seems right" is NEVER sufficient as acceptance criteria
    """

    RED_FLAGS: List[RedFlag] = [
        RedFlag(
            id="no_test_for_new_behavior",
            severity="critical",
            description="Worker produced code changes without corresponding tests",
            detection=lambda ctx: ctx.has_code_changes and not ctx.has_test_changes,
        ),
        RedFlag(
            id="tests_pass_first_run",
            severity="warning",
            description=(
                "Tests pass on first run — may not be testing intended behavior"
            ),
            detection=lambda ctx: (
                ctx.test_run_count == 1 and ctx.all_passed and ctx.has_test_changes
            ),
        ),
        RedFlag(
            id="no_regression_test_for_bugfix",
            severity="critical",
            description="Bug fix task without failing reproduction test",
            detection=lambda ctx: ctx.is_bug_fix and not ctx.has_repro_test,
        ),
        RedFlag(
            id="tests_skipped_or_disabled",
            severity="critical",
            description="Tests were skipped or disabled to make suite pass",
            detection=lambda ctx: ctx.tests_skipped > 0,
        ),
        RedFlag(
            id="coverage_decreased",
            severity="warning",
            description="Code coverage decreased from baseline",
            detection=lambda ctx: ctx.coverage_delta < -0.01,
        ),
        RedFlag(
            id="output_exceeds_limit",
            severity="warning",
            description=(
                "Single Worker output exceeds 100 lines without slicing"
            ),
            detection=lambda ctx: ctx.output_lines > 100 and not ctx.was_sliced,
        ),
        RedFlag(
            id="no_evidence_provided",
            severity="critical",
            description="Worker claims completion without providing evidence",
            detection=lambda ctx: ctx.claims_complete and not ctx.evidence,
        ),
    ]

    MANDATORY_EVIDENCE: List[EvidenceItem] = [
        EvidenceItem(
            key="test_results",
            required=True,
            description="Test execution output showing pass/fail status",
            format_hint='e.g., "pytest: 142 passed, 0 failed in 3.2s"',
        ),
        EvidenceItem(
            key="build_status",
            required_for=["architect", "solo-coder"],
            description="Build success/failure with output",
            format_hint='e.g., "Build succeeded in 1.2s"',
        ),
        EvidenceItem(
            key="diff_summary",
            required=True,
            description="Summary of changes made (files affected, lines changed)",
            format_hint=(
                'e.g., "Modified: dispatcher.py (+23/-5), '
                'Added: ar_engine.py (+89)"'
            ),
        ),
    ]

    def __init__(self, strict_mode: bool = True):
        """
        Initialize VerificationGate.

        Args:
            strict_mode: If True, any critical flag or missing evidence blocks
                approval. If False, findings are only logged as warnings and
                the verdict is always "APPROVE".
        """
        self.strict_mode = strict_mode

    def check(self, context: CompletionContext) -> GateResult:
        """
        Run verification gate against completion context.

        Args:
            context: CompletionContext with Worker result data

        Returns:
            GateResult with passed status, triggered flags,
            missing evidence, and verdict. ``passed`` is True exactly when
            the verdict is "APPROVE".
        """
        triggered_flags: List[RedFlag] = []
        for flag in self.RED_FLAGS:
            try:
                fired = flag.detection(context)
            except Exception as exc:
                # Detectors are arbitrary callables; one failing must not
                # abort the gate. It is logged at warning level because a
                # detector that cannot run is a check that silently did not
                # happen.
                logger.warning(
                    "Red flag detection error for %s: %s", flag.id, exc
                )
                continue
            if fired:
                triggered_flags.append(flag)
                logger.warning(
                    "Red Flag [%s]: %s (role=%s)",
                    flag.id, flag.description, context.role_id,
                )

        missing = self._check_missing_evidence(context)
        for item in missing:
            logger.warning(
                "Missing evidence [%s]: %s (role=%s)",
                item.key, item.description, context.role_id,
            )

        has_blocking_finding = any(
            flag.severity == "critical" for flag in triggered_flags
        ) or any(item.required for item in missing)

        if not self.strict_mode:
            # Advisory mode: findings were logged above and are returned in
            # the result, but they never block approval.
            verdict = "APPROVE"
        elif has_blocking_finding:
            verdict = "REJECT"
        elif triggered_flags or missing:
            verdict = "CONDITIONAL"
        else:
            verdict = "APPROVE"

        return GateResult(
            passed=(verdict == "APPROVE"),
            red_flags=triggered_flags,
            missing_evidence=missing,
            verdict=verdict,
        )

    def _check_missing_evidence(self, context: CompletionContext) -> List[EvidenceItem]:
        """
        Check which mandatory evidence items are missing.

        An item applies when it is always required, or when the context's
        role is listed in its ``required_for``. An applicable item is missing
        when its key is absent from ``context.evidence``.
        """
        missing = []
        for item in self.MANDATORY_EVIDENCE:
            applies = item.required or (
                bool(item.required_for) and context.role_id in item.required_for
            )
            if applies and item.key not in context.evidence:
                missing.append(item)
        return missing

    @staticmethod
    def _has_content(value: Any) -> bool:
        """Return False for None and for empty sized values, True otherwise."""
        if value is None:
            return False
        try:
            return len(value) > 0
        except TypeError:
            # Unsized values (numbers, booleans, ...) count as evidence as-is.
            return True

    @staticmethod
    def _coerce_count(value: Any) -> int:
        """Convert *value* to a non-negative int, falling back to 0."""
        try:
            return max(int(value), 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def build_context_from_worker_result(
        self, worker_result: Dict[str, Any]
    ) -> CompletionContext:
        """
        Build CompletionContext from a raw worker result dict.

        Extracts available fields heuristically from worker result structure.
        Keys read: "role_id" / "role", "output", "success", "errors",
        "tests_skipped", "was_sliced", "verification", "evidence", plus any
        top-level key named after a MANDATORY_EVIDENCE item.

        Evidence is collected from three places so that Workers which do
        supply the mandatory items are not rejected for lack of them:

        1. an optional "evidence" mapping in the result,
        2. top-level keys matching a MANDATORY_EVIDENCE key,
        3. a "verification" dict whose "passed" entry is truthy.

        Empty or None values are not treated as evidence.

        Args:
            worker_result: Dict from DispatchResult.worker_results

        Returns:
            Populated CompletionContext
        """
        role_id = (
            worker_result.get("role_id")
            or worker_result.get("role")
            or "unknown"
        )

        # A missing/None output must count as zero lines, not as the text "None".
        raw_output = worker_result.get("output")
        output = "" if raw_output is None else str(raw_output)
        lowered = output.lower()
        output_lines = len(output.split("\n")) if output else 0

        success = bool(worker_result.get("success", False))
        errors = worker_result.get("errors", [])

        evidence: Dict[str, Any] = {}
        supplied = worker_result.get("evidence")
        if isinstance(supplied, dict):
            for key, value in supplied.items():
                if self._has_content(value):
                    evidence[key] = value
        for item in self.MANDATORY_EVIDENCE:
            if item.key in evidence:
                continue
            value = worker_result.get(item.key)
            if self._has_content(value):
                evidence[item.key] = value
        verification = worker_result.get("verification")
        if isinstance(verification, dict) and verification.get("passed"):
            evidence["verification"] = verification

        return CompletionContext(
            role_id=role_id,
            has_code_changes=output_lines > 10 and success,
            # Only the first 500 characters are inspected for this signal.
            has_test_changes="test" in lowered[:500],
            is_bug_fix=self._is_likely_bug_fix(worker_result),
            has_repro_test="reproduce" in lowered or "test_" in lowered,
            test_run_count=1 if "test" in lowered else 0,
            all_passed=success and not errors,
            # Coerced so the "tests_skipped > 0" detector cannot raise on
            # None or string values.
            tests_skipped=self._coerce_count(worker_result.get("tests_skipped", 0)),
            coverage_delta=0.0,
            output_lines=output_lines,
            was_sliced=bool(worker_result.get("was_sliced", False)),
            claims_complete=success,
            evidence=evidence,
        )

    @staticmethod
    def _is_likely_bug_fix(worker_result: Dict[str, Any]) -> bool:
        """
        Heuristically determine if this looks like a bug fix task.

        Looks for bug-related substrings (English and Chinese) in the
        lower-cased "task_description", falling back to "original_task".
        Matching is by substring, so e.g. "prefix" matches "fix".
        """
        task_desc = str(
            worker_result.get("task_description")
            or worker_result.get("original_task")
            or ""
        ).lower()
        bug_keywords = (
            "fix", "bug", "error", "fail", "crash", "broken",
            "修复", "错误", "失败", "崩溃", "异常",
        )
        return any(kw in task_desc for kw in bug_keywords)

    def get_red_flag_by_id(self, flag_id: str) -> Optional[RedFlag]:
        """Look up a specific RedFlag by ID; return None when there is no match."""
        return next(
            (flag for flag in self.RED_FLAGS if flag.id == flag_id), None
        )

    @property
    def red_flag_count(self) -> int:
        """Total number of defined Red Flags."""
        return len(self.RED_FLAGS)

    @property
    def evidence_item_count(self) -> int:
        """Total number of defined EvidenceItems."""
        return len(self.MANDATORY_EVIDENCE)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def get_shared_gate(strict_mode: bool = True) -> VerificationGate:
    """
    Return the process-wide VerificationGate, creating it on first use.

    The instance is cached as an attribute on this function. ``strict_mode``
    is only consulted when the instance is first created; later calls return
    the cached gate unchanged, whatever value is passed.

    Args:
        strict_mode: If True, critical flags block approval

    Returns:
        Shared VerificationGate instance
    """
    try:
        return get_shared_gate._instance
    except AttributeError:
        # First call: build the gate and remember it for subsequent callers.
        gate = VerificationGate(strict_mode=strict_mode)
        get_shared_gate._instance = gate
        return gate
|