multi-forge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- forge/__init__.py +3 -0
- forge/_extensions/agents/.gitkeep +0 -0
- forge/_extensions/commands/.gitkeep +0 -0
- forge/_extensions/skills/analyze/SKILL.md +87 -0
- forge/_extensions/skills/challenge/SKILL.md +91 -0
- forge/_extensions/skills/consensus/SKILL.md +120 -0
- forge/_extensions/skills/consensus/resources/code_consensus_evaluation.md +94 -0
- forge/_extensions/skills/consensus/resources/consensus_evaluation.md +70 -0
- forge/_extensions/skills/consensus/resources/synthesis.md +101 -0
- forge/_extensions/skills/debate/SKILL.md +116 -0
- forge/_extensions/skills/debate/resources/code_debate_evaluation.md +101 -0
- forge/_extensions/skills/debate/resources/debate_evaluation.md +90 -0
- forge/_extensions/skills/panel/SKILL.md +141 -0
- forge/_extensions/skills/panel/resources/synthesis.md +103 -0
- forge/_extensions/skills/qa/SKILL.md +704 -0
- forge/_extensions/skills/qa/resources/checklist/0-enable.md +78 -0
- forge/_extensions/skills/qa/resources/checklist/1-preflight.md +24 -0
- forge/_extensions/skills/qa/resources/checklist/10-resume.md +143 -0
- forge/_extensions/skills/qa/resources/checklist/11-config.md +150 -0
- forge/_extensions/skills/qa/resources/checklist/12-search.md +58 -0
- forge/_extensions/skills/qa/resources/checklist/13-guard.md +237 -0
- forge/_extensions/skills/qa/resources/checklist/14-workflow.md +305 -0
- forge/_extensions/skills/qa/resources/checklist/15-skills.md +155 -0
- forge/_extensions/skills/qa/resources/checklist/16-handoff.md +224 -0
- forge/_extensions/skills/qa/resources/checklist/17-info.md +50 -0
- forge/_extensions/skills/qa/resources/checklist/18-disable.md +84 -0
- forge/_extensions/skills/qa/resources/checklist/19-uninstall.md +146 -0
- forge/_extensions/skills/qa/resources/checklist/2-extensions.md +188 -0
- forge/_extensions/skills/qa/resources/checklist/20-cleanup.md +36 -0
- forge/_extensions/skills/qa/resources/checklist/3-auth.md +234 -0
- forge/_extensions/skills/qa/resources/checklist/4-proxy.md +481 -0
- forge/_extensions/skills/qa/resources/checklist/5-session.md +541 -0
- forge/_extensions/skills/qa/resources/checklist/6-hooks.md +275 -0
- forge/_extensions/skills/qa/resources/checklist/7-costs.md +309 -0
- forge/_extensions/skills/qa/resources/checklist/8-status-line.md +174 -0
- forge/_extensions/skills/qa/resources/checklist/9-direct-commands.md +146 -0
- forge/_extensions/skills/qa/resources/checklist.md +103 -0
- forge/_extensions/skills/qa/resources/report-template.md +62 -0
- forge/_extensions/skills/qa/scripts/start-container.sh +529 -0
- forge/_extensions/skills/qa/scripts/walkthrough-state.py +1137 -0
- forge/_extensions/skills/review/SKILL.md +125 -0
- forge/_extensions/skills/review/references/claude-4.6.md +474 -0
- forge/_extensions/skills/review/references/claude-4.7.md +710 -0
- forge/_extensions/skills/review/references/gemini-3.1.md +546 -0
- forge/_extensions/skills/review/references/gpt-5.5.md +490 -0
- forge/_extensions/skills/review/references/skills-writing-guide.md +1588 -0
- forge/_extensions/skills/review/resources/code-anthropic.md +160 -0
- forge/_extensions/skills/review/resources/code-gemini.md +184 -0
- forge/_extensions/skills/review/resources/code-openai.md +203 -0
- forge/_extensions/skills/review/resources/code.md +160 -0
- forge/_extensions/skills/review-docs/SKILL.md +121 -0
- forge/_extensions/skills/review-docs/resources/docs-anthropic.md +170 -0
- forge/_extensions/skills/review-docs/resources/docs-gemini.md +204 -0
- forge/_extensions/skills/review-docs/resources/docs-openai.md +231 -0
- forge/_extensions/skills/review-docs/resources/docs.md +170 -0
- forge/_extensions/skills/smoke-test/SKILL.md +27 -0
- forge/_extensions/skills/smoke-test/scripts/smoke-test.sh +118 -0
- forge/_extensions/skills/understand/SKILL.md +148 -0
- forge/_extensions/skills/understand/resources/code-anthropic.md +163 -0
- forge/_extensions/skills/understand/resources/code-gemini.md +194 -0
- forge/_extensions/skills/understand/resources/code-openai.md +181 -0
- forge/_extensions/skills/understand/resources/code.md +163 -0
- forge/_extensions/skills/understand/resources/docs-anthropic.md +177 -0
- forge/_extensions/skills/understand/resources/docs-gemini.md +202 -0
- forge/_extensions/skills/understand/resources/docs-openai.md +191 -0
- forge/_extensions/skills/understand/resources/docs.md +177 -0
- forge/_extensions/skills/walkthrough/SKILL.md +599 -0
- forge/_extensions/skills/walkthrough/resources/checklist.md +765 -0
- forge/_extensions/skills/walkthrough/scripts/run-in-repo.sh +118 -0
- forge/_extensions/skills/walkthrough/scripts/setup-test-repo.sh +198 -0
- forge/_extensions/skills/walkthrough/scripts/walkthrough-state.py +1137 -0
- forge/backend/__init__.py +174 -0
- forge/backend/adapters/__init__.py +38 -0
- forge/backend/adapters/litellm.py +158 -0
- forge/backend/creation.py +89 -0
- forge/backend/registry.py +178 -0
- forge/cli/__init__.py +16 -0
- forge/cli/auth.py +483 -0
- forge/cli/backend.py +298 -0
- forge/cli/claude.py +411 -0
- forge/cli/config_cmd.py +303 -0
- forge/cli/extensions.py +1001 -0
- forge/cli/gc.py +165 -0
- forge/cli/guard.py +1018 -0
- forge/cli/guards.py +106 -0
- forge/cli/handoff.py +110 -0
- forge/cli/hooks/__init__.py +36 -0
- forge/cli/hooks/_group.py +20 -0
- forge/cli/hooks/_helpers.py +149 -0
- forge/cli/hooks/commands.py +1677 -0
- forge/cli/hooks/direct_commands.py +1304 -0
- forge/cli/hooks/install.py +232 -0
- forge/cli/hooks/policy.py +151 -0
- forge/cli/hooks/read_hygiene.py +74 -0
- forge/cli/hooks/verification.py +370 -0
- forge/cli/logs.py +406 -0
- forge/cli/main.py +292 -0
- forge/cli/proxy.py +1821 -0
- forge/cli/proxy_costs.py +313 -0
- forge/cli/search.py +416 -0
- forge/cli/session.py +892 -0
- forge/cli/session_addendum.py +81 -0
- forge/cli/session_fork.py +750 -0
- forge/cli/session_handoff.py +141 -0
- forge/cli/session_lifecycle.py +2053 -0
- forge/cli/session_manage.py +1336 -0
- forge/cli/session_memory.py +201 -0
- forge/cli/status_line.py +1398 -0
- forge/cli/workflow.py +1964 -0
- forge/config/__init__.py +110 -0
- forge/config/dataclass_utils.py +88 -0
- forge/config/defaults/__init__.py +0 -0
- forge/config/defaults/backends/__init__.py +0 -0
- forge/config/defaults/backends/litellm.yaml +196 -0
- forge/config/defaults/templates/__init__.py +0 -0
- forge/config/defaults/templates/litellm-anthropic-local.yaml +33 -0
- forge/config/defaults/templates/litellm-anthropic.yaml +24 -0
- forge/config/defaults/templates/litellm-gemini-flash-local.yaml +37 -0
- forge/config/defaults/templates/litellm-gemini-local.yaml +32 -0
- forge/config/defaults/templates/litellm-gemini-test.yaml +34 -0
- forge/config/defaults/templates/litellm-gemini.yaml +21 -0
- forge/config/defaults/templates/litellm-openai-codex-local.yaml +36 -0
- forge/config/defaults/templates/litellm-openai-local.yaml +38 -0
- forge/config/defaults/templates/litellm-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-anthropic.yaml +23 -0
- forge/config/defaults/templates/openrouter-deepseek.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini-flash.yaml +26 -0
- forge/config/defaults/templates/openrouter-gemini.yaml +23 -0
- forge/config/defaults/templates/openrouter-glm.yaml +23 -0
- forge/config/defaults/templates/openrouter-kimi.yaml +30 -0
- forge/config/defaults/templates/openrouter-minimax.yaml +26 -0
- forge/config/defaults/templates/openrouter-openai-codex.yaml +23 -0
- forge/config/defaults/templates/openrouter-openai.yaml +28 -0
- forge/config/defaults/templates/openrouter-qwen.yaml +25 -0
- forge/config/loader.py +675 -0
- forge/config/schema.py +448 -0
- forge/core/__init__.py +5 -0
- forge/core/auth/__init__.py +67 -0
- forge/core/auth/capabilities.py +219 -0
- forge/core/auth/credentials_file.py +244 -0
- forge/core/auth/protocols.py +18 -0
- forge/core/auth/secrets.py +243 -0
- forge/core/auth/template_secrets.py +112 -0
- forge/core/data/__init__.py +5 -0
- forge/core/data/model_catalog.yaml +1522 -0
- forge/core/data/pricing.yaml +140 -0
- forge/core/data/system_prompt_addendums/__init__.py +0 -0
- forge/core/data/system_prompt_addendums/gemini.md +330 -0
- forge/core/data/system_prompt_addendums/openai.md +328 -0
- forge/core/llm/__init__.py +231 -0
- forge/core/llm/clients/__init__.py +14 -0
- forge/core/llm/clients/base.py +115 -0
- forge/core/llm/clients/litellm.py +619 -0
- forge/core/llm/clients/openai_compat.py +244 -0
- forge/core/llm/clients/openrouter.py +234 -0
- forge/core/llm/credentials.py +439 -0
- forge/core/llm/detection.py +86 -0
- forge/core/llm/errors.py +44 -0
- forge/core/llm/protocols.py +80 -0
- forge/core/llm/types.py +176 -0
- forge/core/logging.py +146 -0
- forge/core/models/__init__.py +91 -0
- forge/core/models/catalog.py +467 -0
- forge/core/models/pricing.py +165 -0
- forge/core/models/types.py +167 -0
- forge/core/naming.py +212 -0
- forge/core/ops/__init__.py +73 -0
- forge/core/ops/context.py +141 -0
- forge/core/ops/gc.py +802 -0
- forge/core/ops/proxy.py +146 -0
- forge/core/ops/resolution.py +135 -0
- forge/core/ops/session.py +344 -0
- forge/core/ops/session_context.py +548 -0
- forge/core/paths.py +38 -0
- forge/core/process.py +54 -0
- forge/core/reactive/__init__.py +38 -0
- forge/core/reactive/cost_tracking.py +300 -0
- forge/core/reactive/env.py +180 -0
- forge/core/reactive/proxy.py +78 -0
- forge/core/reactive/routing.py +622 -0
- forge/core/reactive/session_runner.py +185 -0
- forge/core/reactive/structured_output.py +62 -0
- forge/core/reactive/tagger.py +94 -0
- forge/core/reactive/throttle.py +132 -0
- forge/core/state/__init__.py +59 -0
- forge/core/state/exceptions.py +59 -0
- forge/core/state/io.py +140 -0
- forge/core/state/lock.py +99 -0
- forge/core/state/timestamps.py +60 -0
- forge/core/transcript.py +78 -0
- forge/core/typing_helpers.py +24 -0
- forge/core/workqueue/__init__.py +67 -0
- forge/core/workqueue/queue.py +552 -0
- forge/core/workqueue/types.py +63 -0
- forge/guard/__init__.py +26 -0
- forge/guard/deterministic/__init__.py +26 -0
- forge/guard/deterministic/base.py +158 -0
- forge/guard/deterministic/coding_standards.py +256 -0
- forge/guard/deterministic/registry.py +148 -0
- forge/guard/deterministic/tdd.py +171 -0
- forge/guard/engine.py +216 -0
- forge/guard/protocols.py +91 -0
- forge/guard/queries.py +96 -0
- forge/guard/semantic/__init__.py +34 -0
- forge/guard/semantic/promotion.py +18 -0
- forge/guard/semantic/supervisor.py +813 -0
- forge/guard/semantic/verdict.py +183 -0
- forge/guard/store.py +124 -0
- forge/guard/team/__init__.py +6 -0
- forge/guard/team/config.py +24 -0
- forge/guard/team/handlers.py +209 -0
- forge/guard/team/prompts.py +41 -0
- forge/guard/types.py +125 -0
- forge/guard/workflow/__init__.py +17 -0
- forge/guard/workflow/branches.py +67 -0
- forge/guard/workflow/config.py +63 -0
- forge/guard/workflow/divergence.py +113 -0
- forge/guard/workflow/policy.py +87 -0
- forge/guard/workflow/stages.py +205 -0
- forge/install/__init__.py +55 -0
- forge/install/cli.py +281 -0
- forge/install/exceptions.py +163 -0
- forge/install/hooks.py +109 -0
- forge/install/installer.py +1037 -0
- forge/install/models.py +321 -0
- forge/install/preset.py +272 -0
- forge/install/settings_merge.py +831 -0
- forge/install/tracking.py +238 -0
- forge/install/version.py +141 -0
- forge/proxy/__init__.py +0 -0
- forge/proxy/base_client.py +181 -0
- forge/proxy/client_adapter.py +476 -0
- forge/proxy/client_factory.py +531 -0
- forge/proxy/converters.py +1206 -0
- forge/proxy/cost_logger.py +132 -0
- forge/proxy/cost_tracker.py +242 -0
- forge/proxy/data_models.py +338 -0
- forge/proxy/error_hints.py +92 -0
- forge/proxy/metrics.py +222 -0
- forge/proxy/model_spec.py +158 -0
- forge/proxy/proxies.py +333 -0
- forge/proxy/proxy_identity.py +134 -0
- forge/proxy/proxy_orchestrator.py +1018 -0
- forge/proxy/proxy_startup.py +54 -0
- forge/proxy/server.py +1561 -0
- forge/proxy/utils.py +537 -0
- forge/review/__init__.py +6 -0
- forge/review/adversarial.py +111 -0
- forge/review/consensus.py +236 -0
- forge/review/engine.py +356 -0
- forge/review/models.py +437 -0
- forge/review/resources/__init__.py +5 -0
- forge/review/resources/codereview-performance.md +85 -0
- forge/review/resources/codereview-quick.md +75 -0
- forge/review/resources/codereview-security.md +92 -0
- forge/review/resources/codereview.md +85 -0
- forge/review/resources/docreview-quick.md +75 -0
- forge/review/resources/docreview.md +86 -0
- forge/review/resources/thinkdeep.md +89 -0
- forge/review/routing.py +368 -0
- forge/review/synthesis.py +73 -0
- forge/runtime_config.py +438 -0
- forge/search/__init__.py +55 -0
- forge/search/bm25_store.py +264 -0
- forge/search/content_store.py +197 -0
- forge/search/engine.py +352 -0
- forge/search/exceptions.py +51 -0
- forge/search/extractor.py +234 -0
- forge/search/index_state.py +295 -0
- forge/search/store.py +215 -0
- forge/search/tokenizer.py +24 -0
- forge/session/__init__.py +130 -0
- forge/session/active.py +339 -0
- forge/session/artifacts.py +202 -0
- forge/session/claude/__init__.py +50 -0
- forge/session/claude/cleanup.py +105 -0
- forge/session/claude/invoke.py +236 -0
- forge/session/claude/paths.py +200 -0
- forge/session/cleanup.py +216 -0
- forge/session/config.py +34 -0
- forge/session/direct_model.py +107 -0
- forge/session/effective.py +169 -0
- forge/session/exceptions.py +255 -0
- forge/session/handoff.py +881 -0
- forge/session/handoff_agent.py +544 -0
- forge/session/hooks/__init__.py +35 -0
- forge/session/hooks/models.py +73 -0
- forge/session/hooks/session_start.py +507 -0
- forge/session/identity.py +84 -0
- forge/session/index.py +553 -0
- forge/session/manager.py +1506 -0
- forge/session/models.py +572 -0
- forge/session/overrides.py +344 -0
- forge/session/plan_resolution.py +286 -0
- forge/session/prev_sessions.py +128 -0
- forge/session/store.py +431 -0
- forge/session/validation.py +47 -0
- forge/session/worktree/__init__.py +65 -0
- forge/session/worktree/cleanup.py +262 -0
- forge/session/worktree/config_copy.py +203 -0
- forge/session/worktree/create.py +332 -0
- forge/sidecar/__init__.py +29 -0
- forge/sidecar/container.py +161 -0
- forge/sidecar/docker.py +86 -0
- forge/sidecar/secrets.py +19 -0
- multi_forge-0.2.0.dist-info/METADATA +242 -0
- multi_forge-0.2.0.dist-info/RECORD +311 -0
- multi_forge-0.2.0.dist-info/WHEEL +4 -0
- multi_forge-0.2.0.dist-info/entry_points.txt +2 -0
- multi_forge-0.2.0.dist-info/licenses/LICENSE +203 -0
- multi_forge-0.2.0.dist-info/licenses/NOTICE +14 -0
forge/cli/workflow.py
ADDED
|
@@ -0,0 +1,1964 @@
|
|
|
1
|
+
"""Workflow runner CLI commands.
|
|
2
|
+
|
|
3
|
+
Provides:
|
|
4
|
+
- forge workflow panel: Fan out review with check gating
|
|
5
|
+
- forge workflow analyze: Deep single-model analysis
|
|
6
|
+
- forge workflow debate: Adversarial evaluation with stance injection
|
|
7
|
+
- forge workflow consensus: Two-round multi-model consensus building
|
|
8
|
+
- forge workflow list-models: Show available model backends
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
import tempfile
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import click
|
|
20
|
+
from rich.console import Console
|
|
21
|
+
|
|
22
|
+
from forge.proxy.proxies import ProxyResolutionError
|
|
23
|
+
from forge.review.models import (
|
|
24
|
+
NAMED_ROLES,
|
|
25
|
+
AdversarialOutput,
|
|
26
|
+
ConsensusOutput,
|
|
27
|
+
ModelSpec,
|
|
28
|
+
MultiReviewOutput,
|
|
29
|
+
ReviewResult,
|
|
30
|
+
RoleSpec,
|
|
31
|
+
StanceSpec,
|
|
32
|
+
resolve_model_specs,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Verdict strings treated as "pass" by --check gating.
|
|
36
|
+
# ACCEPT/ACCEPT_WITH_CONDITIONS from debate resources;
|
|
37
|
+
# PASS/PASSED/TRUE as general-purpose aliases for other resources.
|
|
38
|
+
_ACCEPTING_VERDICTS = frozenset(
|
|
39
|
+
{
|
|
40
|
+
"ACCEPT",
|
|
41
|
+
"ACCEPT_WITH_CONDITIONS",
|
|
42
|
+
"PASS",
|
|
43
|
+
"PASSED",
|
|
44
|
+
"TRUE",
|
|
45
|
+
"SUPPORT",
|
|
46
|
+
"SUPPORT_WITH_CONDITIONS",
|
|
47
|
+
}
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _coerce_passed(val: Any) -> bool:
|
|
52
|
+
"""Coerce a 'passed' field to bool, handling string 'false' correctly.
|
|
53
|
+
|
|
54
|
+
Without this, ``bool("false")`` is ``True`` in Python -- a real CI bug
|
|
55
|
+
when models emit ``{"passed": "false"}`` as a string.
|
|
56
|
+
"""
|
|
57
|
+
if isinstance(val, bool):
|
|
58
|
+
return val
|
|
59
|
+
if isinstance(val, str):
|
|
60
|
+
return val.lower() in ("true", "1", "yes")
|
|
61
|
+
return bool(val)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Module-wide Rich console used by the commands and helpers in this file for
# human-facing (non-JSON) output.
console = Console()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _run_preflight(
    specs: list[ModelSpec],
    *,
    json_output: bool = False,
    routing_plan: Any | None = None,
) -> None:
    """Check resolved routing/auth before spawning workers. Exit 1 on failure.

    Args:
        specs: Model specs about to be run.
        json_output: When true, failures are emitted as one JSON object on
            stdout instead of Rich-formatted text, and routing warnings are
            not printed on the success path.
        routing_plan: Routing plan forwarded to ``preflight_check`` and used
            to collect per-route warnings; may be ``None``.

    Returns:
        ``None`` when preflight reports no errors. On any error the process
        exits with status 1 via ``sys.exit``.
    """
    from rich.markup import escape

    from forge.review.engine import preflight_check

    errors = preflight_check(specs, routing_plan=routing_plan)
    warnings = _routing_plan_warnings(specs, routing_plan)

    if not errors:
        if not json_output:
            for warning in warnings:
                # Warning text is dynamic; escape it so bracketed fragments
                # are shown literally instead of being parsed as Rich markup.
                console.print(f"[yellow]Routing warning:[/yellow] {escape(str(warning))}")
        return

    if json_output:
        data: dict[str, Any] = {"preflight_errors": errors}
        if warnings:
            data["routing_warnings"] = warnings
        click.echo(json.dumps(data))
    else:
        console.print("[red]Error:[/red] Workflow preflight failed:")
        for err in errors:
            # Same reasoning as above: error strings may contain "[...]".
            console.print(f" - {escape(str(err))}")
        console.print(
            "\n[dim]Tip: Check model availability with 'forge workflow list-models'.\n"
            "Check proxy status: 'forge proxy list'\n"
            "Check auth status: 'forge auth status'\n"
            "Create a proxy: 'forge proxy create <template>'\n"
            "Check worker runtime: 'command -v claude'[/dim]"
        )
    sys.exit(1)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _routing_plan_warnings(specs: list[ModelSpec], routing_plan: Any | None) -> list[str]:
|
|
103
|
+
"""Return deduped route warnings for human-facing workflow output."""
|
|
104
|
+
if routing_plan is None:
|
|
105
|
+
return []
|
|
106
|
+
|
|
107
|
+
warnings: list[str] = []
|
|
108
|
+
seen: set[str] = set()
|
|
109
|
+
for spec, result in zip(specs, routing_plan.routes):
|
|
110
|
+
if not result.warning:
|
|
111
|
+
continue
|
|
112
|
+
message = f"{spec.name}: {result.warning}"
|
|
113
|
+
if message in seen:
|
|
114
|
+
continue
|
|
115
|
+
seen.add(message)
|
|
116
|
+
warnings.append(message)
|
|
117
|
+
return warnings
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _resolved_models_summary(
|
|
121
|
+
specs: list[ModelSpec],
|
|
122
|
+
routing_plan: Any | None,
|
|
123
|
+
*,
|
|
124
|
+
worker_ids: list[str] | None = None,
|
|
125
|
+
roles: dict[str, str] | None = None,
|
|
126
|
+
role_field: str = "role",
|
|
127
|
+
) -> dict[str, dict[str, Any]]:
|
|
128
|
+
"""Return user-facing model routing metadata for workflow output."""
|
|
129
|
+
if routing_plan is None:
|
|
130
|
+
return {}
|
|
131
|
+
|
|
132
|
+
summary: dict[str, dict[str, Any]] = {}
|
|
133
|
+
for idx, (spec, result) in enumerate(zip(specs, routing_plan.routes)):
|
|
134
|
+
route = result.route
|
|
135
|
+
worker_id = worker_ids[idx] if worker_ids and idx < len(worker_ids) else spec.effective_worker_id
|
|
136
|
+
entry: dict[str, Any] = {
|
|
137
|
+
"requested_model": spec.name,
|
|
138
|
+
"model_id": spec.model_id,
|
|
139
|
+
"resolved_model": route.model_ref if route else None,
|
|
140
|
+
"provider": route.provider if route else None,
|
|
141
|
+
"source": result.source,
|
|
142
|
+
"proxy": result.proxy_id,
|
|
143
|
+
"template": result.template or (route.template_id if route else None),
|
|
144
|
+
}
|
|
145
|
+
if roles and worker_id in roles:
|
|
146
|
+
entry[role_field] = roles[worker_id]
|
|
147
|
+
if result.warning:
|
|
148
|
+
entry["warning"] = result.warning
|
|
149
|
+
summary[worker_id] = entry
|
|
150
|
+
return summary
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _format_resolved_models(summary: dict[str, dict[str, Any]]) -> str:
|
|
154
|
+
"""Format resolved model metadata for non-JSON workflow output."""
|
|
155
|
+
if not summary:
|
|
156
|
+
return ""
|
|
157
|
+
|
|
158
|
+
lines = ["Resolved models:"]
|
|
159
|
+
for worker_id, item in summary.items():
|
|
160
|
+
resolved = item.get("resolved_model") or "(unresolved)"
|
|
161
|
+
provider = item.get("provider") or "unknown"
|
|
162
|
+
proxy = item.get("proxy") or "(direct)"
|
|
163
|
+
template = item.get("template") or "(direct)"
|
|
164
|
+
requested = item.get("requested_model") or worker_id
|
|
165
|
+
role = f", role={item['role']}" if item.get("role") else ""
|
|
166
|
+
stance = f", stance={item['stance']}" if item.get("stance") else ""
|
|
167
|
+
lines.append(
|
|
168
|
+
f"- {worker_id}: requested={requested}, resolved={resolved}, "
|
|
169
|
+
f"provider={provider}, proxy={proxy}, template={template}{role}{stance}"
|
|
170
|
+
)
|
|
171
|
+
return "\n".join(lines) + "\n\n"
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _handle_routing_error(error: Exception, *, json_output: bool = False) -> None:
    """Handle routing resolution errors with clean CLI output. Calls sys.exit(1).

    Args:
        error: The exception raised while resolving routing.
        json_output: When true, emit ``{"routing_error": "<message>"}`` on
            stdout; otherwise print a Rich-formatted error line.
    """
    msg = str(error)
    if json_output:
        click.echo(json.dumps({"routing_error": msg}))
    else:
        from rich.markup import escape

        # Exception text is arbitrary: without escaping, a fragment such as
        # "[a, b]" is swallowed as a style tag and "[/x]" raises MarkupError.
        console.print(f"[red]Error:[/red] Routing failed: {escape(msg)}")
    sys.exit(1)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Exception types the workflow commands catch around routing resolution and
# hand to _handle_routing_error for clean CLI output.
_ROUTING_ERRORS = (RuntimeError, ValueError, ProxyResolutionError)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _load_workflow_resource(name: str) -> str:
    """Return the UTF-8 text of the bundled workflow resource called *name*.

    The resource is looked up inside the ``forge.review.resources`` package
    through ``importlib.resources``.
    """
    import importlib.resources

    package_root = importlib.resources.files("forge.review.resources")
    resource = package_root.joinpath(name)
    return resource.read_text(encoding="utf-8")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Click command group for the workflow runners; the subcommands in this file
# attach to it with @workflow_cmd.command(...). Both -h and --help show help.
# The docstring below is the group's help text: the lone "\b" line is click's
# marker to keep the following paragraph's line breaks (no re-wrapping).
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def workflow_cmd() -> None:
    """Run multi-model workflows.

    \b
    Examples:
      forge workflow panel docs/design.md # Multi-model doc review
      forge workflow analyze "Should we use X?" # Deep single-model analysis
      forge workflow debate "Proposal" --code # Adversarial code eval
    """
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@workflow_cmd.command(name="list-models")
@click.option("--json", "json_output", is_flag=True, help="Output as JSON")
@click.option("--available", "available_only", is_flag=True, help="Show only ready models")
def list_models(json_output: bool, available_only: bool) -> None:
    """Show available model backends for workflow runners."""
    from forge.review.models import available_model_specs, check_model_availability

    entries = check_model_availability(available_model_specs())
    if available_only:
        # --available keeps only entries whose status is exactly "ready".
        entries = [entry for entry in entries if entry.status == "ready"]

    if json_output:
        payload = []
        for entry in entries:
            spec = entry.spec
            payload.append(
                {
                    "name": spec.name,
                    "model_id": spec.model_id,
                    "family": spec.family,
                    "provider_refs": list(spec.provider_refs),
                    "preferred_proxy": spec.preferred_proxy,
                    "description": spec.description,
                    "status": entry.status,
                    "reason": entry.reason,
                }
            )
        click.echo(json.dumps(payload, indent=2))
        return

    if entries:
        _print_grouped_models(entries)
        return

    # Nothing to list in human-readable mode: point at the likely fixes.
    console.print(
        "[yellow]No models are currently ready.[/yellow]\n"
        "[dim]Tip: Check 'forge proxy list' and 'forge auth status'.[/dim]"
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _primary_credential(spec: ModelSpec) -> str:
    """Return the credential name of the first route derived for *spec*.

    Routes come from ``derive_model_routes(spec)``; when it yields none, the
    placeholder ``"unknown"`` is returned instead.
    """
    from forge.review.routing import derive_model_routes

    derived = derive_model_routes(spec)
    return derived[0].credential if derived else "unknown"
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _credential_env_var(credential_name: str) -> str:
    """Return the display env var for a credential, or ``""`` if none applies.

    Looks the credential up in ``CREDENTIALS`` and returns the name of its
    first env var flagged both ``required`` and ``secret``.
    """
    from forge.core.auth.capabilities import CREDENTIALS

    credential = CREDENTIALS.get(credential_name)
    if not credential:
        return ""

    primary_names = (
        env_var.name for env_var in credential.env_vars if env_var.required and env_var.secret
    )
    return next(primary_names, "")
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _credential_configured(credential_name: str) -> bool:
    """Report whether the credential's primary secret can be resolved.

    Returns ``False`` when the credential has no primary env var; otherwise
    ``True`` exactly when ``resolve_env_or_credential`` returns a non-``None``
    value for that env var.
    """
    primary_var = _credential_env_var(credential_name)
    if primary_var:
        # Imported only when there is actually something to resolve.
        from forge.core.auth.template_secrets import resolve_env_or_credential

        return resolve_env_or_credential(primary_var) is not None
    return False
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _print_grouped_models(availabilities: list) -> None:
    """Print models grouped by primary credential.

    Args:
        availabilities: Availability records; each is read for ``spec``
            (``name``, ``description``), ``status`` and ``reason``.

    Groups appear in the order their credential is first encountered. Each
    group header shows the credential, its primary env var (if any) and
    whether that secret is configured, followed by one row per model.
    """
    from rich.markup import escape

    status_styles = {"ready": "green", "unavailable": "yellow", "error": "red"}

    # A plain dict keeps insertion order, so no OrderedDict is needed.
    groups: dict[str, list] = {}
    for availability in availabilities:
        groups.setdefault(_primary_credential(availability.spec), []).append(availability)

    console.print("\n[bold]Available Models[/bold]\n")

    for cred_name, items in groups.items():
        env_var = _credential_env_var(cred_name)
        configured = _credential_configured(cred_name)
        config_tag = "[green]configured[/green]" if configured else "[yellow]not configured[/yellow]"
        env_display = f" ({env_var})" if env_var else ""
        console.print(f" [bold]{cred_name}[/bold]{env_display} [{config_tag}]")

        for a in items:
            style = status_styles.get(a.status)
            if style:
                status_cell = f"[{style}]{a.status}[/{style}]"
            else:
                # Unknown status: wrapping it in an empty style would emit a
                # dangling "[/]" closing tag, which Rich rejects. Print it bare.
                status_cell = escape(str(a.status))

            desc = a.spec.description
            if a.reason:
                # The reason is free text; escape it so brackets stay literal.
                desc += f" [dim]({escape(str(a.reason))})[/dim]"
            console.print(f" [cyan]{a.spec.name:<24}[/cyan] {desc:<50} {status_cell}")
        console.print()
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
@workflow_cmd.command(name="panel")
@click.argument("target", nargs=-1)
@click.option("-p", "--prompt", type=str, default=None, help="Review prompt")
@click.option(
    "--code",
    "code_mode",
    is_flag=True,
    help="Use code review framework (default: document review)",
)
@click.option(
    "--context",
    "context_mode",
    type=str,
    default="blind",
    help='Context mode: "blind" (default) or "resume:<uuid>"',
)
@click.option(
    "--models",
    "-m",
    type=str,
    default=None,
    help="Comma-separated model names (default: all)",
)
@click.option("--timeout", "-t", type=int, default=600, help="Per-model timeout in seconds")
@click.option("--json", "json_output", is_flag=True, help="Output structured JSON")
@click.option(
    "--check",
    "check_mode",
    is_flag=True,
    help="Gate on results: exit 0 if passed, exit 1 if failed",
)
@click.option(
    "--roles",
    type=str,
    default=None,
    help=f"Comma-separated reviewer roles ({','.join(sorted(NAMED_ROLES))})",
)
@click.option(
    "--review-type",
    type=click.Choice(["full", "security", "performance", "quick"]),
    default="full",
    help="Review focus area (security/performance require --code)",
)
@click.option(
    "--severity",
    type=click.Choice(["high", "critical"]),
    default=None,
    help="Minimum severity to report",
)
@click.option("--proxy", "via", type=str, default=None, help="Route proxy-backed workers through this proxy")
# file_okay=False: the value is used as a working directory, so a path that
# exists but is a regular file is rejected at argument-parsing time.
@click.option("--cwd", type=click.Path(exists=True, file_okay=False), default=None, help="Working directory")
@click.pass_context
def panel(
    ctx: click.Context,
    target: tuple[str, ...],
    prompt: str | None,
    code_mode: bool,
    context_mode: str,
    models: str | None,
    timeout: int,
    json_output: bool,
    check_mode: bool,
    roles: str | None,
    review_type: str,
    severity: str | None,
    via: str | None,
    cwd: str | None,
) -> None:
    """Fan out a review to multiple models.

    \b
    Examples:
      forge workflow panel docs/design.md # docs review (default)
      forge workflow panel src/forge/cli/ --code # code review
      forge workflow panel -p "Review the error handling" # custom prompt
      forge workflow panel src/ --code --roles security,architecture
      forge workflow panel src/ --code --review-type security --severity high
    """
    # NOTE: the docstring above is the command's --help text, so implementation
    # notes live in comments. Flow: parse --context, compose the prompt
    # (base -> severity suffix -> per-worker role prefix), resolve models and
    # routing, run preflight, fan out, then hand the result to
    # _handle_review_output. Usage errors exit with status 2; ctx.exit raises,
    # and the `return` after each call is purely defensive.
    from rich.markup import escape

    resume_id: str | None = None
    if context_mode == "blind":
        pass
    elif context_mode.startswith("resume:"):
        resume_id = context_mode[len("resume:") :]
        if not resume_id:
            console.print("[red]Error:[/red] --context resume:<uuid> requires a UUID.")
            ctx.exit(2)
            return
    else:
        # context_mode is raw user input; escape it so brackets print literally.
        console.print(
            f'[red]Error:[/red] Invalid --context "{escape(context_mode)}".' ' Use "blind" or "resume:<uuid>".'
        )
        ctx.exit(2)
        return

    # Prompt composition: (1) resolve base prompt/resource
    resolved_prompt = _resolve_panel_prompt(target, prompt, code_mode, review_type)
    if resolved_prompt is None:
        console.print("[red]Error:[/red] No prompt provided. Use target argument, -p, or stdin.")
        ctx.exit(2)
        return

    # Validate review-type/code-mode interaction.
    # Only applies when a review resource is loaded (target-based prompt).
    # Skip when -p or stdin provided a custom prompt (review_type is ignored).
    uses_resource = not prompt and bool(target)
    if uses_resource and review_type in ("security", "performance") and not code_mode:
        console.print(f"[red]Error:[/red] --review-type {review_type} requires --code.")
        ctx.exit(2)
        return

    # Prompt composition: (2) append severity suffix
    if severity:
        resolved_prompt += (
            f"\n\nIMPORTANT: Report only {severity}-severity findings or above. "
            f"Skip lower-severity issues. If no findings meet the {severity} threshold, "
            f"explicitly state: 'No findings at or above {severity} severity.'"
        )

    try:
        specs = resolve_model_specs(models)
    except ValueError as e:
        console.print(f"[red]Error:[/red] {escape(str(e))}")
        ctx.exit(2)
        return

    # Prompt composition: (3) prepend per-worker role prefix
    if roles:
        try:
            role_list = _parse_roles(roles)
        except ValueError as e:
            console.print(f"[red]Error:[/red] {escape(str(e))}")
            ctx.exit(2)
            return
        specs = _apply_panel_roles(specs, role_list, resolved_prompt)

    from forge.core.reactive.cost_tracking import (
        resolve_proxy_urls_from_plan,
        track_verb_cost,
    )
    from forge.review.engine import run_multi_review
    from forge.review.routing import resolve_invocation_routing

    try:
        routing_plan = resolve_invocation_routing(specs, via=via)
    except _ROUTING_ERRORS as e:
        # _handle_routing_error prints the failure and calls sys.exit(1).
        _handle_routing_error(e, json_output=json_output)
        return

    # Exits with status 1 itself when preflight reports errors.
    _run_preflight(specs, json_output=json_output, routing_plan=routing_plan)

    with track_verb_cost("panel", resolve_proxy_urls_from_plan(routing_plan)):
        output = run_multi_review(
            resolved_prompt,
            models=specs,
            routing_plan=routing_plan,
            timeout_seconds=timeout,
            cwd=cwd or str(Path.cwd()),
            resume_id=resume_id,
        )

    _handle_review_output(
        ctx,
        output,
        check_mode=check_mode,
        json_output=json_output,
        resolved_models=_resolved_models_summary(specs, routing_plan),
        routing_warnings=_routing_plan_warnings(specs, routing_plan),
    )
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def _resolve_panel_prompt(
|
|
480
|
+
target: tuple[str, ...],
|
|
481
|
+
prompt: str | None,
|
|
482
|
+
code_mode: bool,
|
|
483
|
+
review_type: str = "full",
|
|
484
|
+
) -> str | None:
|
|
485
|
+
"""Resolve prompt for panel command. Priority: -p > target+framework > stdin.
|
|
486
|
+
|
|
487
|
+
When -p is provided, review_type is ignored (custom prompt overrides).
|
|
488
|
+
"""
|
|
489
|
+
if prompt:
|
|
490
|
+
return prompt
|
|
491
|
+
|
|
492
|
+
resolved_target = " ".join(target) if target else None
|
|
493
|
+
if resolved_target:
|
|
494
|
+
resource_name = _load_review_resource_name(code_mode, review_type)
|
|
495
|
+
framework = _load_workflow_resource(resource_name)
|
|
496
|
+
return f"{framework}\n\n---\n\n## Review Target\n\n{resolved_target}\n"
|
|
497
|
+
|
|
498
|
+
if not sys.stdin.isatty():
|
|
499
|
+
text = sys.stdin.read().strip()
|
|
500
|
+
return text if text else None
|
|
501
|
+
return None
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
# Review-type to resource file mapping
|
|
505
|
+
_CODE_REVIEW_RESOURCES = {
|
|
506
|
+
"full": "codereview.md",
|
|
507
|
+
"security": "codereview-security.md",
|
|
508
|
+
"performance": "codereview-performance.md",
|
|
509
|
+
"quick": "codereview-quick.md",
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
_DOC_REVIEW_RESOURCES = {
|
|
513
|
+
"full": "docreview.md",
|
|
514
|
+
"quick": "docreview-quick.md",
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
def _load_review_resource_name(code_mode: bool, review_type: str) -> str:
|
|
519
|
+
"""Map code_mode + review_type to a resource file name.
|
|
520
|
+
|
|
521
|
+
Falls back to the full resource if the variant doesn't exist.
|
|
522
|
+
"""
|
|
523
|
+
resources = _CODE_REVIEW_RESOURCES if code_mode else _DOC_REVIEW_RESOURCES
|
|
524
|
+
return resources.get(review_type, resources["full"])
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _parse_roles(roles_str: str) -> list[str]:
|
|
528
|
+
"""Parse and validate comma-separated role names.
|
|
529
|
+
|
|
530
|
+
Raises ValueError for unknown or empty roles.
|
|
531
|
+
"""
|
|
532
|
+
roles = [r.strip() for r in roles_str.split(",") if r.strip()]
|
|
533
|
+
if not roles:
|
|
534
|
+
raise ValueError("No roles specified. Provide comma-separated role names.")
|
|
535
|
+
invalid = [r for r in roles if r not in NAMED_ROLES]
|
|
536
|
+
if invalid:
|
|
537
|
+
available = sorted(NAMED_ROLES.keys())
|
|
538
|
+
raise ValueError(f"Unknown roles: {invalid}. Available: {available}")
|
|
539
|
+
return roles
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def _apply_panel_roles(
    specs: list[ModelSpec],
    roles: list[str],
    base_prompt: str,
) -> list[ModelSpec]:
    """Return copies of ``specs`` carrying a role-prefixed prompt and worker id.

    Spec ``i`` receives role ``roles[i % len(roles)]``, so roles repeat when
    there are more specs than roles.  The worker prompt is a ``[ROLE: name]``
    header, the role text from ``NAMED_ROLES``, then ``base_prompt``.  The
    worker id is ``"<spec.name>-<role>"``; a repeat of the same pair gets a
    ``-1``, ``-2``, ... suffix.  Copies are made with
    ``dataclasses.replace`` and the input specs are not modified.
    """
    import dataclasses

    # How many times each "<name>-<role>" stem has been handed out so far.
    usage: dict[str, int] = {}
    assigned: list[ModelSpec] = []
    for index, spec in enumerate(specs):
        role_name = roles[index % len(roles)]
        role_text = NAMED_ROLES[role_name]
        prompt = f"[ROLE: {role_name}]\n{role_text}\n\n{base_prompt}"

        stem = f"{spec.name}-{role_name}"
        prior = usage.get(stem, 0)
        usage[stem] = prior + 1
        # The first use keeps the bare stem; later uses append their ordinal.
        worker_id = f"{stem}-{prior}" if prior else stem

        assigned.append(dataclasses.replace(spec, prompt=prompt, worker_id=worker_id))
    return assigned
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def _evaluate_verdicts(results: list[ReviewResult]) -> tuple[bool, str]:
    """Decide the --check gate from worker results, failing closed.

    The gate passes only when every worker succeeded and its stdout yields a
    JSON object with a recognised verdict field.  Fields are consulted in the
    order ``passed``, ``verdict``, ``position``; the first one present decides
    that worker.  ``passed`` goes through ``_coerce_passed``; the other two
    are upper-cased and looked up in ``_ACCEPTING_VERDICTS``.  A failed
    worker, unparseable output, or JSON with none of these fields counts as a
    failure.  An empty result list also fails.

    Returns:
        (passed, reason) where reason is a diagnostic string for the check
        JSON.  On failure it describes the first failing worker.
    """
    from forge.core.reactive.structured_output import extract_json_from_response

    if not results:
        return False, "no results"

    outcomes: list[tuple[bool, str]] = []
    for result in results:
        if not result.success:
            outcomes.append((False, f"worker {result.model_name} failed"))
            continue

        payload = extract_json_from_response(result.stdout)
        if not isinstance(payload, dict):
            outcomes.append((False, f"worker {result.model_name} emitted no verdict"))
            continue

        if "passed" in payload:
            accepted = _coerce_passed(payload["passed"])
        else:
            # "verdict" takes precedence over "position" when both are present.
            for field in ("verdict", "position"):
                if field in payload:
                    accepted = str(payload[field]).upper() in _ACCEPTING_VERDICTS
                    break
            else:
                outcomes.append(
                    (
                        False,
                        f"worker {result.model_name} emitted JSON without verdict fields",
                    )
                )
                continue

        label = "accepted" if accepted else "rejected"
        outcomes.append((accepted, f"worker {result.model_name} {label}"))

    # Report the first failure in worker order; no failure means the gate passes.
    first_failure = next((why for ok, why in outcomes if not ok), None)
    if first_failure is None:
        return True, f"all {len(outcomes)} verdicts accepting"
    return False, first_failure
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
# Positions that count as agreement for the consensus --check gate.
_CONSENSUS_ACCEPTING = frozenset({"SUPPORT", "SUPPORT_WITH_CONDITIONS"})


def _evaluate_consensus_positions(results: list[ReviewResult]) -> tuple[bool, str]:
    """Decide the consensus --check gate from worker results.

    Only the ``position`` field is read here; ``passed`` and ``verdict`` are
    ignored.  The gate passes only when every worker succeeded and reported a
    position that, upper-cased, is SUPPORT or SUPPORT_WITH_CONDITIONS.  A
    failed worker, unparseable output, or JSON without ``position`` counts as
    a failure.  An empty result list also fails.

    Returns:
        (passed, reason) where reason is a diagnostic string for the check
        JSON.  On failure it describes the first failing worker.
    """
    from forge.core.reactive.structured_output import extract_json_from_response

    if not results:
        return False, "no results"

    outcomes: list[tuple[bool, str]] = []
    for result in results:
        if not result.success:
            outcome = (False, f"worker {result.model_name} failed")
        else:
            payload = extract_json_from_response(result.stdout)
            if not isinstance(payload, dict):
                outcome = (False, f"worker {result.model_name} emitted no position")
            elif "position" not in payload:
                outcome = (False, f"worker {result.model_name} emitted JSON without position field")
            else:
                supports = str(payload["position"]).upper() in _CONSENSUS_ACCEPTING
                label = "supporting" if supports else "opposing"
                outcome = (supports, f"worker {result.model_name} {label}")
        outcomes.append(outcome)

    # Report the first failure in worker order; no failure means the gate passes.
    first_failure = next((why for ok, why in outcomes if not ok), None)
    if first_failure is None:
        return True, f"all {len(outcomes)} positions supporting"
    return False, first_failure
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def _build_check_json(
    output: MultiReviewOutput,
    passed: bool,
    reason: str,
    resolved_models: dict[str, dict[str, Any]] | None = None,
    routing_warnings: list[str] | None = None,
) -> dict[str, Any]:
    """Build the --check JSON payload: the review dict plus gating fields.

    Starts from ``build_json_dict(output)`` and adds ``passed``,
    ``check_mode`` (always ``"verdict"``) and ``reason``.
    ``resolved_models`` and ``routing_warnings`` are added only when they are
    non-empty.
    """
    from forge.review.synthesis import build_json_dict

    payload = build_json_dict(output)
    payload.update(passed=passed, check_mode="verdict", reason=reason)

    # Optional sections are omitted entirely when absent or empty.
    optional_sections = (
        ("resolved_models", resolved_models),
        ("routing_warnings", routing_warnings),
    )
    for key, value in optional_sections:
        if value:
            payload[key] = value
    return payload
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def _handle_review_output(
    ctx: click.Context,
    output: MultiReviewOutput,
    *,
    check_mode: bool,
    json_output: bool,
    resolved_models: dict[str, dict[str, Any]] | None = None,
    routing_warnings: list[str] | None = None,
) -> None:
    """Print a multi-review result in the mode the caller selected.

    * ``check_mode``: print the gate JSON and exit 0 when the gate passed,
      1 otherwise.  This takes precedence over ``json_output``.
    * ``json_output``: print the review as JSON, adding ``resolved_models``
      and ``routing_warnings`` when they are non-empty.
    * otherwise: print the resolved-models text followed by the synthesis
      prompt.
    """
    from forge.review.synthesis import build_json_dict, format_synthesis_prompt

    if check_mode:
        passed, reason = _evaluate_verdicts(output.results)
        report = _build_check_json(
            output,
            passed,
            reason,
            resolved_models=resolved_models,
            routing_warnings=routing_warnings,
        )
        click.echo(json.dumps(report, indent=2))
        ctx.exit(0 if passed else 1)
        return

    if not json_output:
        header = _format_resolved_models(resolved_models or {})
        click.echo(header + format_synthesis_prompt(output))
        return

    report = build_json_dict(output)
    if resolved_models:
        report["resolved_models"] = resolved_models
    if routing_warnings:
        report["routing_warnings"] = routing_warnings
    click.echo(json.dumps(report, indent=2))
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
# --- Analyze subcommand ---
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
@workflow_cmd.command(name="analyze")
@click.argument("topic", nargs=-1)
@click.option(
    "-p",
    "--prompt",
    "prompt_text",
    type=str,
    default=None,
    help="Topic to analyze (alternative to positional)",
)
@click.option(
    "--models",
    "-m",
    type=str,
    default="claude-opus",
    help="Comma-separated model names (default: claude-opus)",
)
@click.option("--timeout", "-t", type=int, default=600, help="Per-model timeout in seconds")
@click.option("--json", "json_output", is_flag=True, help="Output structured JSON")
@click.option(
    "--check",
    "check_mode",
    is_flag=True,
    help="Gate on verdict: exit 0 if passed, exit 1 if failed",
)
@click.option("--proxy", "via", type=str, default=None, help="Route proxy-backed workers through this proxy")
@click.option("--cwd", type=click.Path(exists=True), default=None, help="Working directory")
@click.pass_context
def analyze(
    ctx: click.Context,
    topic: tuple[str, ...],
    prompt_text: str | None,
    models: str,
    timeout: int,
    json_output: bool,
    check_mode: bool,
    via: str | None,
    cwd: str | None,
) -> None:
    """Deep structured analysis on a topic (single-model).

    \b
    Examples:
      forge workflow analyze "Should we use event sourcing?"
      forge workflow analyze -p "Evaluate migration strategy" --json
      forge workflow analyze "Architecture review" --check
    """
    # Positional words take precedence; -p is used only when no positional
    # topic was given.
    resolved_topic = " ".join(topic) if topic else prompt_text
    if not resolved_topic:
        console.print("[red]Error:[/red] No topic provided. Pass as argument or use -p.")
        ctx.exit(2)
        return

    # A model spec that cannot be resolved is reported and exits with code 2.
    try:
        specs = resolve_model_specs(models)
    except ValueError as e:
        console.print(f"[red]Error:[/red] {e}")
        ctx.exit(2)
        return

    # The prompt is the thinkdeep framework resource followed by the topic.
    framework = _load_workflow_resource("thinkdeep.md")
    combined_prompt = f"{framework}\n\n---\n\n## Topic to Analyze\n\n{resolved_topic}\n"

    # These imports are deferred until after argument validation.
    from forge.core.reactive.cost_tracking import (
        resolve_proxy_urls_from_plan,
        track_verb_cost,
    )
    from forge.review.engine import run_multi_review
    from forge.review.routing import resolve_invocation_routing

    # Routing failures are delegated to the shared handler.
    try:
        routing_plan = resolve_invocation_routing(specs, via=via)
    except _ROUTING_ERRORS as e:
        _handle_routing_error(e, json_output=json_output)
        return

    _run_preflight(specs, json_output=json_output, routing_plan=routing_plan)

    # The review runs inside the cost-tracking context for the "analyze" verb.
    with track_verb_cost("analyze", resolve_proxy_urls_from_plan(routing_plan)):
        output = run_multi_review(
            combined_prompt,
            models=specs,
            routing_plan=routing_plan,
            timeout_seconds=timeout,
            # Without --cwd, the process's current directory is used.
            cwd=cwd or str(Path.cwd()),
        )

    # Output and --check exit codes are handled by the shared output handler.
    _handle_review_output(
        ctx,
        output,
        check_mode=check_mode,
        json_output=json_output,
        resolved_models=_resolved_models_summary(specs, routing_plan),
        routing_warnings=_routing_plan_warnings(specs, routing_plan),
    )
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
# --- Debate subcommand ---
|
|
841
|
+
|
|
842
|
+
# Stance text for proposal evaluation, keyed by stance name
# ("for", "against", "neutral").
_DEFAULT_PROPOSAL_STANCE_PROMPTS = {
    "for": (
        "You are evaluating this proposal as a SUPPORTER. "
        "Identify strengths, viable implementation paths, and reasons to proceed. "
        "Acknowledge genuine weaknesses but focus on how they can be addressed."
    ),
    "against": (
        "You are evaluating this proposal as a CRITIC. "
        "Attack on these specific vectors: "
        "(1) correctness -- are there logical gaps, incorrect assumptions, or unstated prerequisites? "
        "(2) feasibility -- can this actually be done with the stated constraints and resources? "
        "(3) internal contradictions -- does the proposal contradict itself across sections? "
        "(4) unstated assumptions -- what is being taken for granted without evidence? "
        "(5) alternatives -- are there simpler or better-established approaches being ignored? "
        "Acknowledge genuine strengths but focus relentlessly on potential problems."
    ),
    "neutral": (
        "You are evaluating this proposal as a NEUTRAL ANALYST. "
        "Weigh strengths against weaknesses objectively. "
        "Provide a balanced assessment without advocating for or against."
    ),
}

# Stance text for code evaluation; same keys as the proposal table above.
_DEFAULT_CODE_STANCE_PROMPTS = {
    "for": (
        "You are evaluating this code as a SUPPORTER. "
        "Identify good design, correct implementations, and production readiness. "
        "Acknowledge genuine issues but focus on what works well and why."
    ),
    "against": (
        "You are evaluating this code as a CRITIC. "
        "Attack on these specific vectors: "
        "(1) correctness -- logic errors, edge cases, off-by-one, null handling? "
        "(2) security -- injection, validation gaps, secrets, auth boundaries? "
        "(3) performance -- unnecessary allocations, N+1 patterns, blocking in async? "
        "(4) architecture -- coupling violations, wrong abstraction level, unstable contracts? "
        "(5) test coverage -- are critical paths tested? are failure modes covered? "
        "Acknowledge genuine strengths but focus relentlessly on potential problems."
    ),
    "neutral": (
        "You are evaluating this code as a NEUTRAL ANALYST. "
        "Weigh quality, security, performance, and architecture objectively. "
        "Provide a balanced assessment with specific file:line evidence."
    ),
}

# Stance names in assignment order.
# NOTE(review): presumably the order in which stances are handed to models
# when no explicit --worker mapping is given -- confirm against _build_stances.
_STANCE_CYCLE = ["for", "against", "neutral"]
|
|
889
|
+
|
|
890
|
+
# Debate evaluation template (canonical copy in src/skills/debate/resources/debate_evaluation.md).
|
|
891
|
+
# Embedded here so the CLI doesn't depend on skill installation.
|
|
892
|
+
_DEBATE_EVALUATION_TEMPLATE = """\
|
|
893
|
+
# Structured Evaluation
|
|
894
|
+
|
|
895
|
+
```xml
|
|
896
|
+
<role>
|
|
897
|
+
You are a technical evaluator performing a structured assessment.
|
|
898
|
+
{stance_prompt}
|
|
899
|
+
</role>
|
|
900
|
+
|
|
901
|
+
<behavior>
|
|
902
|
+
- Evaluate strictly on technical merits
|
|
903
|
+
- Support every claim with evidence or reasoning
|
|
904
|
+
- Be specific: cite exact trade-offs, not vague concerns
|
|
905
|
+
- Provide a clear verdict with confidence level
|
|
906
|
+
</behavior>
|
|
907
|
+
```
|
|
908
|
+
|
|
909
|
+
---
|
|
910
|
+
|
|
911
|
+
## Proposal Under Evaluation
|
|
912
|
+
|
|
913
|
+
{proposal}
|
|
914
|
+
|
|
915
|
+
---
|
|
916
|
+
|
|
917
|
+
## Evaluation Framework
|
|
918
|
+
|
|
919
|
+
### 1. Feasibility
|
|
920
|
+
|
|
921
|
+
- Can this be implemented with the available technology and resources?
|
|
922
|
+
- What are the key technical dependencies?
|
|
923
|
+
- Are there proven precedents or is this novel?
|
|
924
|
+
|
|
925
|
+
### 2. Correctness
|
|
926
|
+
|
|
927
|
+
- Does the proposal solve the stated problem?
|
|
928
|
+
- Are there logical gaps or incorrect assumptions?
|
|
929
|
+
- Does it handle edge cases and failure modes?
|
|
930
|
+
|
|
931
|
+
### 3. Trade-offs
|
|
932
|
+
|
|
933
|
+
- What does this approach gain vs alternatives?
|
|
934
|
+
- What does it cost (complexity, performance, maintenance)?
|
|
935
|
+
- Are the trade-offs appropriate for the context?
|
|
936
|
+
|
|
937
|
+
### 4. Risks
|
|
938
|
+
|
|
939
|
+
- What could go wrong in implementation?
|
|
940
|
+
- What could go wrong in production?
|
|
941
|
+
- What is the blast radius of failure?
|
|
942
|
+
|
|
943
|
+
### 5. Completeness
|
|
944
|
+
|
|
945
|
+
- Are all requirements addressed?
|
|
946
|
+
- Are there missing considerations?
|
|
947
|
+
- What would need to be added before this is production-ready?
|
|
948
|
+
|
|
949
|
+
### 6. Alternatives
|
|
950
|
+
|
|
951
|
+
- What other approaches could solve this problem?
|
|
952
|
+
- Why might they be better or worse?
|
|
953
|
+
|
|
954
|
+
### 7. Recommendation
|
|
955
|
+
|
|
956
|
+
- Overall verdict: ACCEPT, ACCEPT_WITH_CONDITIONS, or REJECT
|
|
957
|
+
- Confidence level: LOW, MEDIUM, HIGH
|
|
958
|
+
- Key conditions (if ACCEPT_WITH_CONDITIONS)
|
|
959
|
+
|
|
960
|
+
---
|
|
961
|
+
|
|
962
|
+
## Output Format
|
|
963
|
+
|
|
964
|
+
````xml
|
|
965
|
+
<output_format>
|
|
966
|
+
Respond with a structured evaluation in JSON:
|
|
967
|
+
|
|
968
|
+
{
|
|
969
|
+
"verdict": "ACCEPT" | "ACCEPT_WITH_CONDITIONS" | "REJECT",
|
|
970
|
+
"confidence": "LOW" | "MEDIUM" | "HIGH",
|
|
971
|
+
"key_findings": [
|
|
972
|
+
{"category": "feasibility|correctness|trade-offs|risks|completeness",
|
|
973
|
+
"finding": "specific finding",
|
|
974
|
+
"severity": "critical|high|medium|low"}
|
|
975
|
+
],
|
|
976
|
+
"recommendation": "1-2 sentence summary of your recommendation",
|
|
977
|
+
"conditions": ["condition 1", "condition 2"]
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
Wrap the JSON in a ```json code fence.
|
|
981
|
+
</output_format>
|
|
982
|
+
````
|
|
983
|
+
"""
|
|
984
|
+
|
|
985
|
+
# Code debate evaluation template (canonical copy in src/skills/debate/resources/code_debate_evaluation.md).
|
|
986
|
+
# Embedded here so the CLI doesn't depend on skill installation.
|
|
987
|
+
_CODE_DEBATE_EVALUATION_TEMPLATE = """\
|
|
988
|
+
# Adversarial Code Evaluation
|
|
989
|
+
|
|
990
|
+
```xml
|
|
991
|
+
<role>
|
|
992
|
+
You are a senior code evaluator performing a structured adversarial assessment.
|
|
993
|
+
{stance_prompt}
|
|
994
|
+
You identify bugs, design issues, security concerns, and performance problems.
|
|
995
|
+
You provide actionable feedback with specific code references.
|
|
996
|
+
</role>
|
|
997
|
+
|
|
998
|
+
<behavior>
|
|
999
|
+
- Read all code in scope before forming opinions
|
|
1000
|
+
- Cite specific file:line references for every finding
|
|
1001
|
+
- Evaluate strictly on technical merits
|
|
1002
|
+
- Support every claim with evidence or reasoning
|
|
1003
|
+
- Cover ALL files in ONE pass -- do not present partial results
|
|
1004
|
+
- Be specific: "potential null dereference at auth.py:45" not "might have issues"
|
|
1005
|
+
- Provide a clear verdict with confidence level
|
|
1006
|
+
</behavior>
|
|
1007
|
+
|
|
1008
|
+
<scope_constraints>
|
|
1009
|
+
- Review only what's in scope
|
|
1010
|
+
- Do not expand to adjacent code unless directly affected
|
|
1011
|
+
- If tests exist for reviewed code, check them for coverage gaps
|
|
1012
|
+
</scope_constraints>
|
|
1013
|
+
```
|
|
1014
|
+
|
|
1015
|
+
---
|
|
1016
|
+
|
|
1017
|
+
## Code Under Evaluation
|
|
1018
|
+
|
|
1019
|
+
{target}
|
|
1020
|
+
|
|
1021
|
+
---
|
|
1022
|
+
|
|
1023
|
+
## Evaluation Framework
|
|
1024
|
+
|
|
1025
|
+
### 1. Quality
|
|
1026
|
+
|
|
1027
|
+
- Logic errors and edge cases
|
|
1028
|
+
- Error handling: are errors caught, propagated, and surfaced correctly?
|
|
1029
|
+
- Type safety: do type annotations match runtime behavior?
|
|
1030
|
+
- Test coverage: are critical paths tested?
|
|
1031
|
+
|
|
1032
|
+
### 2. Security
|
|
1033
|
+
|
|
1034
|
+
- Input validation at trust boundaries
|
|
1035
|
+
- Injection vectors (command, SQL, path traversal)
|
|
1036
|
+
- Secrets in code or logs
|
|
1037
|
+
- Authentication and authorization gaps
|
|
1038
|
+
|
|
1039
|
+
### 3. Performance
|
|
1040
|
+
|
|
1041
|
+
- Unnecessary allocations or copies in hot paths
|
|
1042
|
+
- N+1 query patterns
|
|
1043
|
+
- Missing caching where data is reused
|
|
1044
|
+
- Blocking calls in async contexts
|
|
1045
|
+
|
|
1046
|
+
### 4. Architecture
|
|
1047
|
+
|
|
1048
|
+
- Component boundaries: is coupling appropriate?
|
|
1049
|
+
- Dependency direction: do imports flow the right way?
|
|
1050
|
+
- Abstraction level: is complexity in the right place?
|
|
1051
|
+
- Interface contracts: are public APIs stable and well-defined?
|
|
1052
|
+
|
|
1053
|
+
### 5. Risks
|
|
1054
|
+
|
|
1055
|
+
- What could go wrong in production?
|
|
1056
|
+
- What is the blast radius of failure?
|
|
1057
|
+
- Missing error recovery or graceful degradation?
|
|
1058
|
+
- Deployment or migration risks?
|
|
1059
|
+
|
|
1060
|
+
### 6. Recommendation
|
|
1061
|
+
|
|
1062
|
+
- Overall verdict: ACCEPT, ACCEPT_WITH_CONDITIONS, or REJECT
|
|
1063
|
+
- Confidence level: LOW, MEDIUM, HIGH
|
|
1064
|
+
- Key conditions (if ACCEPT_WITH_CONDITIONS)
|
|
1065
|
+
|
|
1066
|
+
---
|
|
1067
|
+
|
|
1068
|
+
## Output Format
|
|
1069
|
+
|
|
1070
|
+
````xml
|
|
1071
|
+
<output_format>
|
|
1072
|
+
Respond with a structured evaluation in JSON:
|
|
1073
|
+
|
|
1074
|
+
{
|
|
1075
|
+
"verdict": "ACCEPT" | "ACCEPT_WITH_CONDITIONS" | "REJECT",
|
|
1076
|
+
"confidence": "LOW" | "MEDIUM" | "HIGH",
|
|
1077
|
+
"key_findings": [
|
|
1078
|
+
{"category": "quality|security|performance|architecture|risks",
|
|
1079
|
+
"finding": "specific finding with file:line reference",
|
|
1080
|
+
"severity": "critical|high|medium|low"}
|
|
1081
|
+
],
|
|
1082
|
+
"recommendation": "1-2 sentence summary of your recommendation",
|
|
1083
|
+
"conditions": ["condition 1", "condition 2"]
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
Wrap the JSON in a ```json code fence.
|
|
1087
|
+
</output_format>
|
|
1088
|
+
````
|
|
1089
|
+
"""
|
|
1090
|
+
|
|
1091
|
+
|
|
1092
|
+
def _resolve_debate_prompt(
    subject: tuple[str, ...],
    prompt: str | None,
    code_mode: bool,
) -> str | None:
    """Build the evaluation resource text for the debate command.

    The subject text comes from the first non-empty source among: the ``-p``
    prompt, the positional words joined with spaces, and stdin (read only
    when stdin is not a terminal).  Whatever the source, the text is placed
    into an evaluation template: ``{target}`` of the code template when
    ``code_mode`` is set, ``{proposal}`` of the proposal template otherwise.
    The template's ``{stance_prompt}`` placeholder is left in place.

    Returns:
        The filled template, or None when no subject text was found.
    """
    text = prompt
    if not text and subject:
        text = " ".join(subject)
    if not text and not sys.stdin.isatty():
        text = sys.stdin.read().strip() or None

    if not text:
        return None

    if code_mode:
        template, placeholder = _CODE_DEBATE_EVALUATION_TEMPLATE, "{target}"
    else:
        template, placeholder = _DEBATE_EVALUATION_TEMPLATE, "{proposal}"
    return template.replace(placeholder, text)
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
@workflow_cmd.command(name="debate")
|
|
1115
|
+
@click.argument("subject", nargs=-1)
|
|
1116
|
+
@click.option(
|
|
1117
|
+
"-p",
|
|
1118
|
+
"--prompt",
|
|
1119
|
+
"prompt_text",
|
|
1120
|
+
type=str,
|
|
1121
|
+
default=None,
|
|
1122
|
+
help="Subject to evaluate (alternative to positional)",
|
|
1123
|
+
)
|
|
1124
|
+
@click.option(
|
|
1125
|
+
"--code",
|
|
1126
|
+
"code_mode",
|
|
1127
|
+
is_flag=True,
|
|
1128
|
+
help="Use code evaluation framework (default: proposal evaluation)",
|
|
1129
|
+
)
|
|
1130
|
+
@click.option(
|
|
1131
|
+
"--models",
|
|
1132
|
+
"-m",
|
|
1133
|
+
type=str,
|
|
1134
|
+
default=None,
|
|
1135
|
+
help="Comma-separated model names (default: all)",
|
|
1136
|
+
)
|
|
1137
|
+
@click.option("--timeout", "-t", type=int, default=600, help="Per-model timeout in seconds")
|
|
1138
|
+
@click.option("--json", "json_output", is_flag=True, help="Output structured JSON")
|
|
1139
|
+
@click.option("--check", "check_mode", is_flag=True, help="Gate on verdicts: any REJECT exits 1")
|
|
1140
|
+
@click.option(
|
|
1141
|
+
"--worker",
|
|
1142
|
+
"workers",
|
|
1143
|
+
multiple=True,
|
|
1144
|
+
type=str,
|
|
1145
|
+
help='Worker spec: model:stance or model:"custom prompt" (repeatable)',
|
|
1146
|
+
)
|
|
1147
|
+
@click.option("--proxy", "via", type=str, default=None, help="Route proxy-backed workers through this proxy")
|
|
1148
|
+
@click.option("--cwd", type=click.Path(exists=True), default=None, help="Working directory")
|
|
1149
|
+
@click.pass_context
|
|
1150
|
+
def debate(
|
|
1151
|
+
ctx: click.Context,
|
|
1152
|
+
subject: tuple[str, ...],
|
|
1153
|
+
prompt_text: str | None,
|
|
1154
|
+
code_mode: bool,
|
|
1155
|
+
models: str | None,
|
|
1156
|
+
timeout: int,
|
|
1157
|
+
json_output: bool,
|
|
1158
|
+
check_mode: bool,
|
|
1159
|
+
workers: tuple[str, ...],
|
|
1160
|
+
via: str | None,
|
|
1161
|
+
cwd: str | None,
|
|
1162
|
+
) -> None:
|
|
1163
|
+
"""Adversarial evaluation with stance-injected workers.
|
|
1164
|
+
|
|
1165
|
+
Each model receives the evaluation template with its assigned stance prompt
|
|
1166
|
+
injected via {stance_prompt} replacement. Models are assigned stances
|
|
1167
|
+
cyclically: for, against, neutral.
|
|
1168
|
+
|
|
1169
|
+
Use --worker for explicit model:stance mapping or custom prompts.
|
|
1170
|
+
|
|
1171
|
+
Blinding is mandatory -- workers never see conversation context.
|
|
1172
|
+
|
|
1173
|
+
\b
|
|
1174
|
+
Examples:
|
|
1175
|
+
forge workflow debate "Should we use event sourcing?" --json
|
|
1176
|
+
forge workflow debate src/forge/cli/ --code --check
|
|
1177
|
+
forge workflow debate --worker gpt-5.5:for --worker "claude-opus:Focus on security" "proposal"
|
|
1178
|
+
"""
|
|
1179
|
+
from forge.review.adversarial import run_adversarial, validate_resource
|
|
1180
|
+
|
|
1181
|
+
if workers and models:
|
|
1182
|
+
console.print("[red]Error:[/red] --worker and --models are mutually exclusive.")
|
|
1183
|
+
ctx.exit(2)
|
|
1184
|
+
return
|
|
1185
|
+
|
|
1186
|
+
resolved = _resolve_debate_prompt(subject, prompt_text, code_mode)
|
|
1187
|
+
if not resolved:
|
|
1188
|
+
label = "target" if code_mode else "subject"
|
|
1189
|
+
console.print(f"[red]Error:[/red] No {label} provided. Pass as argument or use -p.")
|
|
1190
|
+
ctx.exit(2)
|
|
1191
|
+
return
|
|
1192
|
+
|
|
1193
|
+
# Write filled evaluation resource to a temp file for the adversarial runner
|
|
1194
|
+
tmp_file = None
|
|
1195
|
+
try:
|
|
1196
|
+
tmp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False)
|
|
1197
|
+
tmp_file.write(resolved)
|
|
1198
|
+
tmp_file.close()
|
|
1199
|
+
resource_path = tmp_file.name
|
|
1200
|
+
|
|
1201
|
+
try:
|
|
1202
|
+
validate_resource(resource_path)
|
|
1203
|
+
except ValueError as e:
|
|
1204
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1205
|
+
ctx.exit(2)
|
|
1206
|
+
return
|
|
1207
|
+
|
|
1208
|
+
if workers:
|
|
1209
|
+
try:
|
|
1210
|
+
stances = _parse_worker_specs(workers, code_mode=code_mode)
|
|
1211
|
+
except ValueError as e:
|
|
1212
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1213
|
+
ctx.exit(2)
|
|
1214
|
+
return
|
|
1215
|
+
else:
|
|
1216
|
+
try:
|
|
1217
|
+
specs = resolve_model_specs(models)
|
|
1218
|
+
except ValueError as e:
|
|
1219
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1220
|
+
ctx.exit(2)
|
|
1221
|
+
return
|
|
1222
|
+
stances = _build_stances(specs, code_mode=code_mode)
|
|
1223
|
+
|
|
1224
|
+
from forge.core.reactive.cost_tracking import (
|
|
1225
|
+
resolve_proxy_urls_from_plan,
|
|
1226
|
+
track_verb_cost,
|
|
1227
|
+
)
|
|
1228
|
+
from forge.review.routing import resolve_invocation_routing
|
|
1229
|
+
|
|
1230
|
+
stance_models = [s.model for s in stances]
|
|
1231
|
+
try:
|
|
1232
|
+
routing_plan = resolve_invocation_routing(stance_models, via=via)
|
|
1233
|
+
except _ROUTING_ERRORS as e:
|
|
1234
|
+
_handle_routing_error(e, json_output=json_output)
|
|
1235
|
+
return
|
|
1236
|
+
|
|
1237
|
+
_run_preflight(stance_models, json_output=json_output, routing_plan=routing_plan)
|
|
1238
|
+
|
|
1239
|
+
with track_verb_cost("debate", resolve_proxy_urls_from_plan(routing_plan)):
|
|
1240
|
+
output = run_adversarial(
|
|
1241
|
+
resource_path,
|
|
1242
|
+
stances,
|
|
1243
|
+
timeout_seconds=timeout,
|
|
1244
|
+
cwd=cwd or str(Path.cwd()),
|
|
1245
|
+
routing_plan=routing_plan,
|
|
1246
|
+
)
|
|
1247
|
+
finally:
|
|
1248
|
+
if tmp_file is not None:
|
|
1249
|
+
Path(tmp_file.name).unlink(missing_ok=True)
|
|
1250
|
+
|
|
1251
|
+
debate_warnings = _routing_plan_warnings(stance_models, routing_plan)
|
|
1252
|
+
debate_resolved_models = _resolved_models_summary(
|
|
1253
|
+
stance_models,
|
|
1254
|
+
routing_plan,
|
|
1255
|
+
worker_ids=[result.model_name for result in output.results],
|
|
1256
|
+
roles=output.stance_map,
|
|
1257
|
+
role_field="stance",
|
|
1258
|
+
)
|
|
1259
|
+
|
|
1260
|
+
if check_mode:
|
|
1261
|
+
passed, reason = _evaluate_verdicts(output.results)
|
|
1262
|
+
data = _build_adversarial_json(
|
|
1263
|
+
output,
|
|
1264
|
+
passed=passed,
|
|
1265
|
+
check_mode_str="verdict",
|
|
1266
|
+
reason=reason,
|
|
1267
|
+
resolved_models=debate_resolved_models,
|
|
1268
|
+
routing_warnings=debate_warnings,
|
|
1269
|
+
)
|
|
1270
|
+
click.echo(json.dumps(data, indent=2))
|
|
1271
|
+
ctx.exit(0 if passed else 1)
|
|
1272
|
+
return
|
|
1273
|
+
|
|
1274
|
+
if json_output:
|
|
1275
|
+
data = _build_adversarial_json(
|
|
1276
|
+
output,
|
|
1277
|
+
resolved_models=debate_resolved_models,
|
|
1278
|
+
routing_warnings=debate_warnings,
|
|
1279
|
+
)
|
|
1280
|
+
click.echo(json.dumps(data, indent=2))
|
|
1281
|
+
else:
|
|
1282
|
+
_print_debate_text(output, debate_resolved_models)
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
def _build_stances(specs: list[ModelSpec], *, code_mode: bool = False) -> list[StanceSpec]:
    """Give every model spec a stock stance, rotating through ``_STANCE_CYCLE``.

    The spec at position ``i`` receives ``_STANCE_CYCLE[i % len(_STANCE_CYCLE)]``.
    The matching stance prompt is looked up in the code prompt table when
    ``code_mode`` is true and in the proposal prompt table otherwise.
    """
    if code_mode:
        prompt_table = _DEFAULT_CODE_STANCE_PROMPTS
    else:
        prompt_table = _DEFAULT_PROPOSAL_STANCE_PROMPTS

    def _assign(position, model_spec):
        # Wrap around so any number of models can share the fixed stance list.
        name = _STANCE_CYCLE[position % len(_STANCE_CYCLE)]
        return StanceSpec(
            stance=name,
            stance_prompt=prompt_table[name],
            model=model_spec,
        )

    return [_assign(position, model_spec) for position, model_spec in enumerate(specs)]
|
|
1299
|
+
|
|
1300
|
+
|
|
1301
|
+
def _parse_worker_specs(worker_args: tuple[str, ...] | list[str], *, code_mode: bool = False) -> list[StanceSpec]:
    """Parse --worker arguments into StanceSpec list.

    Formats:
        model:stance        — stock stance (for/against/neutral)
        model:custom text   — custom prompt (anything not a known stance)

    Shells strip quotes before Click sees them, so ``model:"Focus on X"``
    arrives as ``model:Focus on X``. The parser treats any RHS that is not
    a known stance name as a custom prompt — no quote detection needed.

    Args:
        worker_args: Raw ``--worker`` values, one ``model:rhs`` string each.
        code_mode: Look stances up in the code prompt table instead of the
            proposal prompt table.

    Returns:
        One StanceSpec per argument, in input order. Custom prompts get
        ``stance="custom"`` and a display label cut to 30 characters.

    Raises:
        ValueError: Missing colon, unknown model, or an empty / whitespace-only
            stance or prompt.
    """
    from forge.review.models import AVAILABLE_MODELS

    prompts = _DEFAULT_CODE_STANCE_PROMPTS if code_mode else _DEFAULT_PROPOSAL_STANCE_PROMPTS
    stances: list[StanceSpec] = []
    for arg in worker_args:
        if ":" not in arg:
            raise ValueError(f"Invalid --worker '{arg}'. Expected model:stance or model:custom prompt.")

        # Split on the first colon only: a custom prompt may itself contain colons.
        model_name, rest = arg.split(":", 1)
        model_name = model_name.strip()

        if model_name not in AVAILABLE_MODELS:
            available = list(AVAILABLE_MODELS.keys())
            raise ValueError(f"Unknown model '{model_name}'. Available: {available}")

        spec = AVAILABLE_MODELS[model_name]
        rest = rest.strip()

        # Strip optional surrounding quotes (may survive in some shell contexts)
        if len(rest) >= 2 and rest[0] in ('"', "'") and rest[-1] == rest[0]:
            # Strip again after unquoting: otherwise a quoted blank such as
            # model:" " would slip past the emptiness check below and be sent
            # to the worker as a whitespace-only custom prompt.
            rest = rest[1:-1].strip()

        if not rest:
            raise ValueError(f"Empty stance/prompt for model '{model_name}'.")

        if rest in prompts:
            stances.append(
                StanceSpec(
                    stance=rest,
                    stance_prompt=prompts[rest],
                    model=spec,
                )
            )
        else:
            # Anything not a known stance is a custom prompt
            label = rest[:30] + ("..." if len(rest) > 30 else "")
            stances.append(
                StanceSpec(
                    stance="custom",
                    stance_prompt=rest,
                    model=spec,
                    display_label=label,
                )
            )

    return stances
|
|
1360
|
+
|
|
1361
|
+
|
|
1362
|
+
def _build_adversarial_json(
|
|
1363
|
+
output: AdversarialOutput,
|
|
1364
|
+
*,
|
|
1365
|
+
passed: bool | None = None,
|
|
1366
|
+
check_mode_str: str | None = None,
|
|
1367
|
+
reason: str | None = None,
|
|
1368
|
+
resolved_models: dict[str, dict[str, Any]] | None = None,
|
|
1369
|
+
routing_warnings: list[str] | None = None,
|
|
1370
|
+
) -> dict[str, Any]:
|
|
1371
|
+
"""Build JSON output for adversarial evaluation."""
|
|
1372
|
+
data: dict[str, Any] = {
|
|
1373
|
+
"resource_path": "(generated)",
|
|
1374
|
+
"stances": output.stances,
|
|
1375
|
+
"results": {
|
|
1376
|
+
r.model_name: {
|
|
1377
|
+
"stance": output.stance_map.get(r.model_name, "unknown"),
|
|
1378
|
+
"response": r.stdout if r.success else None,
|
|
1379
|
+
"error": r.error,
|
|
1380
|
+
"duration_seconds": round(r.duration_seconds, 2),
|
|
1381
|
+
"success": r.success,
|
|
1382
|
+
}
|
|
1383
|
+
for r in output.results
|
|
1384
|
+
},
|
|
1385
|
+
"successful": output.successful,
|
|
1386
|
+
"failed": output.failed,
|
|
1387
|
+
}
|
|
1388
|
+
if resolved_models:
|
|
1389
|
+
data["resolved_models"] = resolved_models
|
|
1390
|
+
if passed is not None:
|
|
1391
|
+
data["passed"] = passed
|
|
1392
|
+
if check_mode_str is not None:
|
|
1393
|
+
data["check_mode"] = check_mode_str
|
|
1394
|
+
if reason is not None:
|
|
1395
|
+
data["reason"] = reason
|
|
1396
|
+
if routing_warnings:
|
|
1397
|
+
data["routing_warnings"] = routing_warnings
|
|
1398
|
+
return data
|
|
1399
|
+
|
|
1400
|
+
|
|
1401
|
+
def _print_debate_text(output: AdversarialOutput, resolved_models: dict[str, dict[str, Any]] | None = None) -> None:
    """Print adversarial results as human-readable text.

    Emits a title line with the worker count, an optional resolved-models
    summary, then one section per worker result: the worker's stdout on
    success, or its error text on failure.

    Worker stdout and error text come from external models, so they are
    printed with console markup disabled. Otherwise bracketed text in a
    response (``arr[i]``, ``[/dim]``) would be parsed as a style tag and
    either vanish from the output or raise a markup error.
    NOTE(review): this relies on ``console`` being a Rich ``Console`` (the
    markup strings used throughout this function indicate so) — confirm.

    Args:
        output: Adversarial run output; ``results`` and ``stances`` are read.
        resolved_models: Optional per-worker model summary shown under the title.
    """
    console.print(f"\n[bold]Adversarial Evaluation[/bold] ({len(output.results)} workers)\n")
    if resolved_models:
        console.print(_format_resolved_models(resolved_models).rstrip())
        console.print()

    for i, result in enumerate(output.results):
        # Stances are matched to results by position; fall back when the
        # stance list is shorter than the result list.
        stance = output.stances[i] if i < len(output.stances) else "unknown"
        header = f"[cyan]{result.model_name}[/cyan] ([dim]{stance}[/dim])"
        if result.success:
            console.print(f"--- {header} ---")
            # Untrusted model text: print literally, never as markup.
            console.print(result.stdout, markup=False)
            console.print()
        else:
            console.print(f"--- {header} [red]FAILED[/red] ---")
            # Same red rendering as before, but via style= so the error text
            # itself is not interpreted as markup.
            console.print(f"{result.error}\n", style="red", markup=False)
|
|
1418
|
+
|
|
1419
|
+
|
|
1420
|
+
# --- Consensus subcommand ---
|
|
1421
|
+
|
|
1422
|
+
# Default role rotation for proposal (non-code) consensus runs. Roles are handed
# out cyclically by _build_consensus_roles, and each name is used as a key into
# NAMED_ROLES to fetch its role prompt.
_PROPOSAL_ROLE_CYCLE = ["architecture", "security", "correctness"]
# Default role rotation used instead when the consensus run is in code mode.
_CODE_ROLE_CYCLE = ["architecture", "security", "maintainability"]
|
|
1424
|
+
|
|
1425
|
+
_CONSENSUS_EVALUATION_TEMPLATE = """\
|
|
1426
|
+
# Consensus Evaluation
|
|
1427
|
+
|
|
1428
|
+
```xml
|
|
1429
|
+
<role>
|
|
1430
|
+
You are a technical expert participating in a multi-perspective consensus process.
|
|
1431
|
+
{role_prompt}
|
|
1432
|
+
</role>
|
|
1433
|
+
|
|
1434
|
+
<behavior>
|
|
1435
|
+
- Evaluate from your assigned perspective
|
|
1436
|
+
- Support every claim with evidence or reasoning
|
|
1437
|
+
- Be specific about trade-offs and constraints
|
|
1438
|
+
- Identify both strengths and weaknesses from your viewpoint
|
|
1439
|
+
- Provide a clear position with confidence level
|
|
1440
|
+
</behavior>
|
|
1441
|
+
```
|
|
1442
|
+
|
|
1443
|
+
---
|
|
1444
|
+
|
|
1445
|
+
## Subject Under Evaluation
|
|
1446
|
+
|
|
1447
|
+
{subject}
|
|
1448
|
+
|
|
1449
|
+
---
|
|
1450
|
+
|
|
1451
|
+
## Evaluation Framework
|
|
1452
|
+
|
|
1453
|
+
### 1. Assessment from Your Perspective
|
|
1454
|
+
|
|
1455
|
+
- What are the key considerations from your assigned viewpoint?
|
|
1456
|
+
- What risks or opportunities do you see that others might miss?
|
|
1457
|
+
|
|
1458
|
+
### 2. Strengths
|
|
1459
|
+
|
|
1460
|
+
- What aspects of this proposal align well with your area of focus?
|
|
1461
|
+
|
|
1462
|
+
### 3. Concerns
|
|
1463
|
+
|
|
1464
|
+
- What issues or risks do you identify from your perspective?
|
|
1465
|
+
- How severe are they? What is the mitigation path?
|
|
1466
|
+
|
|
1467
|
+
### 4. Recommendation
|
|
1468
|
+
|
|
1469
|
+
- Your position: SUPPORT, SUPPORT_WITH_CONDITIONS, or OPPOSE
|
|
1470
|
+
- Confidence level: LOW, MEDIUM, HIGH
|
|
1471
|
+
- Key conditions (if SUPPORT_WITH_CONDITIONS)
|
|
1472
|
+
|
|
1473
|
+
---
|
|
1474
|
+
|
|
1475
|
+
## Output Format
|
|
1476
|
+
|
|
1477
|
+
````xml
|
|
1478
|
+
<output_format>
|
|
1479
|
+
Respond with your assessment in JSON:
|
|
1480
|
+
|
|
1481
|
+
{
|
|
1482
|
+
"position": "SUPPORT" | "SUPPORT_WITH_CONDITIONS" | "OPPOSE",
|
|
1483
|
+
"confidence": "LOW" | "MEDIUM" | "HIGH",
|
|
1484
|
+
"key_points": [
|
|
1485
|
+
{"category": "strength|concern|risk|opportunity",
|
|
1486
|
+
"point": "specific finding from your perspective",
|
|
1487
|
+
"severity": "critical|high|medium|low"}
|
|
1488
|
+
],
|
|
1489
|
+
"recommendation": "1-2 sentence summary from your perspective",
|
|
1490
|
+
"conditions": ["condition 1", "condition 2"]
|
|
1491
|
+
}
|
|
1492
|
+
|
|
1493
|
+
Wrap the JSON in a ```json code fence.
|
|
1494
|
+
</output_format>
|
|
1495
|
+
````
|
|
1496
|
+
"""
|
|
1497
|
+
|
|
1498
|
+
_CODE_CONSENSUS_EVALUATION_TEMPLATE = """\
|
|
1499
|
+
# Code Consensus Evaluation
|
|
1500
|
+
|
|
1501
|
+
```xml
|
|
1502
|
+
<role>
|
|
1503
|
+
You are a senior code evaluator participating in a multi-perspective consensus process.
|
|
1504
|
+
{role_prompt}
|
|
1505
|
+
You identify issues and opportunities from your assigned perspective.
|
|
1506
|
+
You provide actionable feedback with specific code references.
|
|
1507
|
+
</role>
|
|
1508
|
+
|
|
1509
|
+
<behavior>
|
|
1510
|
+
- Read all code in scope before forming opinions
|
|
1511
|
+
- Cite specific file:line references for every finding
|
|
1512
|
+
- Evaluate from your assigned perspective
|
|
1513
|
+
- Support every claim with evidence or reasoning
|
|
1514
|
+
- Cover ALL files in ONE pass -- do not present partial results
|
|
1515
|
+
- Be specific: "potential null dereference at auth.py:45" not "might have issues"
|
|
1516
|
+
- Provide a clear position with confidence level
|
|
1517
|
+
</behavior>
|
|
1518
|
+
|
|
1519
|
+
<scope_constraints>
|
|
1520
|
+
- Review only what's in scope
|
|
1521
|
+
- Do not expand to adjacent code unless directly affected
|
|
1522
|
+
- If tests exist for reviewed code, check them for coverage gaps
|
|
1523
|
+
</scope_constraints>
|
|
1524
|
+
```
|
|
1525
|
+
|
|
1526
|
+
---
|
|
1527
|
+
|
|
1528
|
+
## Code Under Evaluation
|
|
1529
|
+
|
|
1530
|
+
{target}
|
|
1531
|
+
|
|
1532
|
+
---
|
|
1533
|
+
|
|
1534
|
+
## Evaluation Framework
|
|
1535
|
+
|
|
1536
|
+
### 1. Quality
|
|
1537
|
+
|
|
1538
|
+
- Logic errors and edge cases
|
|
1539
|
+
- Error handling: are errors caught, propagated, and surfaced correctly?
|
|
1540
|
+
- Type safety: do type annotations match runtime behavior?
|
|
1541
|
+
- Test coverage: are critical paths tested?
|
|
1542
|
+
|
|
1543
|
+
### 2. Security
|
|
1544
|
+
|
|
1545
|
+
- Input validation at trust boundaries
|
|
1546
|
+
- Injection vectors (command, SQL, path traversal)
|
|
1547
|
+
- Secrets in code or logs
|
|
1548
|
+
- Authentication and authorization gaps
|
|
1549
|
+
|
|
1550
|
+
### 3. Performance
|
|
1551
|
+
|
|
1552
|
+
- Unnecessary allocations or copies in hot paths
|
|
1553
|
+
- N+1 query patterns
|
|
1554
|
+
- Missing caching where data is reused
|
|
1555
|
+
- Blocking calls in async contexts
|
|
1556
|
+
|
|
1557
|
+
### 4. Architecture
|
|
1558
|
+
|
|
1559
|
+
- Component boundaries: is coupling appropriate?
|
|
1560
|
+
- Dependency direction: do imports flow the right way?
|
|
1561
|
+
- Abstraction level: is complexity in the right place?
|
|
1562
|
+
- Interface contracts: are public APIs stable and well-defined?
|
|
1563
|
+
|
|
1564
|
+
### 5. Recommendation
|
|
1565
|
+
|
|
1566
|
+
- Your position: SUPPORT, SUPPORT_WITH_CONDITIONS, or OPPOSE
|
|
1567
|
+
- Confidence level: LOW, MEDIUM, HIGH
|
|
1568
|
+
- Key conditions (if SUPPORT_WITH_CONDITIONS)
|
|
1569
|
+
|
|
1570
|
+
---
|
|
1571
|
+
|
|
1572
|
+
## Output Format
|
|
1573
|
+
|
|
1574
|
+
````xml
|
|
1575
|
+
<output_format>
|
|
1576
|
+
Respond with your assessment in JSON:
|
|
1577
|
+
|
|
1578
|
+
{
|
|
1579
|
+
"position": "SUPPORT" | "SUPPORT_WITH_CONDITIONS" | "OPPOSE",
|
|
1580
|
+
"confidence": "LOW" | "MEDIUM" | "HIGH",
|
|
1581
|
+
"key_points": [
|
|
1582
|
+
{"category": "quality|security|performance|architecture|maintainability",
|
|
1583
|
+
"point": "specific finding with file:line reference",
|
|
1584
|
+
"severity": "critical|high|medium|low"}
|
|
1585
|
+
],
|
|
1586
|
+
"recommendation": "1-2 sentence summary from your perspective",
|
|
1587
|
+
"conditions": ["condition 1", "condition 2"]
|
|
1588
|
+
}
|
|
1589
|
+
|
|
1590
|
+
Wrap the JSON in a ```json code fence.
|
|
1591
|
+
</output_format>
|
|
1592
|
+
````
|
|
1593
|
+
"""
|
|
1594
|
+
|
|
1595
|
+
|
|
1596
|
+
def _resolve_consensus_prompt(
    subject: tuple[str, ...],
    prompt: str | None,
    code_mode: bool,
) -> str | None:
    """Pick the consensus subject text and embed it in the evaluation template.

    Source precedence: ``prompt`` if non-empty, else the space-joined
    ``subject`` words, else stripped stdin when stdin is not a TTY. Returns
    ``None`` when no text is found. Otherwise the text replaces ``{target}``
    in the code template (``code_mode``) or ``{subject}`` in the proposal
    template; the ``{role_prompt}`` marker in the template is left as is.
    """
    text = prompt
    if not text:
        text = " ".join(subject) if subject else None
    if not text and not sys.stdin.isatty():
        # Piped input is the last resort; blank input counts as "nothing given".
        text = sys.stdin.read().strip() or None

    if not text:
        return None

    # Plain str.replace, not str.format: the templates contain literal JSON braces.
    if code_mode:
        template, marker = _CODE_CONSENSUS_EVALUATION_TEMPLATE, "{target}"
    else:
        template, marker = _CONSENSUS_EVALUATION_TEMPLATE, "{subject}"
    return template.replace(marker, text)
|
|
1612
|
+
|
|
1613
|
+
|
|
1614
|
+
def _build_consensus_roles(
    specs: list[ModelSpec],
    code_mode: bool,
) -> list[RoleSpec]:
    """Give every model spec a named role, rotating through the mode's role cycle.

    ``code_mode`` selects ``_CODE_ROLE_CYCLE``; otherwise ``_PROPOSAL_ROLE_CYCLE``
    is used. The spec at position ``i`` receives ``cycle[i % len(cycle)]`` and
    the matching prompt from ``NAMED_ROLES``.
    """
    if code_mode:
        rotation = _CODE_ROLE_CYCLE
    else:
        rotation = _PROPOSAL_ROLE_CYCLE

    def _assign(position, model_spec):
        # Wrap around so any number of models can share the fixed role list.
        name = rotation[position % len(rotation)]
        return RoleSpec(
            role=name,
            role_prompt=NAMED_ROLES[name],
            model=model_spec,
        )

    return [_assign(position, model_spec) for position, model_spec in enumerate(specs)]
|
|
1631
|
+
|
|
1632
|
+
|
|
1633
|
+
def _parse_consensus_worker_specs(
    worker_args: tuple[str, ...] | list[str],
) -> list[RoleSpec]:
    """Parse --worker arguments into RoleSpec list.

    Formats:
        model:role          -- named role (architecture, security, etc.)
        model:custom text   -- custom role prompt

    Any right-hand side that is not a key of ``NAMED_ROLES`` is treated as a
    custom role prompt.

    Args:
        worker_args: Raw ``--worker`` values, one ``model:rhs`` string each.

    Returns:
        One RoleSpec per argument, in input order. Custom prompts get
        ``role="custom"`` and a display label cut to 30 characters.

    Raises:
        ValueError: Missing colon, unknown model, or an empty / whitespace-only
            role or prompt.
    """
    from forge.review.models import AVAILABLE_MODELS

    role_specs: list[RoleSpec] = []
    for arg in worker_args:
        if ":" not in arg:
            raise ValueError(f"Invalid --worker '{arg}'. Expected model:role or model:custom prompt.")

        # Split on the first colon only: a custom prompt may itself contain colons.
        model_name, rest = arg.split(":", 1)
        model_name = model_name.strip()

        if model_name not in AVAILABLE_MODELS:
            available = list(AVAILABLE_MODELS.keys())
            raise ValueError(f"Unknown model '{model_name}'. Available: {available}")

        spec = AVAILABLE_MODELS[model_name]
        rest = rest.strip()

        # Strip optional surrounding quotes (may survive in some shell contexts)
        if len(rest) >= 2 and rest[0] in ('"', "'") and rest[-1] == rest[0]:
            # Strip again after unquoting: otherwise a quoted blank such as
            # model:" " would slip past the emptiness check below and be sent
            # to the worker as a whitespace-only role prompt.
            rest = rest[1:-1].strip()

        if not rest:
            raise ValueError(f"Empty role/prompt for model '{model_name}'.")

        if rest in NAMED_ROLES:
            role_specs.append(RoleSpec(role=rest, role_prompt=NAMED_ROLES[rest], model=spec))
        else:
            # Anything that is not a named role is a custom role prompt.
            label = rest[:30] + ("..." if len(rest) > 30 else "")
            role_specs.append(
                RoleSpec(
                    role="custom",
                    role_prompt=rest,
                    model=spec,
                    display_label=label,
                )
            )

    return role_specs
|
|
1682
|
+
|
|
1683
|
+
|
|
1684
|
+
def _build_consensus_json(
|
|
1685
|
+
output: ConsensusOutput,
|
|
1686
|
+
*,
|
|
1687
|
+
passed: bool | None = None,
|
|
1688
|
+
check_mode_str: str | None = None,
|
|
1689
|
+
reason: str | None = None,
|
|
1690
|
+
resolved_models: dict[str, dict[str, Any]] | None = None,
|
|
1691
|
+
routing_warnings: list[str] | None = None,
|
|
1692
|
+
) -> dict[str, Any]:
|
|
1693
|
+
"""Build JSON output for consensus workflow."""
|
|
1694
|
+
data: dict[str, Any] = {
|
|
1695
|
+
"subject": output.subject,
|
|
1696
|
+
"roles": output.roles,
|
|
1697
|
+
"role_map": output.role_map,
|
|
1698
|
+
"round1": {
|
|
1699
|
+
r.model_name: {
|
|
1700
|
+
"role": output.role_map.get(r.model_name, "unknown"),
|
|
1701
|
+
"response": r.stdout if r.success else None,
|
|
1702
|
+
"error": r.error,
|
|
1703
|
+
"duration_seconds": round(r.duration_seconds, 2),
|
|
1704
|
+
"success": r.success,
|
|
1705
|
+
}
|
|
1706
|
+
for r in output.round1_results
|
|
1707
|
+
},
|
|
1708
|
+
"round2": {
|
|
1709
|
+
r.model_name: {
|
|
1710
|
+
"role": output.role_map.get(r.model_name, "unknown"),
|
|
1711
|
+
"response": r.stdout if r.success else None,
|
|
1712
|
+
"error": r.error,
|
|
1713
|
+
"duration_seconds": round(r.duration_seconds, 2),
|
|
1714
|
+
"success": r.success,
|
|
1715
|
+
}
|
|
1716
|
+
for r in output.round2_results
|
|
1717
|
+
},
|
|
1718
|
+
"reconciliation_brief": output.reconciliation_brief,
|
|
1719
|
+
"successful": output.successful,
|
|
1720
|
+
"failed": output.failed,
|
|
1721
|
+
}
|
|
1722
|
+
if resolved_models:
|
|
1723
|
+
data["resolved_models"] = resolved_models
|
|
1724
|
+
if passed is not None:
|
|
1725
|
+
data["passed"] = passed
|
|
1726
|
+
if check_mode_str is not None:
|
|
1727
|
+
data["check_mode"] = check_mode_str
|
|
1728
|
+
if reason is not None:
|
|
1729
|
+
data["reason"] = reason
|
|
1730
|
+
if routing_warnings:
|
|
1731
|
+
data["routing_warnings"] = routing_warnings
|
|
1732
|
+
return data
|
|
1733
|
+
|
|
1734
|
+
|
|
1735
|
+
def _print_consensus_text(output: ConsensusOutput, resolved_models: dict[str, dict[str, Any]] | None = None) -> None:
    """Print consensus results as structured human-readable text.

    Layout: title with worker count, optional resolved-models summary, Round 1
    positions (each cut to 500 characters), the reconciliation brief (cut to
    300 characters), full Round 2 responses, and a completion count.

    Worker stdout, error text and the reconciliation brief are printed with
    console markup disabled. Otherwise bracketed text in a model's response
    (``arr[i]``, ``[/dim]``) would be parsed as a style tag and either vanish
    from the output or raise a markup error.
    NOTE(review): this relies on ``console`` being a Rich ``Console`` (the
    markup strings used throughout this function indicate so) — confirm.

    Args:
        output: Consensus run output; ``round1_results``, ``round2_results``,
            ``role_map`` and ``reconciliation_brief`` are read.
        resolved_models: Optional per-worker model summary shown under the title.
    """
    console.print(f"\n[bold]Consensus Workflow[/bold] " f"({len(output.round2_results)} workers, 2 rounds)\n")
    if resolved_models:
        console.print(_format_resolved_models(resolved_models).rstrip())
        console.print()

    # Round 1 positions (truncated)
    console.print("[dim]Round 1: Initial Positions[/dim]\n")
    for result in output.round1_results:
        role = output.role_map.get(result.model_name, "unknown")
        header = f"[cyan]{result.model_name}[/cyan] ([dim]{role}[/dim])"
        if result.success:
            console.print(f"--- {header} ---")
            excerpt = result.stdout[:500]
            if len(result.stdout) > 500:
                excerpt += "..."
            # Untrusted model text: print literally, never as markup.
            console.print(excerpt, markup=False)
            console.print()
        else:
            console.print(f"--- {header} [red]FAILED[/red] ---")
            console.print(f"{result.error}\n", style="red", markup=False)

    # Reconciliation brief (dimmed)
    console.print("[dim]--- Reconciliation Brief ---[/dim]")
    brief = output.reconciliation_brief or ""
    brief_excerpt = brief[:300]
    if len(brief) > 300:
        # Only mark truncation when something was actually cut off, matching
        # how the Round 1 excerpts above are handled.
        brief_excerpt += "..."
    console.print(f"{brief_excerpt}\n", style="dim", markup=False)

    # Round 2 recommendations (full)
    console.print("[dim]Round 2: Reconciliation[/dim]\n")
    for result in output.round2_results:
        role = output.role_map.get(result.model_name, "unknown")
        header = f"[cyan]{result.model_name}[/cyan] ([dim]{role}[/dim])"
        if result.success:
            console.print(f"--- {header} ---")
            console.print(result.stdout, markup=False)
            console.print()
        else:
            console.print(f"--- {header} [red]FAILED[/red] ---")
            console.print(f"{result.error}\n", style="red", markup=False)

    # Status line (execution status only; actual convergence is in the synthesis)
    completed = sum(1 for r in output.round2_results if r.success)
    total = len(output.round2_results)
    console.print(f"[bold]Completed: {completed}/{total} workers finished reconciliation[/bold]")
|
|
1779
|
+
|
|
1780
|
+
|
|
1781
|
+
@workflow_cmd.command(name="consensus")
|
|
1782
|
+
@click.argument("subject", nargs=-1)
|
|
1783
|
+
@click.option(
|
|
1784
|
+
"-p",
|
|
1785
|
+
"--prompt",
|
|
1786
|
+
"prompt_text",
|
|
1787
|
+
type=str,
|
|
1788
|
+
default=None,
|
|
1789
|
+
help="Subject to build consensus on (alternative to positional)",
|
|
1790
|
+
)
|
|
1791
|
+
@click.option(
|
|
1792
|
+
"--code",
|
|
1793
|
+
"code_mode",
|
|
1794
|
+
is_flag=True,
|
|
1795
|
+
help="Use code evaluation framework (default: proposal evaluation)",
|
|
1796
|
+
)
|
|
1797
|
+
@click.option(
|
|
1798
|
+
"--models",
|
|
1799
|
+
"-m",
|
|
1800
|
+
type=str,
|
|
1801
|
+
default=None,
|
|
1802
|
+
help="Comma-separated model names (default: all)",
|
|
1803
|
+
)
|
|
1804
|
+
@click.option(
|
|
1805
|
+
"--timeout",
|
|
1806
|
+
"-t",
|
|
1807
|
+
type=int,
|
|
1808
|
+
default=600,
|
|
1809
|
+
help="Per-round timeout in seconds (total wall time ~2x for two rounds)",
|
|
1810
|
+
)
|
|
1811
|
+
@click.option("--json", "json_output", is_flag=True, help="Output structured JSON")
|
|
1812
|
+
@click.option(
|
|
1813
|
+
"--check",
|
|
1814
|
+
"check_mode",
|
|
1815
|
+
is_flag=True,
|
|
1816
|
+
help="Gate on positions: exit 0 if all supporting, exit 1 otherwise",
|
|
1817
|
+
)
|
|
1818
|
+
@click.option(
|
|
1819
|
+
"--worker",
|
|
1820
|
+
"workers",
|
|
1821
|
+
multiple=True,
|
|
1822
|
+
type=str,
|
|
1823
|
+
help='Worker spec: model:role or model:"custom prompt" (repeatable)',
|
|
1824
|
+
)
|
|
1825
|
+
@click.option("--proxy", "via", type=str, default=None, help="Route proxy-backed workers through this proxy")
|
|
1826
|
+
@click.option("--cwd", type=click.Path(exists=True), default=None, help="Working directory")
|
|
1827
|
+
@click.pass_context
|
|
1828
|
+
def consensus(
|
|
1829
|
+
ctx: click.Context,
|
|
1830
|
+
subject: tuple[str, ...],
|
|
1831
|
+
prompt_text: str | None,
|
|
1832
|
+
code_mode: bool,
|
|
1833
|
+
models: str | None,
|
|
1834
|
+
timeout: int,
|
|
1835
|
+
json_output: bool,
|
|
1836
|
+
check_mode: bool,
|
|
1837
|
+
workers: tuple[str, ...],
|
|
1838
|
+
via: str | None,
|
|
1839
|
+
cwd: str | None,
|
|
1840
|
+
) -> None:
|
|
1841
|
+
"""Two-round consensus building with role-assigned workers.
|
|
1842
|
+
|
|
1843
|
+
Round 1: Each model evaluates the subject from an assigned role
|
|
1844
|
+
(architecture, security, etc.) independently.
|
|
1845
|
+
Round 2: Each model receives all Round 1 positions and produces
|
|
1846
|
+
a reconciled recommendation.
|
|
1847
|
+
|
|
1848
|
+
Default roles: architecture, security, correctness (proposals)
|
|
1849
|
+
or architecture, security, maintainability (code).
|
|
1850
|
+
|
|
1851
|
+
\b
|
|
1852
|
+
Examples:
|
|
1853
|
+
forge workflow consensus "Should we use event sourcing?" --json
|
|
1854
|
+
forge workflow consensus src/forge/cli/ --code --check
|
|
1855
|
+
forge workflow consensus --worker gpt-5.5:security --worker "claude-opus:Focus on DX" "proposal"
|
|
1856
|
+
"""
|
|
1857
|
+
from forge.review.consensus import run_consensus, validate_resource
|
|
1858
|
+
|
|
1859
|
+
if workers and models:
|
|
1860
|
+
console.print("[red]Error:[/red] --worker and --models are mutually exclusive.")
|
|
1861
|
+
ctx.exit(2)
|
|
1862
|
+
return
|
|
1863
|
+
|
|
1864
|
+
# Resolve raw subject once (positional > -p > stdin) to avoid double-read
|
|
1865
|
+
raw_subject = prompt_text or (" ".join(subject) if subject else None)
|
|
1866
|
+
if not raw_subject and not sys.stdin.isatty():
|
|
1867
|
+
raw_subject = sys.stdin.read().strip() or None
|
|
1868
|
+
|
|
1869
|
+
resolved = _resolve_consensus_prompt((), raw_subject, code_mode)
|
|
1870
|
+
if not resolved:
|
|
1871
|
+
label = "target" if code_mode else "subject"
|
|
1872
|
+
console.print(f"[red]Error:[/red] No {label} provided. Pass as argument or use -p.")
|
|
1873
|
+
ctx.exit(2)
|
|
1874
|
+
return
|
|
1875
|
+
|
|
1876
|
+
tmp_file = None
|
|
1877
|
+
try:
|
|
1878
|
+
tmp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False)
|
|
1879
|
+
tmp_file.write(resolved)
|
|
1880
|
+
tmp_file.close()
|
|
1881
|
+
resource_path = tmp_file.name
|
|
1882
|
+
|
|
1883
|
+
try:
|
|
1884
|
+
validate_resource(resource_path)
|
|
1885
|
+
except ValueError as e:
|
|
1886
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1887
|
+
ctx.exit(2)
|
|
1888
|
+
return
|
|
1889
|
+
|
|
1890
|
+
if workers:
|
|
1891
|
+
try:
|
|
1892
|
+
role_specs = _parse_consensus_worker_specs(workers)
|
|
1893
|
+
except ValueError as e:
|
|
1894
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1895
|
+
ctx.exit(2)
|
|
1896
|
+
return
|
|
1897
|
+
else:
|
|
1898
|
+
try:
|
|
1899
|
+
specs = resolve_model_specs(models)
|
|
1900
|
+
except ValueError as e:
|
|
1901
|
+
console.print(f"[red]Error:[/red] {e}")
|
|
1902
|
+
ctx.exit(2)
|
|
1903
|
+
return
|
|
1904
|
+
role_specs = _build_consensus_roles(specs, code_mode)
|
|
1905
|
+
|
|
1906
|
+
from forge.core.reactive.cost_tracking import (
|
|
1907
|
+
resolve_proxy_urls_from_plan,
|
|
1908
|
+
track_verb_cost,
|
|
1909
|
+
)
|
|
1910
|
+
from forge.review.routing import resolve_invocation_routing
|
|
1911
|
+
|
|
1912
|
+
role_models = [r.model for r in role_specs]
|
|
1913
|
+
try:
|
|
1914
|
+
routing_plan = resolve_invocation_routing(role_models, via=via)
|
|
1915
|
+
except _ROUTING_ERRORS as e:
|
|
1916
|
+
_handle_routing_error(e, json_output=json_output)
|
|
1917
|
+
return
|
|
1918
|
+
|
|
1919
|
+
_run_preflight(role_models, json_output=json_output, routing_plan=routing_plan)
|
|
1920
|
+
|
|
1921
|
+
with track_verb_cost("consensus", resolve_proxy_urls_from_plan(routing_plan)):
|
|
1922
|
+
output = run_consensus(
|
|
1923
|
+
resource_path,
|
|
1924
|
+
role_specs,
|
|
1925
|
+
timeout_seconds=timeout,
|
|
1926
|
+
cwd=cwd or str(Path.cwd()),
|
|
1927
|
+
original_subject=raw_subject or "",
|
|
1928
|
+
routing_plan=routing_plan,
|
|
1929
|
+
)
|
|
1930
|
+
finally:
|
|
1931
|
+
if tmp_file is not None:
|
|
1932
|
+
Path(tmp_file.name).unlink(missing_ok=True)
|
|
1933
|
+
|
|
1934
|
+
consensus_warnings = _routing_plan_warnings(role_models, routing_plan)
|
|
1935
|
+
consensus_resolved_models = _resolved_models_summary(
|
|
1936
|
+
role_models,
|
|
1937
|
+
routing_plan,
|
|
1938
|
+
worker_ids=[result.model_name for result in output.round1_results],
|
|
1939
|
+
roles=output.role_map,
|
|
1940
|
+
)
|
|
1941
|
+
|
|
1942
|
+
if check_mode:
|
|
1943
|
+
passed, reason = _evaluate_consensus_positions(output.round2_results)
|
|
1944
|
+
data = _build_consensus_json(
|
|
1945
|
+
output,
|
|
1946
|
+
passed=passed,
|
|
1947
|
+
check_mode_str="position",
|
|
1948
|
+
reason=reason,
|
|
1949
|
+
resolved_models=consensus_resolved_models,
|
|
1950
|
+
routing_warnings=consensus_warnings,
|
|
1951
|
+
)
|
|
1952
|
+
click.echo(json.dumps(data, indent=2))
|
|
1953
|
+
ctx.exit(0 if passed else 1)
|
|
1954
|
+
return
|
|
1955
|
+
|
|
1956
|
+
if json_output:
|
|
1957
|
+
data = _build_consensus_json(
|
|
1958
|
+
output,
|
|
1959
|
+
resolved_models=consensus_resolved_models,
|
|
1960
|
+
routing_warnings=consensus_warnings,
|
|
1961
|
+
)
|
|
1962
|
+
click.echo(json.dumps(data, indent=2))
|
|
1963
|
+
else:
|
|
1964
|
+
_print_consensus_text(output, consensus_resolved_models)
|