emdash_core-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emdash_core/__init__.py +3 -0
- emdash_core/agent/__init__.py +37 -0
- emdash_core/agent/agents.py +225 -0
- emdash_core/agent/code_reviewer.py +476 -0
- emdash_core/agent/compaction.py +143 -0
- emdash_core/agent/context_manager.py +140 -0
- emdash_core/agent/events.py +338 -0
- emdash_core/agent/handlers.py +224 -0
- emdash_core/agent/inprocess_subagent.py +377 -0
- emdash_core/agent/mcp/__init__.py +50 -0
- emdash_core/agent/mcp/client.py +346 -0
- emdash_core/agent/mcp/config.py +302 -0
- emdash_core/agent/mcp/manager.py +496 -0
- emdash_core/agent/mcp/tool_factory.py +213 -0
- emdash_core/agent/prompts/__init__.py +38 -0
- emdash_core/agent/prompts/main_agent.py +104 -0
- emdash_core/agent/prompts/subagents.py +131 -0
- emdash_core/agent/prompts/workflow.py +136 -0
- emdash_core/agent/providers/__init__.py +34 -0
- emdash_core/agent/providers/base.py +143 -0
- emdash_core/agent/providers/factory.py +80 -0
- emdash_core/agent/providers/models.py +220 -0
- emdash_core/agent/providers/openai_provider.py +463 -0
- emdash_core/agent/providers/transformers_provider.py +217 -0
- emdash_core/agent/research/__init__.py +81 -0
- emdash_core/agent/research/agent.py +143 -0
- emdash_core/agent/research/controller.py +254 -0
- emdash_core/agent/research/critic.py +428 -0
- emdash_core/agent/research/macros.py +469 -0
- emdash_core/agent/research/planner.py +449 -0
- emdash_core/agent/research/researcher.py +436 -0
- emdash_core/agent/research/state.py +523 -0
- emdash_core/agent/research/synthesizer.py +594 -0
- emdash_core/agent/reviewer_profile.py +475 -0
- emdash_core/agent/rules.py +123 -0
- emdash_core/agent/runner.py +601 -0
- emdash_core/agent/session.py +262 -0
- emdash_core/agent/spec_schema.py +66 -0
- emdash_core/agent/specification.py +479 -0
- emdash_core/agent/subagent.py +397 -0
- emdash_core/agent/subagent_prompts.py +13 -0
- emdash_core/agent/toolkit.py +482 -0
- emdash_core/agent/toolkits/__init__.py +64 -0
- emdash_core/agent/toolkits/base.py +96 -0
- emdash_core/agent/toolkits/explore.py +47 -0
- emdash_core/agent/toolkits/plan.py +55 -0
- emdash_core/agent/tools/__init__.py +141 -0
- emdash_core/agent/tools/analytics.py +436 -0
- emdash_core/agent/tools/base.py +131 -0
- emdash_core/agent/tools/coding.py +484 -0
- emdash_core/agent/tools/github_mcp.py +592 -0
- emdash_core/agent/tools/history.py +13 -0
- emdash_core/agent/tools/modes.py +153 -0
- emdash_core/agent/tools/plan.py +206 -0
- emdash_core/agent/tools/plan_write.py +135 -0
- emdash_core/agent/tools/search.py +412 -0
- emdash_core/agent/tools/spec.py +341 -0
- emdash_core/agent/tools/task.py +262 -0
- emdash_core/agent/tools/task_output.py +204 -0
- emdash_core/agent/tools/tasks.py +454 -0
- emdash_core/agent/tools/traversal.py +588 -0
- emdash_core/agent/tools/web.py +179 -0
- emdash_core/analytics/__init__.py +5 -0
- emdash_core/analytics/engine.py +1286 -0
- emdash_core/api/__init__.py +5 -0
- emdash_core/api/agent.py +308 -0
- emdash_core/api/agents.py +154 -0
- emdash_core/api/analyze.py +264 -0
- emdash_core/api/auth.py +173 -0
- emdash_core/api/context.py +77 -0
- emdash_core/api/db.py +121 -0
- emdash_core/api/embed.py +131 -0
- emdash_core/api/feature.py +143 -0
- emdash_core/api/health.py +93 -0
- emdash_core/api/index.py +162 -0
- emdash_core/api/plan.py +110 -0
- emdash_core/api/projectmd.py +210 -0
- emdash_core/api/query.py +320 -0
- emdash_core/api/research.py +122 -0
- emdash_core/api/review.py +161 -0
- emdash_core/api/router.py +76 -0
- emdash_core/api/rules.py +116 -0
- emdash_core/api/search.py +119 -0
- emdash_core/api/spec.py +99 -0
- emdash_core/api/swarm.py +223 -0
- emdash_core/api/tasks.py +109 -0
- emdash_core/api/team.py +120 -0
- emdash_core/auth/__init__.py +17 -0
- emdash_core/auth/github.py +389 -0
- emdash_core/config.py +74 -0
- emdash_core/context/__init__.py +52 -0
- emdash_core/context/models.py +50 -0
- emdash_core/context/providers/__init__.py +11 -0
- emdash_core/context/providers/base.py +74 -0
- emdash_core/context/providers/explored_areas.py +183 -0
- emdash_core/context/providers/touched_areas.py +360 -0
- emdash_core/context/registry.py +73 -0
- emdash_core/context/reranker.py +199 -0
- emdash_core/context/service.py +260 -0
- emdash_core/context/session.py +352 -0
- emdash_core/core/__init__.py +104 -0
- emdash_core/core/config.py +454 -0
- emdash_core/core/exceptions.py +55 -0
- emdash_core/core/models.py +265 -0
- emdash_core/core/review_config.py +57 -0
- emdash_core/db/__init__.py +67 -0
- emdash_core/db/auth.py +134 -0
- emdash_core/db/models.py +91 -0
- emdash_core/db/provider.py +222 -0
- emdash_core/db/providers/__init__.py +5 -0
- emdash_core/db/providers/supabase.py +452 -0
- emdash_core/embeddings/__init__.py +24 -0
- emdash_core/embeddings/indexer.py +534 -0
- emdash_core/embeddings/models.py +192 -0
- emdash_core/embeddings/providers/__init__.py +7 -0
- emdash_core/embeddings/providers/base.py +112 -0
- emdash_core/embeddings/providers/fireworks.py +141 -0
- emdash_core/embeddings/providers/openai.py +104 -0
- emdash_core/embeddings/registry.py +146 -0
- emdash_core/embeddings/service.py +215 -0
- emdash_core/graph/__init__.py +26 -0
- emdash_core/graph/builder.py +134 -0
- emdash_core/graph/connection.py +692 -0
- emdash_core/graph/schema.py +416 -0
- emdash_core/graph/writer.py +667 -0
- emdash_core/ingestion/__init__.py +7 -0
- emdash_core/ingestion/change_detector.py +150 -0
- emdash_core/ingestion/git/__init__.py +5 -0
- emdash_core/ingestion/git/commit_analyzer.py +196 -0
- emdash_core/ingestion/github/__init__.py +6 -0
- emdash_core/ingestion/github/pr_fetcher.py +296 -0
- emdash_core/ingestion/github/task_extractor.py +100 -0
- emdash_core/ingestion/orchestrator.py +540 -0
- emdash_core/ingestion/parsers/__init__.py +10 -0
- emdash_core/ingestion/parsers/base_parser.py +66 -0
- emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
- emdash_core/ingestion/parsers/class_extractor.py +154 -0
- emdash_core/ingestion/parsers/function_extractor.py +202 -0
- emdash_core/ingestion/parsers/import_analyzer.py +119 -0
- emdash_core/ingestion/parsers/python_parser.py +123 -0
- emdash_core/ingestion/parsers/registry.py +72 -0
- emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
- emdash_core/ingestion/parsers/typescript_parser.py +278 -0
- emdash_core/ingestion/repository.py +346 -0
- emdash_core/models/__init__.py +38 -0
- emdash_core/models/agent.py +68 -0
- emdash_core/models/index.py +77 -0
- emdash_core/models/query.py +113 -0
- emdash_core/planning/__init__.py +7 -0
- emdash_core/planning/agent_api.py +413 -0
- emdash_core/planning/context_builder.py +265 -0
- emdash_core/planning/feature_context.py +232 -0
- emdash_core/planning/feature_expander.py +646 -0
- emdash_core/planning/llm_explainer.py +198 -0
- emdash_core/planning/similarity.py +509 -0
- emdash_core/planning/team_focus.py +821 -0
- emdash_core/server.py +153 -0
- emdash_core/sse/__init__.py +5 -0
- emdash_core/sse/stream.py +196 -0
- emdash_core/swarm/__init__.py +17 -0
- emdash_core/swarm/merge_agent.py +383 -0
- emdash_core/swarm/session_manager.py +274 -0
- emdash_core/swarm/swarm_runner.py +226 -0
- emdash_core/swarm/task_definition.py +137 -0
- emdash_core/swarm/worker_spawner.py +319 -0
- emdash_core/swarm/worktree_manager.py +278 -0
- emdash_core/templates/__init__.py +10 -0
- emdash_core/templates/defaults/agent-builder.md.template +82 -0
- emdash_core/templates/defaults/focus.md.template +115 -0
- emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
- emdash_core/templates/defaults/pr-review.md.template +80 -0
- emdash_core/templates/defaults/project.md.template +85 -0
- emdash_core/templates/defaults/research_critic.md.template +112 -0
- emdash_core/templates/defaults/research_planner.md.template +85 -0
- emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
- emdash_core/templates/defaults/reviewer.md.template +81 -0
- emdash_core/templates/defaults/spec.md.template +41 -0
- emdash_core/templates/defaults/tasks.md.template +78 -0
- emdash_core/templates/loader.py +296 -0
- emdash_core/utils/__init__.py +45 -0
- emdash_core/utils/git.py +84 -0
- emdash_core/utils/image.py +502 -0
- emdash_core/utils/logger.py +51 -0
- emdash_core-0.1.7.dist-info/METADATA +35 -0
- emdash_core-0.1.7.dist-info/RECORD +187 -0
- emdash_core-0.1.7.dist-info/WHEEL +4 -0
- emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,428 @@
+"""Critic agent for evaluating research quality.
+
+The Critic evaluates research completeness AND team value adherence.
+It enforces hard rules and provides feedback for improvement.
+"""
+
+import json
+from typing import Optional
+
+from rich.console import Console
+
+from ..providers import get_provider
+from ..providers.factory import DEFAULT_MODEL
+from .state import (
+    ResearchPlan,
+    EvidenceItem,
+    Claim,
+    Gap,
+    Critique,
+    CritiqueScores,
+    FollowUpQuestion,
+    Contradiction,
+    ValuesViolation,
+)
+
+
+CRITIC_SYSTEM_PROMPT = """You are a research critic that evaluates research quality and team value adherence.
+
+Your job is to:
+1. Score research across 5 dimensions (1-5 each)
+2. Identify issues that must be fixed
+3. Suggest follow-up questions
+4. Detect contradictions
+5. Check team value compliance
+
+SCORING DIMENSIONS (1-5):
+- coverage: Were all P0/P1 questions addressed?
+- evidence: Are claims backed by tool results?
+- depth: Is analysis thorough or superficial?
+- coherence: Do findings connect logically?
+- team_alignment: Does output follow team values?
+
+DECISION RULES:
+- APPROVE: All P0 met, evidence >= 3, team_alignment >= 4
+- CONTINUE: Progress made, but gaps remain
+- REJECT: Values violated (ungrounded claims, missing evidence)
+- ESCALATE: Need more powerful model/budget
+
+TEAM VALUES TO CHECK:
+- V1: All claims have evidence IDs
+- V2: Evidence is reproducible (tool calls documented)
+- V3: Output includes reviewer checklist
+- V4: Budget respected
+- V5: Report ends with actionable tasks
+- V6: Uses team vocabulary
+
+OUTPUT FORMAT (JSON only):
+{
+  "decision": "APPROVE|CONTINUE|REJECT|ESCALATE",
+  "scores": {
+    "coverage": 1-5,
+    "evidence": 1-5,
+    "depth": 1-5,
+    "coherence": 1-5,
+    "team_alignment": 1-5
+  },
+  "must_fix": ["blocking issue 1", ...],
+  "follow_up_questions": [
+    {"question": "...", "why": "...", "suggested_tools": [...]}
+  ],
+  "risky_claims": ["C1", "C3"],
+  "contradictions": [
+    {"claim_a": "C1", "claim_b": "C2", "note": "..."}
+  ],
+  "values_violations": [
+    {"value": "V1", "issue": "...", "affected_claims": ["C1"]}
+  ]
+}
+
+Be strict. Evidence quality matters more than quantity."""
+
+
+class CriticAgent:
+    """Evaluates research quality and team value adherence.
+
+    The Critic provides objective feedback on research progress
+    and enforces hard rules about evidence and values.
+    """
+
+    def __init__(
+        self,
+        model: str = DEFAULT_MODEL,
+        verbose: bool = True,
+    ):
+        """Initialize the critic agent.
+
+        Args:
+            model: LLM model to use
+            verbose: Whether to print progress
+        """
+        self.provider = get_provider(model)
+        self.model = model
+        self.verbose = verbose
+        self.console = Console()
+
+    def evaluate(
+        self,
+        plan: ResearchPlan,
+        evidence: list[EvidenceItem],
+        claims: list[Claim],
+        gaps: list[Gap],
+        iteration: int,
+        budget_used_percent: float,
+    ) -> Critique:
+        """Evaluate research progress.
+
+        Args:
+            plan: Research plan
+            evidence: Evidence collected
+            claims: Claims proposed
+            gaps: Gaps identified
+            iteration: Current iteration number
+            budget_used_percent: Percentage of budget used
+
+        Returns:
+            Critique with scores and feedback
+        """
+        if self.verbose:
+            self.console.print(f"[cyan]Evaluating iteration {iteration + 1}...[/cyan]")
+
+        # Try LLM-based evaluation
+        try:
+            critique = self._llm_evaluate(
+                plan, evidence, claims, gaps, iteration, budget_used_percent
+            )
+            if critique:
+                if self.verbose:
+                    self.console.print(
+                        f"[{'green' if critique.decision == 'APPROVE' else 'yellow'}]"
+                        f"Decision: {critique.decision} "
+                        f"(avg score: {critique.scores.average():.1f})"
+                        f"[/{'green' if critique.decision == 'APPROVE' else 'yellow'}]"
+                    )
+                return critique
+        except Exception as e:
+            if self.verbose:
+                self.console.print(f"[yellow]LLM evaluation failed: {e}[/yellow]")
+
+        # Fallback to heuristic evaluation
+        return self._heuristic_evaluate(
+            plan, evidence, claims, gaps, iteration, budget_used_percent
+        )
+
+    def _llm_evaluate(
+        self,
+        plan: ResearchPlan,
+        evidence: list[EvidenceItem],
+        claims: list[Claim],
+        gaps: list[Gap],
+        iteration: int,
+        budget_used_percent: float,
+    ) -> Optional[Critique]:
+        """Evaluate using LLM.
+
+        Args:
+            plan: Research plan
+            evidence: Evidence collected
+            claims: Claims proposed
+            gaps: Gaps identified
+            iteration: Current iteration
+            budget_used_percent: Budget usage
+
+        Returns:
+            Critique or None on failure
+        """
+        # Format input for LLM
+        questions_text = "\n".join([
+            f"- [{q.priority}] {q.qid}: {q.question}"
+            for q in plan.questions
+        ])
+
+        evidence_text = "\n".join([
+            f"- {e.id}: {e.tool} -> {e.summary}"
+            for e in evidence
+        ]) or "No evidence"
+
+        claims_text = "\n".join([
+            f"- {c.id} (conf={c.confidence}, evidence={c.evidence_ids}): {c.statement}"
+            for c in claims
+        ]) or "No claims"
+
+        gaps_text = "\n".join([
+            f"- {g.question}: {g.reason}"
+            for g in gaps
+        ]) or "No gaps"
+
+        user_message = f"""Evaluate this research progress.
+
+GOAL: {plan.goal}
+
+QUESTIONS:
+{questions_text}
+
+EVIDENCE COLLECTED:
+{evidence_text}
+
+CLAIMS MADE:
+{claims_text}
+
+GAPS IDENTIFIED:
+{gaps_text}
+
+STATS:
+- Iteration: {iteration + 1}/{plan.max_iterations}
+- Budget used: {budget_used_percent:.0f}%
+- Evidence count: {len(evidence)}
+- Claims count: {len(claims)}
+- Gaps count: {len(gaps)}
+
+TEAM VALUES CHECKLIST:
+{chr(10).join(plan.team_values_checklist)}
+
+Evaluate and return JSON only."""
+
+        messages = [{"role": "user", "content": user_message}]
+        response = self.provider.chat(messages, system=CRITIC_SYSTEM_PROMPT)
+        content = response.content or ""
+
+        # Parse JSON
+        try:
+            json_str = content
+            if "```" in content:
+                start = content.find("```")
+                end = content.find("```", start + 3)
+                if end > start:
+                    json_str = content[start + 3:end]
+                    if json_str.startswith("json"):
+                        json_str = json_str[4:]
+
+            data = json.loads(json_str.strip())
+
+            scores = CritiqueScores(
+                coverage=max(1, min(5, data["scores"].get("coverage", 3))),
+                evidence=max(1, min(5, data["scores"].get("evidence", 3))),
+                depth=max(1, min(5, data["scores"].get("depth", 3))),
+                coherence=max(1, min(5, data["scores"].get("coherence", 3))),
+                team_alignment=max(1, min(5, data["scores"].get("team_alignment", 3))),
+            )
+
+            # Parse follow-up questions
+            follow_ups = []
+            for q in data.get("follow_up_questions", []):
+                follow_ups.append(FollowUpQuestion(
+                    question=q.get("question", ""),
+                    why=q.get("why", ""),
+                    suggested_tools=q.get("suggested_tools", []),
+                ))
+
+            # Parse contradictions
+            contradictions = []
+            for c in data.get("contradictions", []):
+                contradictions.append(Contradiction(
+                    claim_a=c.get("claim_a", ""),
+                    claim_b=c.get("claim_b", ""),
+                    note=c.get("note", ""),
+                ))
+
+            # Parse values violations
+            violations = []
+            for v in data.get("values_violations", []):
+                violations.append(ValuesViolation(
+                    value=v.get("value", ""),
+                    issue=v.get("issue", ""),
+                    affected_claims=v.get("affected_claims", []),
+                ))
+
+            # Validate decision against scores
+            decision = data.get("decision", "CONTINUE")
+            if decision == "APPROVE":
+                if scores.evidence < 3:
+                    decision = "CONTINUE"
+                if scores.team_alignment < 4:
+                    decision = "CONTINUE"
+
+            return Critique(
+                decision=decision,
+                scores=scores,
+                must_fix=data.get("must_fix", []),
+                follow_up_questions=follow_ups,
+                risky_claims=data.get("risky_claims", []),
+                contradictions=contradictions,
+                values_violations=violations,
+            )
+
+        except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
+            if self.verbose:
+                self.console.print(f"[yellow]Failed to parse critique: {e}[/yellow]")
+            return None
+
+    def _heuristic_evaluate(
+        self,
+        plan: ResearchPlan,
+        evidence: list[EvidenceItem],
+        claims: list[Claim],
+        gaps: list[Gap],
+        iteration: int,
+        budget_used_percent: float,
+    ) -> Critique:
+        """Evaluate using heuristics.
+
+        Fallback method when LLM evaluation fails.
+
+        Args:
+            plan: Research plan
+            evidence: Evidence collected
+            claims: Claims proposed
+            gaps: Gaps identified
+            iteration: Current iteration
+            budget_used_percent: Budget usage
+
+        Returns:
+            Critique
+        """
+        # Calculate coverage score
+        p0_questions = plan.get_p0_questions()
+        p0_answered = 0
+        for q in p0_questions:
+            # Check if any claim relates to this question
+            q_keywords = set(q.question.lower().split())
+            for claim in claims:
+                c_keywords = set(claim.statement.lower().split())
+                if len(q_keywords & c_keywords) >= 2:
+                    p0_answered += 1
+                    break
+
+        coverage = min(5, max(1, int(5 * p0_answered / max(len(p0_questions), 1))))
+
+        # Calculate evidence score
+        if not evidence:
+            evidence_score = 1
+        elif len(evidence) < 3:
+            evidence_score = 2
+        elif len(evidence) < 10:
+            evidence_score = 3
+        else:
+            evidence_score = 4
+
+        # Boost if claims have good evidence
+        high_conf_claims = [c for c in claims if c.confidence >= 2]
+        if len(high_conf_claims) > len(claims) / 2:
+            evidence_score = min(5, evidence_score + 1)
+
+        # Calculate depth score
+        if not claims:
+            depth = 1
+        elif len(claims) < 3:
+            depth = 2
+        elif len(claims) < 8:
+            depth = 3
+        else:
+            depth = 4
+
+        # Coherence score
+        coherence = 3  # Default to acceptable
+        if len(gaps) > len(claims):
+            coherence = 2  # Too many gaps
+
+        # Team alignment score
+        team_alignment = 4  # Default to good
+
+        # Check for ungrounded claims (V1 violation)
+        ungrounded = [c for c in claims if len(c.evidence_ids) == 0]
+        if ungrounded:
+            team_alignment = 2
+
+        scores = CritiqueScores(
+            coverage=coverage,
+            evidence=evidence_score,
+            depth=depth,
+            coherence=coherence,
+            team_alignment=team_alignment,
+        )
+
+        # Determine decision
+        must_fix = []
+        follow_ups = []
+
+        if coverage >= 4 and evidence_score >= 3 and team_alignment >= 4:
+            decision = "APPROVE"
+        elif iteration >= plan.max_iterations - 1:
+            decision = "APPROVE"  # Out of iterations
+            must_fix.append("Max iterations reached")
+        elif budget_used_percent > 90:
+            decision = "APPROVE"  # Out of budget
+            must_fix.append("Budget nearly exhausted")
+        elif not evidence and iteration > 0:
+            decision = "ESCALATE"
+            must_fix.append("No evidence collected despite attempts")
+        else:
+            decision = "CONTINUE"
+
+        # Generate follow-up questions
+        for q in p0_questions:
+            answered = False
+            for claim in claims:
+                c_keywords = set(claim.statement.lower().split())
+                q_keywords = set(q.question.lower().split())
+                if len(q_keywords & c_keywords) >= 2:
+                    answered = True
+                    break
+
+            if not answered:
+                follow_ups.append(FollowUpQuestion(
+                    question=q.question,
+                    why="P0 question not yet answered",
+                    suggested_tools=q.suggested_tools,
+                    qid=q.qid,
+                ))
+
+        return Critique(
+            decision=decision,
+            scores=scores,
+            must_fix=must_fix,
+            follow_up_questions=follow_ups[:3],
+            risky_claims=[],
+            contradictions=[],
+            values_violations=[],
+        )
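
For orientation, a minimal usage sketch of the critic above. This is not part of the package diff: the construction of plan, evidence, and claims is left as placeholders because the ResearchPlan, EvidenceItem, and Claim constructors live in state.py (not shown here); only the attributes that critic.py itself reads are assumed.

# Sketch only. plan, evidence_items, and claims are placeholders built from
# the state.py models; attribute names below are inferred from critic.py.
from emdash_core.agent.research.critic import CriticAgent

critic = CriticAgent(verbose=True)  # model defaults to providers.factory.DEFAULT_MODEL
critique = critic.evaluate(
    plan=plan,                # ResearchPlan: .goal, .questions, .max_iterations, .get_p0_questions()
    evidence=evidence_items,  # EvidenceItem: .id, .tool, .summary
    claims=claims,            # Claim: .id, .confidence, .evidence_ids, .statement
    gaps=[],                  # no open Gap records yet
    iteration=0,
    budget_used_percent=35.0,
)

print(critique.decision)          # "APPROVE" | "CONTINUE" | "REJECT" | "ESCALATE"
print(critique.scores.average())  # mean of the five 1-5 dimension scores
for fu in critique.follow_up_questions:
    print(fu.question, fu.suggested_tools)

Note that evaluate() always returns a Critique: if the LLM call raises or its JSON cannot be parsed, it logs a warning (when verbose) and falls back to _heuristic_evaluate.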