aidp 0.33.0 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +35 -0
- data/lib/aidp/analyze/tree_sitter_scan.rb +3 -0
- data/lib/aidp/cli/eval_command.rb +399 -0
- data/lib/aidp/cli/harness_command.rb +1 -1
- data/lib/aidp/cli/security_command.rb +416 -0
- data/lib/aidp/cli/tools_command.rb +6 -4
- data/lib/aidp/cli.rb +170 -3
- data/lib/aidp/concurrency/exec.rb +3 -0
- data/lib/aidp/config.rb +113 -0
- data/lib/aidp/config_paths.rb +20 -0
- data/lib/aidp/daemon/runner.rb +8 -4
- data/lib/aidp/errors.rb +134 -0
- data/lib/aidp/evaluations/context_capture.rb +205 -0
- data/lib/aidp/evaluations/evaluation_record.rb +114 -0
- data/lib/aidp/evaluations/evaluation_storage.rb +250 -0
- data/lib/aidp/evaluations.rb +23 -0
- data/lib/aidp/execute/async_work_loop_runner.rb +4 -1
- data/lib/aidp/execute/interactive_repl.rb +6 -2
- data/lib/aidp/execute/prompt_evaluator.rb +359 -0
- data/lib/aidp/execute/repl_macros.rb +100 -1
- data/lib/aidp/execute/work_loop_runner.rb +399 -47
- data/lib/aidp/execute/work_loop_state.rb +4 -1
- data/lib/aidp/execute/workflow_selector.rb +3 -0
- data/lib/aidp/harness/ai_decision_engine.rb +79 -0
- data/lib/aidp/harness/capability_registry.rb +2 -0
- data/lib/aidp/harness/condition_detector.rb +3 -0
- data/lib/aidp/harness/config_loader.rb +3 -0
- data/lib/aidp/harness/enhanced_runner.rb +14 -11
- data/lib/aidp/harness/error_handler.rb +3 -0
- data/lib/aidp/harness/provider_factory.rb +3 -0
- data/lib/aidp/harness/provider_manager.rb +6 -0
- data/lib/aidp/harness/runner.rb +5 -1
- data/lib/aidp/harness/state/persistence.rb +3 -0
- data/lib/aidp/harness/state_manager.rb +3 -0
- data/lib/aidp/harness/status_display.rb +28 -20
- data/lib/aidp/harness/thinking_depth_manager.rb +32 -32
- data/lib/aidp/harness/ui/enhanced_tui.rb +4 -0
- data/lib/aidp/harness/ui/enhanced_workflow_selector.rb +4 -0
- data/lib/aidp/harness/ui/error_handler.rb +3 -0
- data/lib/aidp/harness/ui/job_monitor.rb +4 -0
- data/lib/aidp/harness/ui/navigation/submenu.rb +2 -0
- data/lib/aidp/harness/ui/navigation/workflow_selector.rb +6 -0
- data/lib/aidp/harness/ui/spinner_helper.rb +3 -0
- data/lib/aidp/harness/ui/workflow_controller.rb +3 -0
- data/lib/aidp/harness/user_interface.rb +3 -0
- data/lib/aidp/loader.rb +2 -2
- data/lib/aidp/logger.rb +3 -0
- data/lib/aidp/message_display.rb +31 -0
- data/lib/aidp/pr_worktree_manager.rb +18 -6
- data/lib/aidp/provider_manager.rb +3 -0
- data/lib/aidp/providers/base.rb +2 -0
- data/lib/aidp/security/rule_of_two_enforcer.rb +210 -0
- data/lib/aidp/security/secrets_proxy.rb +328 -0
- data/lib/aidp/security/secrets_registry.rb +227 -0
- data/lib/aidp/security/trifecta_state.rb +220 -0
- data/lib/aidp/security/watch_mode_handler.rb +306 -0
- data/lib/aidp/security/work_loop_adapter.rb +277 -0
- data/lib/aidp/security.rb +56 -0
- data/lib/aidp/setup/wizard.rb +4 -2
- data/lib/aidp/version.rb +1 -1
- data/lib/aidp/watch/auto_merger.rb +274 -0
- data/lib/aidp/watch/auto_pr_processor.rb +125 -7
- data/lib/aidp/watch/build_processor.rb +16 -1
- data/lib/aidp/watch/change_request_processor.rb +680 -286
- data/lib/aidp/watch/ci_fix_processor.rb +262 -4
- data/lib/aidp/watch/feedback_collector.rb +191 -0
- data/lib/aidp/watch/hierarchical_pr_strategy.rb +256 -0
- data/lib/aidp/watch/implementation_verifier.rb +142 -1
- data/lib/aidp/watch/plan_generator.rb +70 -13
- data/lib/aidp/watch/plan_processor.rb +12 -5
- data/lib/aidp/watch/projects_processor.rb +286 -0
- data/lib/aidp/watch/repository_client.rb +861 -53
- data/lib/aidp/watch/review_processor.rb +33 -6
- data/lib/aidp/watch/runner.rb +51 -11
- data/lib/aidp/watch/state_store.rb +233 -0
- data/lib/aidp/watch/sub_issue_creator.rb +221 -0
- data/lib/aidp/workflows/guided_agent.rb +4 -0
- data/lib/aidp/workstream_executor.rb +3 -0
- data/lib/aidp/worktree.rb +61 -11
- data/lib/aidp/worktree_branch_manager.rb +347 -101
- data/templates/implementation/iterative_implementation.md +46 -3
- metadata +20 -1
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
# frozen_string_literal: true

require_relative "../harness/ai_decision_engine"

module Aidp
  module Execute
    # Evaluates prompt effectiveness using ZFC after multiple iterations.
    #
    # FIX for issue #391: When the work loop reaches 10+ iterations without completion,
    # this evaluator assesses prompt quality and suggests improvements.
    #
    # Uses Zero Framework Cognition (ZFC) to analyze:
    # - Whether the prompt clearly defines completion criteria
    # - If task breakdown instructions are adequate
    # - Whether the agent has sufficient context
    # - If there are blockers preventing progress
    #
    # @example
    #   evaluator = PromptEvaluator.new(config)
    #   result = evaluator.evaluate(
    #     prompt_content: prompt_manager.read,
    #     iteration_count: 12,
    #     task_summary: persistent_tasklist.summary,
    #     recent_failures: all_results
    #   )
    #   # => { effective: false, issues: [...], suggestions: [...] }
    #
    class PromptEvaluator
      # Iteration count at which evaluation is first triggered
      EVALUATION_ITERATION_THRESHOLD = 10

      # Re-evaluate periodically (every N iterations) after the threshold
      EVALUATION_INTERVAL = 5

      # Expose for testability
      attr_reader :ai_decision_engine

      # @param config [Object] harness configuration; must respond to
      #   +default_provider+ for a default AIDecisionEngine to be built
      # @param ai_decision_engine [Object, nil] injectable engine (used by tests)
      def initialize(config, ai_decision_engine: nil)
        @config = config
        @ai_decision_engine = ai_decision_engine || safely_build_ai_decision_engine
      end

      # Safely build AIDecisionEngine, returning nil if config doesn't support it.
      # This allows tests with mock configs to work without AI calls.
      #
      # @return [Aidp::Harness::AIDecisionEngine, nil]
      def safely_build_ai_decision_engine
        # Check if config supports the methods AIDecisionEngine needs
        return nil unless @config.respond_to?(:default_provider)

        build_default_ai_decision_engine
      rescue => e
        Aidp.log_debug("prompt_evaluator", "skipping_ai_decision_engine",
          reason: e.message)
        nil
      end

      # Check if evaluation should be triggered based on iteration count.
      #
      # @param iteration_count [Integer, nil] Current iteration number
      # @return [Boolean] true at the threshold and every EVALUATION_INTERVAL after
      def should_evaluate?(iteration_count)
        # Robustness: a missing count means "not yet" rather than NoMethodError.
        return false if iteration_count.nil?
        return false unless iteration_count >= EVALUATION_ITERATION_THRESHOLD

        # Evaluate at threshold and every EVALUATION_INTERVAL after
        (iteration_count - EVALUATION_ITERATION_THRESHOLD) % EVALUATION_INTERVAL == 0
      end

      # Evaluate prompt effectiveness.
      #
      # @param prompt_content [String] Current PROMPT.md content
      # @param iteration_count [Integer] Current iteration number
      # @param task_summary [Hash] Summary of task statuses
      # @param recent_failures [Hash] Recent test/lint failures
      # @param step_name [String, nil] Name of current step (logging only)
      # @return [Hash] Evaluation result with :effective, :issues, :suggestions,
      #   :likely_blockers, :recommended_actions, :confidence (and :skipped when
      #   no AI engine is available)
      def evaluate(prompt_content:, iteration_count:, task_summary:, recent_failures:, step_name: nil)
        Aidp.log_debug("prompt_evaluator", "starting_evaluation",
          iteration: iteration_count,
          step: step_name,
          prompt_size: prompt_content&.length || 0)

        # When AI decision engine is unavailable (e.g., in tests with mock configs),
        # return a neutral result that doesn't trigger feedback appending
        unless @ai_decision_engine
          Aidp.log_debug("prompt_evaluator", "skipping_evaluation_no_ai_engine")
          return {
            effective: true, # Assume effective to avoid unnecessary feedback
            issues: [],
            suggestions: [],
            likely_blockers: [],
            recommended_actions: [],
            confidence: 0.0,
            skipped: true,
            skip_reason: "AI decision engine not available"
          }
        end

        prompt = build_evaluation_prompt(
          prompt_content: prompt_content,
          iteration_count: iteration_count,
          task_summary: task_summary,
          recent_failures: recent_failures
        )

        schema = {
          type: "object",
          properties: {
            effective: {
              type: "boolean",
              description: "True if the prompt is likely to lead to completion within a few more iterations"
            },
            issues: {
              type: "array",
              items: {type: "string"},
              description: "Specific problems identified with the current prompt"
            },
            suggestions: {
              type: "array",
              items: {type: "string"},
              description: "Actionable suggestions to improve prompt effectiveness"
            },
            likely_blockers: {
              type: "array",
              items: {type: "string"},
              description: "Potential blockers preventing progress"
            },
            recommended_actions: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  action: {type: "string"},
                  priority: {type: "string", enum: ["high", "medium", "low"]},
                  rationale: {type: "string"}
                }
              },
              description: "Specific actions to take, prioritized"
            },
            confidence: {
              type: "number",
              minimum: 0.0,
              maximum: 1.0,
              description: "Confidence in this assessment"
            }
          },
          required: ["effective", "issues", "suggestions", "confidence"]
        }

        begin
          result = @ai_decision_engine.decide(
            :prompt_evaluation,
            context: {prompt: prompt},
            schema: schema,
            tier: :mini,
            cache_ttl: nil # Each evaluation is context-specific
          )

          Aidp.log_info("prompt_evaluator", "evaluation_complete",
            iteration: iteration_count,
            effective: result[:effective],
            issue_count: result[:issues]&.size || 0,
            confidence: result[:confidence])

          result
        rescue => e
          Aidp.log_error("prompt_evaluator", "evaluation_failed",
            error: e.message,
            error_class: e.class.name)

          build_fallback_result("Evaluation failed: #{e.message}")
        end
      end

      # Generate improvement recommendations for the prompt template.
      # Used for AGD pattern - generating improved templates based on evaluation.
      #
      # @param evaluation_result [Hash] Result from evaluate()
      # @param original_template [String] The original template content
      # @return [Hash, nil] Template improvements, or nil when no engine exists
      #   or the AI call fails
      def generate_template_improvements(evaluation_result:, original_template:)
        return nil unless @ai_decision_engine

        Aidp.log_debug("prompt_evaluator", "generating_template_improvements",
          issue_count: evaluation_result[:issues]&.size || 0)

        prompt = build_improvement_prompt(evaluation_result, original_template)

        schema = {
          type: "object",
          properties: {
            improved_sections: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  section_name: {type: "string"},
                  original: {type: "string"},
                  improved: {type: "string"},
                  rationale: {type: "string"}
                }
              }
            },
            additional_sections: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  section_name: {type: "string"},
                  content: {type: "string"},
                  rationale: {type: "string"}
                }
              }
            },
            completion_criteria_improvements: {
              type: "array",
              items: {type: "string"},
              description: "Specific improvements to completion criteria definitions"
            }
          },
          required: ["improved_sections", "completion_criteria_improvements"]
        }

        @ai_decision_engine.decide(
          :template_improvement,
          context: {prompt: prompt},
          schema: schema,
          tier: :standard, # Use standard tier for more thoughtful improvements
          cache_ttl: nil
        )
      rescue => e
        Aidp.log_error("prompt_evaluator", "template_improvement_failed",
          error: e.message)
        nil
      end

      private

      # Build the ZFC evaluation prompt from the current loop state.
      def build_evaluation_prompt(prompt_content:, iteration_count:, task_summary:, recent_failures:)
        <<~PROMPT
          You are evaluating the effectiveness of a work loop prompt that has been running for #{iteration_count} iterations without completion.

          ## Current Prompt Content
          #{truncate_content(prompt_content, 8000)}

          ## Task Summary
          #{format_task_summary(task_summary)}

          ## Recent Check Results
          #{format_failures(recent_failures)}

          ## Evaluation Criteria

          Analyze why this prompt may not be leading to completion:

          1. **Clarity of Goals**: Are the implementation requirements clearly defined?
          2. **Task Breakdown**: Does the prompt guide proper task decomposition?
          3. **Completion Criteria**: Are the completion criteria specific and achievable?
          4. **Context Sufficiency**: Does the agent have enough context to proceed?
          5. **Blockers**: Are there technical blockers or missing information?
          6. **Scope**: Is the scope realistic for an AI agent to complete?

          ## Your Assessment

          Provide:
          - Whether this prompt is likely effective (true/false)
          - Specific issues with the current prompt
          - Actionable suggestions for improvement
          - Likely blockers preventing progress
          - Prioritized recommended actions
          - Your confidence in this assessment (0.0-1.0)

          Be specific and actionable. Focus on what can be changed to achieve completion.
        PROMPT
      end

      # Build the prompt asking the AI to propose template improvements.
      def build_improvement_prompt(evaluation_result, original_template)
        <<~PROMPT
          Based on the following prompt evaluation, suggest improvements to the template.

          ## Evaluation Results
          - Effective: #{evaluation_result[:effective]}
          - Issues: #{(evaluation_result[:issues] || []).join(", ")}
          - Suggestions: #{(evaluation_result[:suggestions] || []).join(", ")}

          ## Original Template
          #{truncate_content(original_template, 4000)}

          ## Your Task

          Suggest specific improvements to make the template more effective:
          1. Identify sections that need improvement
          2. Propose new sections if needed
          3. Focus especially on completion criteria clarity
          4. Ensure task breakdown instructions are explicit
          5. Add guidance for common failure modes

          Be specific - provide actual text that could replace or supplement the template.
        PROMPT
      end

      # Render the task summary for inclusion in the evaluation prompt.
      # Accepts a Hash of counts, or any other object (rendered via to_s).
      def format_task_summary(task_summary)
        # Guard the empty? call: non-Hash inputs (e.g. an Integer) may not
        # respond to it, and this method explicitly supports them via to_s.
        return "_No task summary available_" if task_summary.nil? ||
          (task_summary.respond_to?(:empty?) && task_summary.empty?)

        if task_summary.is_a?(Hash)
          parts = []
          parts << "Total: #{task_summary[:total] || 0}"
          parts << "Done: #{task_summary[:done] || 0}"
          parts << "In Progress: #{task_summary[:in_progress] || 0}"
          parts << "Pending: #{task_summary[:pending] || 0}"
          parts << "Abandoned: #{task_summary[:abandoned] || 0}"
          parts.join(" | ")
        else
          task_summary.to_s
        end
      end

      # Render recent check results (test/lint) as a markdown bullet list,
      # including up to 3 failure details per failed check.
      def format_failures(recent_failures)
        # Same empty? guard as format_task_summary for unexpected input types.
        return "_No recent failures_" if recent_failures.nil? ||
          (recent_failures.respond_to?(:empty?) && recent_failures.empty?)

        parts = []
        recent_failures.each do |check_type, result|
          next unless result.is_a?(Hash)

          status = result[:success] ? "✅ passed" : "❌ failed"
          parts << "- #{check_type}: #{status}"

          if !result[:success] && result[:failures]
            failures = result[:failures].take(3)
            failures.each { |f| parts << "  - #{truncate_content(f.to_s, 200)}" }
          end
        end

        parts.empty? ? "_No failures to report_" : parts.join("\n")
      end

      # Cap content at max_length characters, appending a truncation marker.
      def truncate_content(content, max_length)
        return "_No content_" if content.nil? || content.empty?
        return content if content.length <= max_length

        "#{content[0, max_length]}\n\n[... truncated, showing first #{max_length} characters ...]"
      end

      # Neutral result returned when the AI call itself failed; :effective is
      # nil (unknown) rather than true/false so callers can distinguish it.
      def build_fallback_result(reason)
        {
          effective: nil,
          issues: ["Unable to evaluate: #{reason}"],
          suggestions: ["Check AI configuration and try again"],
          likely_blockers: [],
          recommended_actions: [],
          confidence: 0.0
        }
      end

      # Construct the real engine; any failure is logged and degrades to nil.
      def build_default_ai_decision_engine
        Aidp::Harness::AIDecisionEngine.new(@config)
      rescue => e
        Aidp.log_warn("prompt_evaluator", "failed_to_create_ai_decision_engine",
          error: e.message)
        nil
      end
    end
  end
end
|
@@ -9,7 +9,9 @@ module Aidp
|
|
|
9
9
|
# - /split - Divide work into smaller contracts
|
|
10
10
|
# - /halt-on <pattern> - Pause on specific test failures
|
|
11
11
|
class ReplMacros
|
|
12
|
-
attr_reader :pinned_files, :focus_patterns, :halt_patterns, :split_mode, :current_workstream
|
|
12
|
+
attr_reader :pinned_files, :focus_patterns, :halt_patterns, :split_mode, :current_workstream
|
|
13
|
+
# Expose current_skill for testability
|
|
14
|
+
attr_accessor :current_skill
|
|
13
15
|
|
|
14
16
|
def initialize(project_dir: Dir.pwd)
|
|
15
17
|
@pinned_files = Set.new
|
|
@@ -299,6 +301,12 @@ module Aidp
|
|
|
299
301
|
usage: "/tasks <list|show|done|abandon|stats> [args]",
|
|
300
302
|
example: "/tasks list pending",
|
|
301
303
|
handler: method(:cmd_tasks)
|
|
304
|
+
},
|
|
305
|
+
"/rate" => {
|
|
306
|
+
description: "Rate the current output (good/neutral/bad)",
|
|
307
|
+
usage: "/rate <good|neutral|bad> [comment]",
|
|
308
|
+
example: "/rate good 'Clean code generated'",
|
|
309
|
+
handler: method(:cmd_rate)
|
|
302
310
|
}
|
|
303
311
|
}
|
|
304
312
|
end
|
|
@@ -2071,6 +2079,97 @@ module Aidp
|
|
|
2071
2079
|
{success: false, message: "Error: #{e.message}", action: :none}
|
|
2072
2080
|
end
|
|
2073
2081
|
|
|
2082
|
+
# Command: /rate <good|neutral|bad> [comment]
|
|
2083
|
+
# Rate the current output with optional comment
|
|
2084
|
+
def cmd_rate(args)
|
|
2085
|
+
rating = args.shift
|
|
2086
|
+
comment = args.join(" ")
|
|
2087
|
+
comment = nil if comment.empty?
|
|
2088
|
+
|
|
2089
|
+
unless rating
|
|
2090
|
+
return {
|
|
2091
|
+
success: false,
|
|
2092
|
+
message: "Usage: /rate <good|neutral|bad> [comment]\n\nExamples:\n /rate good\n /rate bad 'Generated code had bugs'\n /rate neutral 'Acceptable but could be better'",
|
|
2093
|
+
action: :none
|
|
2094
|
+
}
|
|
2095
|
+
end
|
|
2096
|
+
|
|
2097
|
+
# Validate rating
|
|
2098
|
+
unless %w[good neutral bad].include?(rating.downcase)
|
|
2099
|
+
return {
|
|
2100
|
+
success: false,
|
|
2101
|
+
message: "Invalid rating '#{rating}'. Must be: good, neutral, or bad",
|
|
2102
|
+
action: :none
|
|
2103
|
+
}
|
|
2104
|
+
end
|
|
2105
|
+
|
|
2106
|
+
Aidp.log_debug("repl_macros", "rate_command", rating: rating, has_comment: !comment.nil?)
|
|
2107
|
+
|
|
2108
|
+
begin
|
|
2109
|
+
require_relative "../evaluations"
|
|
2110
|
+
|
|
2111
|
+
# Capture context
|
|
2112
|
+
context_capture = Aidp::Evaluations::ContextCapture.new(project_dir: @project_dir)
|
|
2113
|
+
context = context_capture.capture(
|
|
2114
|
+
step_name: @current_step_name,
|
|
2115
|
+
iteration: @current_iteration
|
|
2116
|
+
)
|
|
2117
|
+
|
|
2118
|
+
# Create evaluation record
|
|
2119
|
+
record = Aidp::Evaluations::EvaluationRecord.new(
|
|
2120
|
+
rating: rating.downcase,
|
|
2121
|
+
comment: comment,
|
|
2122
|
+
target_type: "work_loop",
|
|
2123
|
+
context: context
|
|
2124
|
+
)
|
|
2125
|
+
|
|
2126
|
+
# Store evaluation
|
|
2127
|
+
storage = Aidp::Evaluations::EvaluationStorage.new(project_dir: @project_dir)
|
|
2128
|
+
result = storage.store(record)
|
|
2129
|
+
|
|
2130
|
+
if result[:success]
|
|
2131
|
+
rating_display = case rating.downcase
|
|
2132
|
+
when "good" then "good (+)"
|
|
2133
|
+
when "neutral" then "neutral (~)"
|
|
2134
|
+
when "bad" then "bad (-)"
|
|
2135
|
+
else rating
|
|
2136
|
+
end
|
|
2137
|
+
|
|
2138
|
+
msg_lines = ["Evaluation recorded: #{record.id}"]
|
|
2139
|
+
msg_lines << " Rating: #{rating_display}"
|
|
2140
|
+
msg_lines << " Comment: #{comment}" if comment
|
|
2141
|
+
msg_lines << ""
|
|
2142
|
+
msg_lines << "View all evaluations: aidp eval list"
|
|
2143
|
+
|
|
2144
|
+
{
|
|
2145
|
+
success: true,
|
|
2146
|
+
message: msg_lines.join("\n"),
|
|
2147
|
+
action: :evaluation_recorded,
|
|
2148
|
+
data: {id: record.id, rating: rating.downcase, comment: comment}
|
|
2149
|
+
}
|
|
2150
|
+
else
|
|
2151
|
+
{
|
|
2152
|
+
success: false,
|
|
2153
|
+
message: "Failed to store evaluation: #{result[:error]}",
|
|
2154
|
+
action: :none
|
|
2155
|
+
}
|
|
2156
|
+
end
|
|
2157
|
+
rescue ArgumentError => e
|
|
2158
|
+
{
|
|
2159
|
+
success: false,
|
|
2160
|
+
message: "Error: #{e.message}",
|
|
2161
|
+
action: :none
|
|
2162
|
+
}
|
|
2163
|
+
rescue => e
|
|
2164
|
+
Aidp.log_error("repl_macros", "rate_command_failed", error: e.message)
|
|
2165
|
+
{
|
|
2166
|
+
success: false,
|
|
2167
|
+
message: "Failed to record evaluation: #{e.message}",
|
|
2168
|
+
action: :none
|
|
2169
|
+
}
|
|
2170
|
+
end
|
|
2171
|
+
end
|
|
2172
|
+
|
|
2074
2173
|
private
|
|
2075
2174
|
|
|
2076
2175
|
# List tasks with optional status filter
|