aidp 0.33.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. checksums.yaml +4 -4
  2. data/README.md +35 -0
  3. data/lib/aidp/analyze/tree_sitter_scan.rb +3 -0
  4. data/lib/aidp/cli/eval_command.rb +399 -0
  5. data/lib/aidp/cli/harness_command.rb +1 -1
  6. data/lib/aidp/cli/security_command.rb +416 -0
  7. data/lib/aidp/cli/tools_command.rb +6 -4
  8. data/lib/aidp/cli.rb +170 -3
  9. data/lib/aidp/concurrency/exec.rb +3 -0
  10. data/lib/aidp/config.rb +113 -0
  11. data/lib/aidp/config_paths.rb +20 -0
  12. data/lib/aidp/daemon/runner.rb +8 -4
  13. data/lib/aidp/errors.rb +134 -0
  14. data/lib/aidp/evaluations/context_capture.rb +205 -0
  15. data/lib/aidp/evaluations/evaluation_record.rb +114 -0
  16. data/lib/aidp/evaluations/evaluation_storage.rb +250 -0
  17. data/lib/aidp/evaluations.rb +23 -0
  18. data/lib/aidp/execute/async_work_loop_runner.rb +4 -1
  19. data/lib/aidp/execute/interactive_repl.rb +6 -2
  20. data/lib/aidp/execute/prompt_evaluator.rb +359 -0
  21. data/lib/aidp/execute/repl_macros.rb +100 -1
  22. data/lib/aidp/execute/work_loop_runner.rb +399 -47
  23. data/lib/aidp/execute/work_loop_state.rb +4 -1
  24. data/lib/aidp/execute/workflow_selector.rb +3 -0
  25. data/lib/aidp/harness/ai_decision_engine.rb +79 -0
  26. data/lib/aidp/harness/capability_registry.rb +2 -0
  27. data/lib/aidp/harness/condition_detector.rb +3 -0
  28. data/lib/aidp/harness/config_loader.rb +3 -0
  29. data/lib/aidp/harness/enhanced_runner.rb +14 -11
  30. data/lib/aidp/harness/error_handler.rb +3 -0
  31. data/lib/aidp/harness/provider_factory.rb +3 -0
  32. data/lib/aidp/harness/provider_manager.rb +6 -0
  33. data/lib/aidp/harness/runner.rb +5 -1
  34. data/lib/aidp/harness/state/persistence.rb +3 -0
  35. data/lib/aidp/harness/state_manager.rb +3 -0
  36. data/lib/aidp/harness/status_display.rb +28 -20
  37. data/lib/aidp/harness/thinking_depth_manager.rb +32 -32
  38. data/lib/aidp/harness/ui/enhanced_tui.rb +4 -0
  39. data/lib/aidp/harness/ui/enhanced_workflow_selector.rb +4 -0
  40. data/lib/aidp/harness/ui/error_handler.rb +3 -0
  41. data/lib/aidp/harness/ui/job_monitor.rb +4 -0
  42. data/lib/aidp/harness/ui/navigation/submenu.rb +2 -0
  43. data/lib/aidp/harness/ui/navigation/workflow_selector.rb +6 -0
  44. data/lib/aidp/harness/ui/spinner_helper.rb +3 -0
  45. data/lib/aidp/harness/ui/workflow_controller.rb +3 -0
  46. data/lib/aidp/harness/user_interface.rb +3 -0
  47. data/lib/aidp/loader.rb +2 -2
  48. data/lib/aidp/logger.rb +3 -0
  49. data/lib/aidp/message_display.rb +31 -0
  50. data/lib/aidp/pr_worktree_manager.rb +18 -6
  51. data/lib/aidp/provider_manager.rb +3 -0
  52. data/lib/aidp/providers/base.rb +2 -0
  53. data/lib/aidp/security/rule_of_two_enforcer.rb +210 -0
  54. data/lib/aidp/security/secrets_proxy.rb +328 -0
  55. data/lib/aidp/security/secrets_registry.rb +227 -0
  56. data/lib/aidp/security/trifecta_state.rb +220 -0
  57. data/lib/aidp/security/watch_mode_handler.rb +306 -0
  58. data/lib/aidp/security/work_loop_adapter.rb +277 -0
  59. data/lib/aidp/security.rb +56 -0
  60. data/lib/aidp/setup/wizard.rb +4 -2
  61. data/lib/aidp/version.rb +1 -1
  62. data/lib/aidp/watch/auto_merger.rb +274 -0
  63. data/lib/aidp/watch/auto_pr_processor.rb +125 -7
  64. data/lib/aidp/watch/build_processor.rb +16 -1
  65. data/lib/aidp/watch/change_request_processor.rb +680 -286
  66. data/lib/aidp/watch/ci_fix_processor.rb +262 -4
  67. data/lib/aidp/watch/feedback_collector.rb +191 -0
  68. data/lib/aidp/watch/hierarchical_pr_strategy.rb +256 -0
  69. data/lib/aidp/watch/implementation_verifier.rb +142 -1
  70. data/lib/aidp/watch/plan_generator.rb +70 -13
  71. data/lib/aidp/watch/plan_processor.rb +12 -5
  72. data/lib/aidp/watch/projects_processor.rb +286 -0
  73. data/lib/aidp/watch/repository_client.rb +861 -53
  74. data/lib/aidp/watch/review_processor.rb +33 -6
  75. data/lib/aidp/watch/runner.rb +51 -11
  76. data/lib/aidp/watch/state_store.rb +233 -0
  77. data/lib/aidp/watch/sub_issue_creator.rb +221 -0
  78. data/lib/aidp/workflows/guided_agent.rb +4 -0
  79. data/lib/aidp/workstream_executor.rb +3 -0
  80. data/lib/aidp/worktree.rb +61 -11
  81. data/lib/aidp/worktree_branch_manager.rb +347 -101
  82. data/templates/implementation/iterative_implementation.md +46 -3
  83. metadata +20 -1
@@ -0,0 +1,359 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../harness/ai_decision_engine"
4
+
5
+ module Aidp
6
+ module Execute
7
+ # Evaluates prompt effectiveness using ZFC after multiple iterations
8
+ #
9
+ # FIX for issue #391: When the work loop reaches 10+ iterations without completion,
10
+ # this evaluator assesses prompt quality and suggests improvements.
11
+ #
12
+ # Uses Zero Framework Cognition (ZFC) to analyze:
13
+ # - Whether the prompt clearly defines completion criteria
14
+ # - If task breakdown instructions are adequate
15
+ # - Whether the agent has sufficient context
16
+ # - If there are blockers preventing progress
17
+ #
18
+ # @example
19
+ # evaluator = PromptEvaluator.new(config)
20
+ # result = evaluator.evaluate(
21
+ # prompt_content: prompt_manager.read,
22
+ # iteration_count: 12,
23
+ # task_summary: persistent_tasklist.summary,
24
+ # recent_failures: all_results
25
+ # )
26
+ # # => { effective: false, issues: [...], suggestions: [...] }
27
+ #
28
+ class PromptEvaluator
29
+ # Threshold for triggering evaluation
30
+ EVALUATION_ITERATION_THRESHOLD = 10
31
+
32
+ # Re-evaluate periodically after threshold
33
+ EVALUATION_INTERVAL = 5
34
+
35
+ # Expose for testability
36
+ attr_reader :ai_decision_engine
37
+
38
+ def initialize(config, ai_decision_engine: nil)
39
+ @config = config
40
+ @ai_decision_engine = ai_decision_engine || safely_build_ai_decision_engine
41
+ end
42
+
43
+ # Safely build AIDecisionEngine, returning nil if config doesn't support it
44
+ # This allows tests with mock configs to work without AI calls
45
+ def safely_build_ai_decision_engine
46
+ # Check if config supports the methods AIDecisionEngine needs
47
+ return nil unless @config.respond_to?(:default_provider)
48
+
49
+ build_default_ai_decision_engine
50
+ rescue => e
51
+ Aidp.log_debug("prompt_evaluator", "skipping_ai_decision_engine",
52
+ reason: e.message)
53
+ nil
54
+ end
55
+
56
+ # Check if evaluation should be triggered based on iteration count
57
+ # @param iteration_count [Integer] Current iteration number
58
+ # @return [Boolean]
59
+ def should_evaluate?(iteration_count)
60
+ return false unless iteration_count >= EVALUATION_ITERATION_THRESHOLD
61
+
62
+ # Evaluate at threshold and every EVALUATION_INTERVAL after
63
+ (iteration_count - EVALUATION_ITERATION_THRESHOLD) % EVALUATION_INTERVAL == 0
64
+ end
65
+
66
+ # Evaluate prompt effectiveness
67
+ # @param prompt_content [String] Current PROMPT.md content
68
+ # @param iteration_count [Integer] Current iteration number
69
+ # @param task_summary [Hash] Summary of task statuses
70
+ # @param recent_failures [Hash] Recent test/lint failures
71
+ # @param step_name [String] Name of current step
72
+ # @return [Hash] Evaluation result with :effective, :issues, :suggestions
73
+ def evaluate(prompt_content:, iteration_count:, task_summary:, recent_failures:, step_name: nil)
74
+ Aidp.log_debug("prompt_evaluator", "starting_evaluation",
75
+ iteration: iteration_count,
76
+ step: step_name,
77
+ prompt_size: prompt_content&.length || 0)
78
+
79
+ # When AI decision engine is unavailable (e.g., in tests with mock configs),
80
+ # return a neutral result that doesn't trigger feedback appending
81
+ unless @ai_decision_engine
82
+ Aidp.log_debug("prompt_evaluator", "skipping_evaluation_no_ai_engine")
83
+ return {
84
+ effective: true, # Assume effective to avoid unnecessary feedback
85
+ issues: [],
86
+ suggestions: [],
87
+ likely_blockers: [],
88
+ recommended_actions: [],
89
+ confidence: 0.0,
90
+ skipped: true,
91
+ skip_reason: "AI decision engine not available"
92
+ }
93
+ end
94
+
95
+ prompt = build_evaluation_prompt(
96
+ prompt_content: prompt_content,
97
+ iteration_count: iteration_count,
98
+ task_summary: task_summary,
99
+ recent_failures: recent_failures
100
+ )
101
+
102
+ schema = {
103
+ type: "object",
104
+ properties: {
105
+ effective: {
106
+ type: "boolean",
107
+ description: "True if the prompt is likely to lead to completion within a few more iterations"
108
+ },
109
+ issues: {
110
+ type: "array",
111
+ items: {type: "string"},
112
+ description: "Specific problems identified with the current prompt"
113
+ },
114
+ suggestions: {
115
+ type: "array",
116
+ items: {type: "string"},
117
+ description: "Actionable suggestions to improve prompt effectiveness"
118
+ },
119
+ likely_blockers: {
120
+ type: "array",
121
+ items: {type: "string"},
122
+ description: "Potential blockers preventing progress"
123
+ },
124
+ recommended_actions: {
125
+ type: "array",
126
+ items: {
127
+ type: "object",
128
+ properties: {
129
+ action: {type: "string"},
130
+ priority: {type: "string", enum: ["high", "medium", "low"]},
131
+ rationale: {type: "string"}
132
+ }
133
+ },
134
+ description: "Specific actions to take, prioritized"
135
+ },
136
+ confidence: {
137
+ type: "number",
138
+ minimum: 0.0,
139
+ maximum: 1.0,
140
+ description: "Confidence in this assessment"
141
+ }
142
+ },
143
+ required: ["effective", "issues", "suggestions", "confidence"]
144
+ }
145
+
146
+ begin
147
+ result = @ai_decision_engine.decide(
148
+ :prompt_evaluation,
149
+ context: {prompt: prompt},
150
+ schema: schema,
151
+ tier: :mini,
152
+ cache_ttl: nil # Each evaluation is context-specific
153
+ )
154
+
155
+ Aidp.log_info("prompt_evaluator", "evaluation_complete",
156
+ iteration: iteration_count,
157
+ effective: result[:effective],
158
+ issue_count: result[:issues]&.size || 0,
159
+ confidence: result[:confidence])
160
+
161
+ result
162
+ rescue => e
163
+ Aidp.log_error("prompt_evaluator", "evaluation_failed",
164
+ error: e.message,
165
+ error_class: e.class.name)
166
+
167
+ build_fallback_result("Evaluation failed: #{e.message}")
168
+ end
169
+ end
170
+
171
+ # Generate improvement recommendations for the prompt template
172
+ # Used for AGD pattern - generating improved templates based on evaluation
173
+ # @param evaluation_result [Hash] Result from evaluate()
174
+ # @param original_template [String] The original template content
175
+ # @return [Hash] Template improvements
176
+ def generate_template_improvements(evaluation_result:, original_template:)
177
+ return nil unless @ai_decision_engine
178
+
179
+ Aidp.log_debug("prompt_evaluator", "generating_template_improvements",
180
+ issue_count: evaluation_result[:issues]&.size || 0)
181
+
182
+ prompt = build_improvement_prompt(evaluation_result, original_template)
183
+
184
+ schema = {
185
+ type: "object",
186
+ properties: {
187
+ improved_sections: {
188
+ type: "array",
189
+ items: {
190
+ type: "object",
191
+ properties: {
192
+ section_name: {type: "string"},
193
+ original: {type: "string"},
194
+ improved: {type: "string"},
195
+ rationale: {type: "string"}
196
+ }
197
+ }
198
+ },
199
+ additional_sections: {
200
+ type: "array",
201
+ items: {
202
+ type: "object",
203
+ properties: {
204
+ section_name: {type: "string"},
205
+ content: {type: "string"},
206
+ rationale: {type: "string"}
207
+ }
208
+ }
209
+ },
210
+ completion_criteria_improvements: {
211
+ type: "array",
212
+ items: {type: "string"},
213
+ description: "Specific improvements to completion criteria definitions"
214
+ }
215
+ },
216
+ required: ["improved_sections", "completion_criteria_improvements"]
217
+ }
218
+
219
+ @ai_decision_engine.decide(
220
+ :template_improvement,
221
+ context: {prompt: prompt},
222
+ schema: schema,
223
+ tier: :standard, # Use standard tier for more thoughtful improvements
224
+ cache_ttl: nil
225
+ )
226
+ rescue => e
227
+ Aidp.log_error("prompt_evaluator", "template_improvement_failed",
228
+ error: e.message)
229
+ nil
230
+ end
231
+
232
+ private
233
+
234
+ def build_evaluation_prompt(prompt_content:, iteration_count:, task_summary:, recent_failures:)
235
+ <<~PROMPT
236
+ You are evaluating the effectiveness of a work loop prompt that has been running for #{iteration_count} iterations without completion.
237
+
238
+ ## Current Prompt Content
239
+ #{truncate_content(prompt_content, 8000)}
240
+
241
+ ## Task Summary
242
+ #{format_task_summary(task_summary)}
243
+
244
+ ## Recent Check Results
245
+ #{format_failures(recent_failures)}
246
+
247
+ ## Evaluation Criteria
248
+
249
+ Analyze why this prompt may not be leading to completion:
250
+
251
+ 1. **Clarity of Goals**: Are the implementation requirements clearly defined?
252
+ 2. **Task Breakdown**: Does the prompt guide proper task decomposition?
253
+ 3. **Completion Criteria**: Are the completion criteria specific and achievable?
254
+ 4. **Context Sufficiency**: Does the agent have enough context to proceed?
255
+ 5. **Blockers**: Are there technical blockers or missing information?
256
+ 6. **Scope**: Is the scope realistic for an AI agent to complete?
257
+
258
+ ## Your Assessment
259
+
260
+ Provide:
261
+ - Whether this prompt is likely effective (true/false)
262
+ - Specific issues with the current prompt
263
+ - Actionable suggestions for improvement
264
+ - Likely blockers preventing progress
265
+ - Prioritized recommended actions
266
+ - Your confidence in this assessment (0.0-1.0)
267
+
268
+ Be specific and actionable. Focus on what can be changed to achieve completion.
269
+ PROMPT
270
+ end
271
+
272
+ def build_improvement_prompt(evaluation_result, original_template)
273
+ <<~PROMPT
274
+ Based on the following prompt evaluation, suggest improvements to the template.
275
+
276
+ ## Evaluation Results
277
+ - Effective: #{evaluation_result[:effective]}
278
+ - Issues: #{(evaluation_result[:issues] || []).join(", ")}
279
+ - Suggestions: #{(evaluation_result[:suggestions] || []).join(", ")}
280
+
281
+ ## Original Template
282
+ #{truncate_content(original_template, 4000)}
283
+
284
+ ## Your Task
285
+
286
+ Suggest specific improvements to make the template more effective:
287
+ 1. Identify sections that need improvement
288
+ 2. Propose new sections if needed
289
+ 3. Focus especially on completion criteria clarity
290
+ 4. Ensure task breakdown instructions are explicit
291
+ 5. Add guidance for common failure modes
292
+
293
+ Be specific - provide actual text that could replace or supplement the template.
294
+ PROMPT
295
+ end
296
+
297
+ def format_task_summary(task_summary)
298
+ return "_No task summary available_" if task_summary.nil? || task_summary.empty?
299
+
300
+ if task_summary.is_a?(Hash)
301
+ parts = []
302
+ parts << "Total: #{task_summary[:total] || 0}"
303
+ parts << "Done: #{task_summary[:done] || 0}"
304
+ parts << "In Progress: #{task_summary[:in_progress] || 0}"
305
+ parts << "Pending: #{task_summary[:pending] || 0}"
306
+ parts << "Abandoned: #{task_summary[:abandoned] || 0}"
307
+ parts.join(" | ")
308
+ else
309
+ task_summary.to_s
310
+ end
311
+ end
312
+
313
+ def format_failures(recent_failures)
314
+ return "_No recent failures_" if recent_failures.nil? || recent_failures.empty?
315
+
316
+ parts = []
317
+ recent_failures.each do |check_type, result|
318
+ next unless result.is_a?(Hash)
319
+
320
+ status = result[:success] ? "✅ passed" : "❌ failed"
321
+ parts << "- #{check_type}: #{status}"
322
+
323
+ if !result[:success] && result[:failures]
324
+ failures = result[:failures].take(3)
325
+ failures.each { |f| parts << " - #{truncate_content(f.to_s, 200)}" }
326
+ end
327
+ end
328
+
329
+ parts.empty? ? "_No failures to report_" : parts.join("\n")
330
+ end
331
+
332
+ def truncate_content(content, max_length)
333
+ return "_No content_" if content.nil? || content.empty?
334
+ return content if content.length <= max_length
335
+
336
+ "#{content[0, max_length]}\n\n[... truncated, showing first #{max_length} characters ...]"
337
+ end
338
+
339
+ def build_fallback_result(reason)
340
+ {
341
+ effective: nil,
342
+ issues: ["Unable to evaluate: #{reason}"],
343
+ suggestions: ["Check AI configuration and try again"],
344
+ likely_blockers: [],
345
+ recommended_actions: [],
346
+ confidence: 0.0
347
+ }
348
+ end
349
+
350
+ def build_default_ai_decision_engine
351
+ Aidp::Harness::AIDecisionEngine.new(@config)
352
+ rescue => e
353
+ Aidp.log_warn("prompt_evaluator", "failed_to_create_ai_decision_engine",
354
+ error: e.message)
355
+ nil
356
+ end
357
+ end
358
+ end
359
+ end
@@ -9,7 +9,9 @@ module Aidp
9
9
  # - /split - Divide work into smaller contracts
10
10
  # - /halt-on <pattern> - Pause on specific test failures
11
11
  class ReplMacros
12
- attr_reader :pinned_files, :focus_patterns, :halt_patterns, :split_mode, :current_workstream, :current_skill
12
+ attr_reader :pinned_files, :focus_patterns, :halt_patterns, :split_mode, :current_workstream
13
+ # Expose current_skill for testability
14
+ attr_accessor :current_skill
13
15
 
14
16
  def initialize(project_dir: Dir.pwd)
15
17
  @pinned_files = Set.new
@@ -299,6 +301,12 @@ module Aidp
299
301
  usage: "/tasks <list|show|done|abandon|stats> [args]",
300
302
  example: "/tasks list pending",
301
303
  handler: method(:cmd_tasks)
304
+ },
305
+ "/rate" => {
306
+ description: "Rate the current output (good/neutral/bad)",
307
+ usage: "/rate <good|neutral|bad> [comment]",
308
+ example: "/rate good 'Clean code generated'",
309
+ handler: method(:cmd_rate)
302
310
  }
303
311
  }
304
312
  end
@@ -2071,6 +2079,97 @@ module Aidp
2071
2079
  {success: false, message: "Error: #{e.message}", action: :none}
2072
2080
  end
2073
2081
 
2082
+ # Command: /rate <good|neutral|bad> [comment]
2083
+ # Rate the current output with optional comment
2084
+ def cmd_rate(args)
2085
+ rating = args.shift
2086
+ comment = args.join(" ")
2087
+ comment = nil if comment.empty?
2088
+
2089
+ unless rating
2090
+ return {
2091
+ success: false,
2092
+ message: "Usage: /rate <good|neutral|bad> [comment]\n\nExamples:\n /rate good\n /rate bad 'Generated code had bugs'\n /rate neutral 'Acceptable but could be better'",
2093
+ action: :none
2094
+ }
2095
+ end
2096
+
2097
+ # Validate rating
2098
+ unless %w[good neutral bad].include?(rating.downcase)
2099
+ return {
2100
+ success: false,
2101
+ message: "Invalid rating '#{rating}'. Must be: good, neutral, or bad",
2102
+ action: :none
2103
+ }
2104
+ end
2105
+
2106
+ Aidp.log_debug("repl_macros", "rate_command", rating: rating, has_comment: !comment.nil?)
2107
+
2108
+ begin
2109
+ require_relative "../evaluations"
2110
+
2111
+ # Capture context
2112
+ context_capture = Aidp::Evaluations::ContextCapture.new(project_dir: @project_dir)
2113
+ context = context_capture.capture(
2114
+ step_name: @current_step_name,
2115
+ iteration: @current_iteration
2116
+ )
2117
+
2118
+ # Create evaluation record
2119
+ record = Aidp::Evaluations::EvaluationRecord.new(
2120
+ rating: rating.downcase,
2121
+ comment: comment,
2122
+ target_type: "work_loop",
2123
+ context: context
2124
+ )
2125
+
2126
+ # Store evaluation
2127
+ storage = Aidp::Evaluations::EvaluationStorage.new(project_dir: @project_dir)
2128
+ result = storage.store(record)
2129
+
2130
+ if result[:success]
2131
+ rating_display = case rating.downcase
2132
+ when "good" then "good (+)"
2133
+ when "neutral" then "neutral (~)"
2134
+ when "bad" then "bad (-)"
2135
+ else rating
2136
+ end
2137
+
2138
+ msg_lines = ["Evaluation recorded: #{record.id}"]
2139
+ msg_lines << " Rating: #{rating_display}"
2140
+ msg_lines << " Comment: #{comment}" if comment
2141
+ msg_lines << ""
2142
+ msg_lines << "View all evaluations: aidp eval list"
2143
+
2144
+ {
2145
+ success: true,
2146
+ message: msg_lines.join("\n"),
2147
+ action: :evaluation_recorded,
2148
+ data: {id: record.id, rating: rating.downcase, comment: comment}
2149
+ }
2150
+ else
2151
+ {
2152
+ success: false,
2153
+ message: "Failed to store evaluation: #{result[:error]}",
2154
+ action: :none
2155
+ }
2156
+ end
2157
+ rescue ArgumentError => e
2158
+ {
2159
+ success: false,
2160
+ message: "Error: #{e.message}",
2161
+ action: :none
2162
+ }
2163
+ rescue => e
2164
+ Aidp.log_error("repl_macros", "rate_command_failed", error: e.message)
2165
+ {
2166
+ success: false,
2167
+ message: "Failed to record evaluation: #{e.message}",
2168
+ action: :none
2169
+ }
2170
+ end
2171
+ end
2172
+
2074
2173
  private
2075
2174
 
2076
2175
  # List tasks with optional status filter