aidp 0.32.0 → 0.34.0
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +35 -0
- data/lib/aidp/analyze/feature_analyzer.rb +322 -320
- data/lib/aidp/analyze/tree_sitter_scan.rb +3 -0
- data/lib/aidp/auto_update/coordinator.rb +97 -7
- data/lib/aidp/auto_update.rb +0 -12
- data/lib/aidp/cli/devcontainer_commands.rb +0 -5
- data/lib/aidp/cli/eval_command.rb +399 -0
- data/lib/aidp/cli/harness_command.rb +1 -1
- data/lib/aidp/cli/security_command.rb +416 -0
- data/lib/aidp/cli/tools_command.rb +6 -4
- data/lib/aidp/cli.rb +172 -4
- data/lib/aidp/comment_consolidator.rb +78 -0
- data/lib/aidp/concurrency/exec.rb +3 -0
- data/lib/aidp/concurrency.rb +0 -3
- data/lib/aidp/config.rb +113 -1
- data/lib/aidp/config_paths.rb +91 -0
- data/lib/aidp/daemon/runner.rb +8 -4
- data/lib/aidp/errors.rb +134 -0
- data/lib/aidp/evaluations/context_capture.rb +205 -0
- data/lib/aidp/evaluations/evaluation_record.rb +114 -0
- data/lib/aidp/evaluations/evaluation_storage.rb +250 -0
- data/lib/aidp/evaluations.rb +23 -0
- data/lib/aidp/execute/async_work_loop_runner.rb +4 -1
- data/lib/aidp/execute/interactive_repl.rb +6 -2
- data/lib/aidp/execute/prompt_evaluator.rb +359 -0
- data/lib/aidp/execute/repl_macros.rb +100 -1
- data/lib/aidp/execute/work_loop_runner.rb +719 -58
- data/lib/aidp/execute/work_loop_state.rb +4 -1
- data/lib/aidp/execute/workflow_selector.rb +3 -0
- data/lib/aidp/harness/ai_decision_engine.rb +79 -0
- data/lib/aidp/harness/ai_filter_factory.rb +285 -0
- data/lib/aidp/harness/capability_registry.rb +2 -0
- data/lib/aidp/harness/condition_detector.rb +3 -0
- data/lib/aidp/harness/config_loader.rb +3 -0
- data/lib/aidp/harness/config_schema.rb +97 -1
- data/lib/aidp/harness/config_validator.rb +1 -1
- data/lib/aidp/harness/configuration.rb +61 -5
- data/lib/aidp/harness/enhanced_runner.rb +14 -11
- data/lib/aidp/harness/error_handler.rb +3 -0
- data/lib/aidp/harness/filter_definition.rb +212 -0
- data/lib/aidp/harness/generated_filter_strategy.rb +197 -0
- data/lib/aidp/harness/output_filter.rb +50 -25
- data/lib/aidp/harness/output_filter_config.rb +129 -0
- data/lib/aidp/harness/provider_factory.rb +3 -0
- data/lib/aidp/harness/provider_manager.rb +96 -2
- data/lib/aidp/harness/runner.rb +5 -12
- data/lib/aidp/harness/state/persistence.rb +3 -0
- data/lib/aidp/harness/state_manager.rb +3 -0
- data/lib/aidp/harness/status_display.rb +28 -20
- data/lib/aidp/harness/test_runner.rb +179 -41
- data/lib/aidp/harness/thinking_depth_manager.rb +44 -28
- data/lib/aidp/harness/ui/enhanced_tui.rb +4 -0
- data/lib/aidp/harness/ui/enhanced_workflow_selector.rb +4 -0
- data/lib/aidp/harness/ui/error_handler.rb +3 -0
- data/lib/aidp/harness/ui/job_monitor.rb +4 -0
- data/lib/aidp/harness/ui/navigation/submenu.rb +2 -2
- data/lib/aidp/harness/ui/navigation/workflow_selector.rb +6 -0
- data/lib/aidp/harness/ui/spinner_helper.rb +3 -0
- data/lib/aidp/harness/ui/workflow_controller.rb +3 -0
- data/lib/aidp/harness/user_interface.rb +3 -0
- data/lib/aidp/loader.rb +195 -0
- data/lib/aidp/logger.rb +3 -0
- data/lib/aidp/message_display.rb +31 -0
- data/lib/aidp/metadata/compiler.rb +29 -17
- data/lib/aidp/metadata/query.rb +1 -1
- data/lib/aidp/metadata/scanner.rb +8 -1
- data/lib/aidp/metadata/tool_metadata.rb +13 -13
- data/lib/aidp/metadata/validator.rb +10 -0
- data/lib/aidp/metadata.rb +16 -0
- data/lib/aidp/pr_worktree_manager.rb +20 -8
- data/lib/aidp/provider_manager.rb +4 -7
- data/lib/aidp/providers/base.rb +2 -0
- data/lib/aidp/security/rule_of_two_enforcer.rb +210 -0
- data/lib/aidp/security/secrets_proxy.rb +328 -0
- data/lib/aidp/security/secrets_registry.rb +227 -0
- data/lib/aidp/security/trifecta_state.rb +220 -0
- data/lib/aidp/security/watch_mode_handler.rb +306 -0
- data/lib/aidp/security/work_loop_adapter.rb +277 -0
- data/lib/aidp/security.rb +56 -0
- data/lib/aidp/setup/wizard.rb +283 -11
- data/lib/aidp/skills.rb +0 -5
- data/lib/aidp/storage/csv_storage.rb +3 -0
- data/lib/aidp/style_guide/selector.rb +360 -0
- data/lib/aidp/tooling_detector.rb +283 -16
- data/lib/aidp/version.rb +1 -1
- data/lib/aidp/watch/auto_merger.rb +274 -0
- data/lib/aidp/watch/auto_pr_processor.rb +125 -7
- data/lib/aidp/watch/build_processor.rb +16 -1
- data/lib/aidp/watch/change_request_processor.rb +682 -150
- data/lib/aidp/watch/ci_fix_processor.rb +262 -4
- data/lib/aidp/watch/feedback_collector.rb +191 -0
- data/lib/aidp/watch/hierarchical_pr_strategy.rb +256 -0
- data/lib/aidp/watch/implementation_verifier.rb +142 -1
- data/lib/aidp/watch/plan_generator.rb +70 -13
- data/lib/aidp/watch/plan_processor.rb +12 -5
- data/lib/aidp/watch/projects_processor.rb +286 -0
- data/lib/aidp/watch/repository_client.rb +871 -22
- data/lib/aidp/watch/review_processor.rb +33 -6
- data/lib/aidp/watch/runner.rb +80 -29
- data/lib/aidp/watch/state_store.rb +233 -0
- data/lib/aidp/watch/sub_issue_creator.rb +221 -0
- data/lib/aidp/watch.rb +5 -7
- data/lib/aidp/workflows/guided_agent.rb +4 -0
- data/lib/aidp/workstream_cleanup.rb +0 -2
- data/lib/aidp/workstream_executor.rb +3 -4
- data/lib/aidp/worktree.rb +61 -12
- data/lib/aidp/worktree_branch_manager.rb +347 -101
- data/lib/aidp.rb +21 -106
- data/templates/implementation/iterative_implementation.md +46 -3
- metadata +91 -36
- data/lib/aidp/config/paths.rb +0 -131
data/lib/aidp/evaluations/evaluation_storage.rb

```diff
@@ -0,0 +1,250 @@
+# frozen_string_literal: true
+
+require "json"
+require "fileutils"
+require_relative "evaluation_record"
+require_relative "../config_paths"
+require_relative "../rescue_logging"
+
+module Aidp
+  module Evaluations
+    # Storage manager for evaluation records
+    #
+    # Stores evaluations in `.aidp/evaluations/` with append-only semantics:
+    # - Individual evaluations stored as JSON files: `eval_YYYYMMDD_HHMMSS_xxxx.json`
+    # - Indexed summary file for efficient lookups: `index.json`
+    #
+    # @example Storing an evaluation
+    #   storage = EvaluationStorage.new(project_dir: Dir.pwd)
+    #   storage.store(record)
+    #
+    # @example Listing evaluations
+    #   storage.list(limit: 10)
+    #   storage.list(rating: "bad")
+    class EvaluationStorage
+      include Aidp::RescueLogging
+
+      def initialize(project_dir: Dir.pwd)
+        @project_dir = project_dir
+        @evaluations_dir = ConfigPaths.evaluations_dir(project_dir)
+        @index_file = ConfigPaths.evaluations_index_file(project_dir)
+
+        Aidp.log_debug("evaluation_storage", "initialize",
+          project_dir: project_dir, evaluations_dir: @evaluations_dir)
+      end
+
+      # Store a new evaluation record
+      #
+      # @param record [EvaluationRecord] The evaluation to store
+      # @return [Hash] Result with :success and :id keys
+      def store(record)
+        ensure_directory
+        file_path = File.join(@evaluations_dir, "#{record.id}.json")
+
+        Aidp.log_debug("evaluation_storage", "store",
+          id: record.id, rating: record.rating, file_path: file_path)
+
+        File.write(file_path, JSON.pretty_generate(record.to_h))
+        update_index(record)
+
+        {success: true, id: record.id, file_path: file_path}
+      rescue => error
+        log_rescue(error,
+          component: "evaluation_storage",
+          action: "store",
+          fallback: {success: false},
+          id: record.id)
+        {success: false, error: error.message, id: record.id}
+      end
+
+      # Load a specific evaluation by ID
+      #
+      # @param id [String] The evaluation ID
+      # @return [EvaluationRecord, nil] The record or nil if not found
+      def load(id)
+        file_path = File.join(@evaluations_dir, "#{id}.json")
+        return nil unless File.exist?(file_path)
+
+        Aidp.log_debug("evaluation_storage", "load", id: id)
+
+        data = JSON.parse(File.read(file_path))
+        EvaluationRecord.from_h(data)
+      rescue => error
+        log_rescue(error,
+          component: "evaluation_storage",
+          action: "load",
+          fallback: nil,
+          id: id)
+        nil
+      end
+
+      # List evaluations with optional filtering
+      #
+      # @param limit [Integer] Maximum number of records to return
+      # @param rating [String, nil] Filter by rating (good/neutral/bad)
+      # @param target_type [String, nil] Filter by target type
+      # @return [Array<EvaluationRecord>] Matching records, newest first
+      def list(limit: 50, rating: nil, target_type: nil)
+        Aidp.log_debug("evaluation_storage", "list",
+          limit: limit, rating: rating, target_type: target_type)
+
+        index = load_index
+        entries = index[:entries] || []
+
+        # Apply filters
+        entries = entries.select { |e| e[:rating] == rating } if rating
+        entries = entries.select { |e| e[:target_type] == target_type } if target_type
+
+        # Sort by created_at descending, take limit
+        entries = entries.sort_by { |e| e[:created_at] || "" }.reverse.take(limit)
+
+        # Load full records
+        entries.filter_map { |entry| load(entry[:id]) }
+      rescue => error
+        log_rescue(error,
+          component: "evaluation_storage",
+          action: "list",
+          fallback: [],
+          limit: limit)
+        []
+      end
+
+      # Get statistics about evaluations
+      #
+      # @return [Hash] Statistics including counts by rating
+      def stats
+        Aidp.log_debug("evaluation_storage", "stats")
+
+        index = load_index
+        entries = index[:entries] || []
+
+        total = entries.size
+        by_rating = entries.group_by { |e| e[:rating] }
+        by_target = entries.group_by { |e| e[:target_type] }
+
+        {
+          total: total,
+          by_rating: {
+            good: (by_rating["good"] || []).size,
+            neutral: (by_rating["neutral"] || []).size,
+            bad: (by_rating["bad"] || []).size
+          },
+          by_target_type: by_target.transform_values(&:size),
+          first_evaluation: entries.min_by { |e| e[:created_at] || "" }&.dig(:created_at),
+          last_evaluation: entries.max_by { |e| e[:created_at] || "" }&.dig(:created_at)
+        }
+      rescue => error
+        log_rescue(error,
+          component: "evaluation_storage",
+          action: "stats",
+          fallback: {total: 0, by_rating: {good: 0, neutral: 0, bad: 0}})
+        {total: 0, by_rating: {good: 0, neutral: 0, bad: 0}, by_target_type: {}}
+      end
+
+      # Delete an evaluation by ID
+      #
+      # @param id [String] The evaluation ID
+      # @return [Hash] Result with :success key
+      def delete(id)
+        file_path = File.join(@evaluations_dir, "#{id}.json")
+        return {success: true, message: "Evaluation not found"} unless File.exist?(file_path)
+
+        Aidp.log_debug("evaluation_storage", "delete", id: id)
+
+        File.delete(file_path)
+        remove_from_index(id)
+
+        {success: true, id: id}
+      rescue => error
+        log_rescue(error,
+          component: "evaluation_storage",
+          action: "delete",
+          fallback: {success: false},
+          id: id)
+        {success: false, error: error.message}
+      end
+
+      # Clear all evaluations
+      #
+      # @return [Hash] Result with :success and :count keys
+      def clear
+        Aidp.log_debug("evaluation_storage", "clear")
+
+        return {success: true, count: 0} unless Dir.exist?(@evaluations_dir)
+
+        count = Dir.glob(File.join(@evaluations_dir, "eval_*.json")).size
+        FileUtils.rm_rf(@evaluations_dir)
+
+        {success: true, count: count}
+      rescue => error
+        log_rescue(error,
+          component: "evaluation_storage",
+          action: "clear",
+          fallback: {success: false})
+        {success: false, error: error.message}
+      end
+
+      # Check if evaluations directory exists and has evaluations
+      def any?
+        Dir.exist?(@evaluations_dir) && Dir.glob(File.join(@evaluations_dir, "eval_*.json")).any?
+      end
+
+      private
+
+      def ensure_directory
+        ConfigPaths.ensure_evaluations_dir(@project_dir)
+      end
+
+      def load_index
+        return {entries: []} unless File.exist?(@index_file)
+
+        data = JSON.parse(File.read(@index_file))
+        symbolize_index(data)
+      rescue
+        {entries: []}
+      end
+
+      def update_index(record)
+        index = load_index
+        index[:entries] ||= []
+
+        # Add new entry to index (stores minimal data for quick lookups)
+        index[:entries] << {
+          id: record.id,
+          rating: record.rating,
+          target_type: record.target_type,
+          target_id: record.target_id,
+          created_at: record.created_at
+        }
+
+        index[:updated_at] = Time.now.iso8601
+
+        File.write(@index_file, JSON.pretty_generate(index))
+      end
+
+      def remove_from_index(id)
+        index = load_index
+        index[:entries]&.reject! { |e| e[:id] == id }
+        index[:updated_at] = Time.now.iso8601
+
+        File.write(@index_file, JSON.pretty_generate(index))
+      end
+
+      def symbolize_index(data)
+        return data unless data.is_a?(Hash)
+        result = {}
+        data.each do |key, value|
+          sym_key = key.is_a?(String) ? key.to_sym : key
+          result[sym_key] = if value.is_a?(Array)
+            value.map { |v| v.is_a?(Hash) ? symbolize_index(v) : v }
+          elsif value.is_a?(Hash)
+            symbolize_index(value)
+          else
+            value
+          end
+        end
+        result
+      end
+    end
+  end
+end
```
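Together with `EvaluationRecord` (also added in this release, not shown in this view), the class gives a JSON-file-per-record store with a summary index. A minimal usage sketch, not part of the diff: it assumes `EvaluationRecord.new` accepts the keywords shown in the `@example` docs and that records expose the `id`, `rating`, `target_type`, `target_id`, and `created_at` readers the index code relies on.

```ruby
require "aidp/evaluations"

# Sketch only: round-trips a record through the new store.
storage = Aidp::Evaluations::EvaluationStorage.new(project_dir: Dir.pwd)

record = Aidp::Evaluations::EvaluationRecord.new(
  rating: "good",
  comment: "Clean code generated",
  target_type: "work_unit"
)

storage.store(record)                  # => {success: true, id: "eval_...", file_path: "..."}
storage.list(rating: "good", limit: 5) # newest-first Array<EvaluationRecord>
storage.stats                          # => {total: 1, by_rating: {good: 1, neutral: 0, bad: 0}, ...}
storage.delete(record.id)              # removes the file and its index entry
```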
data/lib/aidp/evaluations.rb

```diff
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+require_relative "evaluations/evaluation_record"
+require_relative "evaluations/evaluation_storage"
+require_relative "evaluations/context_capture"
+
+module Aidp
+  # Evaluation and feedback system for AIDP outputs
+  #
+  # Enables users to rate generated outputs (prompts, work units, work loops)
+  # as good, neutral, or bad while capturing rich execution context.
+  #
+  # @example Creating and storing an evaluation
+  #   record = Aidp::Evaluations::EvaluationRecord.new(
+  #     rating: "good",
+  #     comment: "Clean code generated",
+  #     target_type: "work_unit"
+  #   )
+  #   storage = Aidp::Evaluations::EvaluationStorage.new
+  #   storage.store(record)
+  module Evaluations
+  end
+end
```
data/lib/aidp/execute/async_work_loop_runner.rb

```diff
@@ -21,6 +21,9 @@ module Aidp
 
       attr_reader :state, :instruction_queue, :work_thread
 
+      # Expose sync_runner for testability
+      attr_accessor :sync_runner
+
       def initialize(project_dir, provider_manager, config, options = {})
         @project_dir = project_dir
         @provider_manager = provider_manager
@@ -175,7 +178,7 @@ module Aidp
       def save_cancellation_checkpoint
         return unless @sync_runner
 
-        checkpoint = @sync_runner.
+        checkpoint = @sync_runner.checkpoint
         return unless checkpoint
 
         checkpoint.record_checkpoint(
```
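The second hunk changes what `save_cancellation_checkpoint` reads from the sync runner (the old call's receiver method is cut off in this diff view), and the first makes `@sync_runner` injectable. An RSpec-style sketch of the test this enables; the constructor arguments and the double's interface are assumptions drawn only from the hunks above, not from aidp's actual test suite:

```ruby
# Hypothetical sketch: the sync runner double only needs to answer
# #checkpoint, per the fixed line above. provider_manager_double and
# config_double are assumed defined elsewhere (e.g. with let).
RSpec.describe Aidp::Execute::AsyncWorkLoopRunner do
  it "records a checkpoint from the injected sync runner" do
    runner = described_class.new("/tmp/project", provider_manager_double, config_double)

    checkpoint = double("checkpoint", record_checkpoint: nil)
    runner.sync_runner = double("sync_runner", checkpoint: checkpoint)

    runner.send(:save_cancellation_checkpoint) # send in case the method is private

    expect(checkpoint).to have_received(:record_checkpoint)
  end
end
```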
data/lib/aidp/execute/interactive_repl.rb

```diff
@@ -23,6 +23,10 @@ module Aidp
     class InteractiveRepl
       include Aidp::RescueLogging
 
+      # Expose running state and repl_macros for testability
+      attr_accessor :running
+      attr_reader :repl_macros, :async_runner, :completion_setup_needed, :output_display_thread
+
       def initialize(project_dir, provider_manager, config, options = {})
         @project_dir = project_dir
         @provider_manager = provider_manager
@@ -30,8 +34,8 @@ module Aidp
         @options = options
         @prompt = options[:prompt] || TTY::Prompt.new
         @async_runner_class = options[:async_runner_class] || AsyncWorkLoopRunner
-        @async_runner =
-        @repl_macros = ReplMacros.new
+        @async_runner = options[:async_runner]
+        @repl_macros = options[:repl_macros] || ReplMacros.new
         @output_display_thread = nil
         @running = false
         @completion_setup_needed = true
```
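Both `interactive_repl.rb` hunks point the same direction: collaborators that used to be hard-wired in the constructor can now come in through `options`, and the new readers expose them. A sketch of constructing the REPL with injected doubles; only the option keys visible above are assumed, and the `fake_*` names are hypothetical:

```ruby
# Sketch: dependency injection via the new options keys.
repl = Aidp::Execute::InteractiveRepl.new(
  project_dir,
  provider_manager,
  config,
  prompt: TTY::Prompt.new,         # already supported before this change
  async_runner: fake_async_runner, # new: consumed by @async_runner = options[:async_runner]
  repl_macros: fake_repl_macros    # new: falls back to ReplMacros.new when omitted
)

repl.repl_macros    # readable via the new attr_reader
repl.running = true # writable via the new attr_accessor
```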
data/lib/aidp/execute/prompt_evaluator.rb

```diff
@@ -0,0 +1,359 @@
+# frozen_string_literal: true
+
+require_relative "../harness/ai_decision_engine"
+
+module Aidp
+  module Execute
+    # Evaluates prompt effectiveness using ZFC after multiple iterations
+    #
+    # FIX for issue #391: When the work loop reaches 10+ iterations without completion,
+    # this evaluator assesses prompt quality and suggests improvements.
+    #
+    # Uses Zero Framework Cognition (ZFC) to analyze:
+    # - Whether the prompt clearly defines completion criteria
+    # - If task breakdown instructions are adequate
+    # - Whether the agent has sufficient context
+    # - If there are blockers preventing progress
+    #
+    # @example
+    #   evaluator = PromptEvaluator.new(config)
+    #   result = evaluator.evaluate(
+    #     prompt_content: prompt_manager.read,
+    #     iteration_count: 12,
+    #     task_summary: persistent_tasklist.summary,
+    #     recent_failures: all_results
+    #   )
+    #   # => { effective: false, issues: [...], suggestions: [...] }
+    #
+    class PromptEvaluator
+      # Threshold for triggering evaluation
+      EVALUATION_ITERATION_THRESHOLD = 10
+
+      # Re-evaluate periodically after threshold
+      EVALUATION_INTERVAL = 5
+
+      # Expose for testability
+      attr_reader :ai_decision_engine
+
+      def initialize(config, ai_decision_engine: nil)
+        @config = config
+        @ai_decision_engine = ai_decision_engine || safely_build_ai_decision_engine
+      end
+
+      # Safely build AIDecisionEngine, returning nil if config doesn't support it
+      # This allows tests with mock configs to work without AI calls
+      def safely_build_ai_decision_engine
+        # Check if config supports the methods AIDecisionEngine needs
+        return nil unless @config.respond_to?(:default_provider)
+
+        build_default_ai_decision_engine
+      rescue => e
+        Aidp.log_debug("prompt_evaluator", "skipping_ai_decision_engine",
+          reason: e.message)
+        nil
+      end
+
+      # Check if evaluation should be triggered based on iteration count
+      # @param iteration_count [Integer] Current iteration number
+      # @return [Boolean]
+      def should_evaluate?(iteration_count)
+        return false unless iteration_count >= EVALUATION_ITERATION_THRESHOLD
+
+        # Evaluate at threshold and every EVALUATION_INTERVAL after
+        (iteration_count - EVALUATION_ITERATION_THRESHOLD) % EVALUATION_INTERVAL == 0
+      end
+
+      # Evaluate prompt effectiveness
+      # @param prompt_content [String] Current PROMPT.md content
+      # @param iteration_count [Integer] Current iteration number
+      # @param task_summary [Hash] Summary of task statuses
+      # @param recent_failures [Hash] Recent test/lint failures
+      # @param step_name [String] Name of current step
+      # @return [Hash] Evaluation result with :effective, :issues, :suggestions
+      def evaluate(prompt_content:, iteration_count:, task_summary:, recent_failures:, step_name: nil)
+        Aidp.log_debug("prompt_evaluator", "starting_evaluation",
+          iteration: iteration_count,
+          step: step_name,
+          prompt_size: prompt_content&.length || 0)
+
+        # When AI decision engine is unavailable (e.g., in tests with mock configs),
+        # return a neutral result that doesn't trigger feedback appending
+        unless @ai_decision_engine
+          Aidp.log_debug("prompt_evaluator", "skipping_evaluation_no_ai_engine")
+          return {
+            effective: true, # Assume effective to avoid unnecessary feedback
+            issues: [],
+            suggestions: [],
+            likely_blockers: [],
+            recommended_actions: [],
+            confidence: 0.0,
+            skipped: true,
+            skip_reason: "AI decision engine not available"
+          }
+        end
+
+        prompt = build_evaluation_prompt(
+          prompt_content: prompt_content,
+          iteration_count: iteration_count,
+          task_summary: task_summary,
+          recent_failures: recent_failures
+        )
+
+        schema = {
+          type: "object",
+          properties: {
+            effective: {
+              type: "boolean",
+              description: "True if the prompt is likely to lead to completion within a few more iterations"
+            },
+            issues: {
+              type: "array",
+              items: {type: "string"},
+              description: "Specific problems identified with the current prompt"
+            },
+            suggestions: {
+              type: "array",
+              items: {type: "string"},
+              description: "Actionable suggestions to improve prompt effectiveness"
+            },
+            likely_blockers: {
+              type: "array",
+              items: {type: "string"},
+              description: "Potential blockers preventing progress"
+            },
+            recommended_actions: {
+              type: "array",
+              items: {
+                type: "object",
+                properties: {
+                  action: {type: "string"},
+                  priority: {type: "string", enum: ["high", "medium", "low"]},
+                  rationale: {type: "string"}
+                }
+              },
+              description: "Specific actions to take, prioritized"
+            },
+            confidence: {
+              type: "number",
+              minimum: 0.0,
+              maximum: 1.0,
+              description: "Confidence in this assessment"
+            }
+          },
+          required: ["effective", "issues", "suggestions", "confidence"]
+        }
+
+        begin
+          result = @ai_decision_engine.decide(
+            :prompt_evaluation,
+            context: {prompt: prompt},
+            schema: schema,
+            tier: :mini,
+            cache_ttl: nil # Each evaluation is context-specific
+          )
+
+          Aidp.log_info("prompt_evaluator", "evaluation_complete",
+            iteration: iteration_count,
+            effective: result[:effective],
+            issue_count: result[:issues]&.size || 0,
+            confidence: result[:confidence])
+
+          result
+        rescue => e
+          Aidp.log_error("prompt_evaluator", "evaluation_failed",
+            error: e.message,
+            error_class: e.class.name)
+
+          build_fallback_result("Evaluation failed: #{e.message}")
+        end
+      end
+
+      # Generate improvement recommendations for the prompt template
+      # Used for AGD pattern - generating improved templates based on evaluation
+      # @param evaluation_result [Hash] Result from evaluate()
+      # @param original_template [String] The original template content
+      # @return [Hash] Template improvements
+      def generate_template_improvements(evaluation_result:, original_template:)
+        return nil unless @ai_decision_engine
+
+        Aidp.log_debug("prompt_evaluator", "generating_template_improvements",
+          issue_count: evaluation_result[:issues]&.size || 0)
+
+        prompt = build_improvement_prompt(evaluation_result, original_template)
+
+        schema = {
+          type: "object",
+          properties: {
+            improved_sections: {
+              type: "array",
+              items: {
+                type: "object",
+                properties: {
+                  section_name: {type: "string"},
+                  original: {type: "string"},
+                  improved: {type: "string"},
+                  rationale: {type: "string"}
+                }
+              }
+            },
+            additional_sections: {
+              type: "array",
+              items: {
+                type: "object",
+                properties: {
+                  section_name: {type: "string"},
+                  content: {type: "string"},
+                  rationale: {type: "string"}
+                }
+              }
+            },
+            completion_criteria_improvements: {
+              type: "array",
+              items: {type: "string"},
+              description: "Specific improvements to completion criteria definitions"
+            }
+          },
+          required: ["improved_sections", "completion_criteria_improvements"]
+        }
+
+        @ai_decision_engine.decide(
+          :template_improvement,
+          context: {prompt: prompt},
+          schema: schema,
+          tier: :standard, # Use standard tier for more thoughtful improvements
+          cache_ttl: nil
+        )
+      rescue => e
+        Aidp.log_error("prompt_evaluator", "template_improvement_failed",
+          error: e.message)
+        nil
+      end
+
+      private
+
+      def build_evaluation_prompt(prompt_content:, iteration_count:, task_summary:, recent_failures:)
+        <<~PROMPT
+          You are evaluating the effectiveness of a work loop prompt that has been running for #{iteration_count} iterations without completion.
+
+          ## Current Prompt Content
+          #{truncate_content(prompt_content, 8000)}
+
+          ## Task Summary
+          #{format_task_summary(task_summary)}
+
+          ## Recent Check Results
+          #{format_failures(recent_failures)}
+
+          ## Evaluation Criteria
+
+          Analyze why this prompt may not be leading to completion:
+
+          1. **Clarity of Goals**: Are the implementation requirements clearly defined?
+          2. **Task Breakdown**: Does the prompt guide proper task decomposition?
+          3. **Completion Criteria**: Are the completion criteria specific and achievable?
+          4. **Context Sufficiency**: Does the agent have enough context to proceed?
+          5. **Blockers**: Are there technical blockers or missing information?
+          6. **Scope**: Is the scope realistic for an AI agent to complete?
+
+          ## Your Assessment
+
+          Provide:
+          - Whether this prompt is likely effective (true/false)
+          - Specific issues with the current prompt
+          - Actionable suggestions for improvement
+          - Likely blockers preventing progress
+          - Prioritized recommended actions
+          - Your confidence in this assessment (0.0-1.0)
+
+          Be specific and actionable. Focus on what can be changed to achieve completion.
+        PROMPT
+      end
+
+      def build_improvement_prompt(evaluation_result, original_template)
+        <<~PROMPT
+          Based on the following prompt evaluation, suggest improvements to the template.
+
+          ## Evaluation Results
+          - Effective: #{evaluation_result[:effective]}
+          - Issues: #{(evaluation_result[:issues] || []).join(", ")}
+          - Suggestions: #{(evaluation_result[:suggestions] || []).join(", ")}
+
+          ## Original Template
+          #{truncate_content(original_template, 4000)}
+
+          ## Your Task
+
+          Suggest specific improvements to make the template more effective:
+          1. Identify sections that need improvement
+          2. Propose new sections if needed
+          3. Focus especially on completion criteria clarity
+          4. Ensure task breakdown instructions are explicit
+          5. Add guidance for common failure modes
+
+          Be specific - provide actual text that could replace or supplement the template.
+        PROMPT
+      end
+
+      def format_task_summary(task_summary)
+        return "_No task summary available_" if task_summary.nil? || task_summary.empty?
+
+        if task_summary.is_a?(Hash)
+          parts = []
+          parts << "Total: #{task_summary[:total] || 0}"
+          parts << "Done: #{task_summary[:done] || 0}"
+          parts << "In Progress: #{task_summary[:in_progress] || 0}"
+          parts << "Pending: #{task_summary[:pending] || 0}"
+          parts << "Abandoned: #{task_summary[:abandoned] || 0}"
+          parts.join(" | ")
+        else
+          task_summary.to_s
+        end
+      end
+
+      def format_failures(recent_failures)
+        return "_No recent failures_" if recent_failures.nil? || recent_failures.empty?
+
+        parts = []
+        recent_failures.each do |check_type, result|
+          next unless result.is_a?(Hash)
+
+          status = result[:success] ? "✅ passed" : "❌ failed"
+          parts << "- #{check_type}: #{status}"
+
+          if !result[:success] && result[:failures]
+            failures = result[:failures].take(3)
+            failures.each { |f| parts << " - #{truncate_content(f.to_s, 200)}" }
+          end
+        end
+
+        parts.empty? ? "_No failures to report_" : parts.join("\n")
+      end
+
+      def truncate_content(content, max_length)
+        return "_No content_" if content.nil? || content.empty?
+        return content if content.length <= max_length
+
+        "#{content[0, max_length]}\n\n[... truncated, showing first #{max_length} characters ...]"
+      end
+
+      def build_fallback_result(reason)
+        {
+          effective: nil,
+          issues: ["Unable to evaluate: #{reason}"],
+          suggestions: ["Check AI configuration and try again"],
+          likely_blockers: [],
+          recommended_actions: [],
+          confidence: 0.0
+        }
+      end
+
+      def build_default_ai_decision_engine
+        Aidp::Harness::AIDecisionEngine.new(@config)
+      rescue => e
+        Aidp.log_warn("prompt_evaluator", "failed_to_create_ai_decision_engine",
+          error: e.message)
+        nil
+      end
+    end
+  end
+end
```
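With `EVALUATION_ITERATION_THRESHOLD = 10` and `EVALUATION_INTERVAL = 5`, `should_evaluate?` fires on iterations 10, 15, 20, and so on. A hedged sketch of a call site (the real integration lives in `work_loop_runner.rb`, whose changes are not shown in this view; the argument shapes below are assumptions based on the formatter methods above):

```ruby
evaluator = Aidp::Execute::PromptEvaluator.new(config)

# The modulo check fires at the threshold and every interval after it.
(1..20).select { |i| evaluator.should_evaluate?(i) } # => [10, 15, 20]

if evaluator.should_evaluate?(iteration_count)
  result = evaluator.evaluate(
    prompt_content: File.read("PROMPT.md"),
    iteration_count: iteration_count,
    task_summary: {total: 8, done: 5, in_progress: 1, pending: 2, abandoned: 0},
    recent_failures: {tests: {success: false, failures: ["FooSpec: expected true, got false"]}}
  )

  # result[:skipped] is true when no AI decision engine is available;
  # otherwise act on result[:effective], result[:issues], result[:suggestions].
end
```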