lex-eval 0.3.10 → 0.3.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1ba8f0431d907def112bdd8f02a0685002fbfc75754bb658d620ea4889b03c70
|
|
4
|
+
data.tar.gz: b38fcf01598441f544cf8721d9109010983b357be7085e4e0196edd9872cd16a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 678d325ad2c47aa74abfe3a8ec4f8c7eb63bb14beec4e8911c7912963ded1cc966dbe57dfe50b721e54cdd85df1ef67e6c04af73004c76f560e9b5b45c0ea132
|
|
7
|
+
data.tar.gz: abd3cab893f187e0c51929cb9f91fa7f3806d250a339c6c137e6073589d0d6e0a18e9560dcdc9645b39f07c9471e9b93ee5ae0eaf3b74f421501383ba3951078
|
|
@@ -23,18 +23,25 @@ module Legion
|
|
|
23
23
|
required: %i[confidence recommendation explanation]
|
|
24
24
|
}.freeze
|
|
25
25
|
|
|
26
|
-
def review_output(input:, output:, review_prompt: nil, **)
|
|
26
|
+
def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
|
|
27
27
|
prompt = build_review_message(review_prompt || default_review_prompt, input, output)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
llm_kwargs = {
|
|
29
|
+
message: prompt, schema: REVIEW_SCHEMA,
|
|
30
|
+
intent: { capability: :reasoning },
|
|
31
|
+
caller: { extension: 'lex-eval', operation: 'agentic_review' }
|
|
32
|
+
}
|
|
33
|
+
llm_kwargs[:model] = model if model
|
|
34
|
+
llm_kwargs[:provider] = provider if provider
|
|
35
|
+
Legion::LLM.structured(**llm_kwargs)
|
|
31
36
|
rescue StandardError => e
|
|
37
|
+
log.warn(e.message) if respond_to?(:log, true)
|
|
32
38
|
{ confidence: 0.0, recommendation: 'reject',
|
|
33
39
|
issues: [], explanation: "review error: #{e.message}" }
|
|
34
40
|
end
|
|
35
41
|
|
|
36
|
-
def review_with_escalation(input:, output:, review_prompt: nil, **)
|
|
37
|
-
review = review_output(input: input, output: output, review_prompt: review_prompt)
|
|
42
|
+
def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
|
|
43
|
+
review = review_output(input: input, output: output, review_prompt: review_prompt,
|
|
44
|
+
model: model, provider: provider)
|
|
38
45
|
action, priority = determine_escalation(review[:confidence])
|
|
39
46
|
|
|
40
47
|
return review.merge(action: :auto_approve, escalated: false) if action == :auto_approve
|
|
@@ -42,9 +49,11 @@ module Legion
|
|
|
42
49
|
review.merge(action: action, escalated: true, priority: priority)
|
|
43
50
|
end
|
|
44
51
|
|
|
45
|
-
def review_experiment(input:, output_a:, output_b:, review_prompt: nil, **)
|
|
46
|
-
review_a = review_output(input: input, output: output_a, review_prompt: review_prompt)
|
|
47
|
-
review_b = review_output(input: input, output: output_b, review_prompt: review_prompt)
|
52
|
+
def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
|
|
53
|
+
review_a = review_output(input: input, output: output_a, review_prompt: review_prompt,
|
|
54
|
+
model: model, provider: provider)
|
|
55
|
+
review_b = review_output(input: input, output: output_b, review_prompt: review_prompt,
|
|
56
|
+
model: model, provider: provider)
|
|
48
57
|
|
|
49
58
|
conf_a = review_a[:confidence] || 0.0
|
|
50
59
|
conf_b = review_b[:confidence] || 0.0
|
|
@@ -64,6 +73,7 @@ module Legion
|
|
|
64
73
|
review_a: review_a,
|
|
65
74
|
review_b: review_b }
|
|
66
75
|
rescue StandardError => e
|
|
76
|
+
log.warn(e.message) if respond_to?(:log, true)
|
|
67
77
|
{ reviewed: false, reason: "experiment error: #{e.message}" }
|
|
68
78
|
end
|
|
69
79
|
|
|
@@ -13,7 +13,7 @@ module Legion
|
|
|
13
13
|
|
|
14
14
|
SPEC_TIMEOUT = 30
|
|
15
15
|
|
|
16
|
-
def review_generated(code:, spec_code:, context:) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
|
|
16
|
+
def review_generated(code:, spec_code:, context:, review_k: nil, review_models: nil) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
|
|
17
17
|
settings = validation_settings
|
|
18
18
|
stages = {}
|
|
19
19
|
issues = []
|
|
@@ -45,7 +45,13 @@ module Legion
|
|
|
45
45
|
|
|
46
46
|
# Stage 4: LLM review (optional)
|
|
47
47
|
if settings[:llm_review] && llm_available?
|
|
48
|
-
stages[:llm_review] = llm_review(code, context)
|
48
|
+
k = review_k || default_review_k
|
|
49
|
+
models = review_models || default_review_models
|
|
50
|
+
stages[:llm_review] = if k > 1
|
|
51
|
+
adversarial_llm_review(code, context, count: k, models: models)
|
|
52
|
+
else
|
|
53
|
+
llm_review(code, context, model_spec: build_model_assignments(1, models)&.first)
|
|
54
|
+
end
|
|
49
55
|
issues.concat(stages[:llm_review][:issues] || [])
|
|
50
56
|
end
|
|
51
57
|
|
|
@@ -75,6 +81,80 @@ module Legion
|
|
|
75
81
|
{}
|
|
76
82
|
end
|
|
77
83
|
|
|
84
|
+
def default_review_k
|
|
85
|
+
return 1 unless defined?(Legion::Settings)
|
|
86
|
+
|
|
87
|
+
Legion::Settings.dig(:codegen, :self_generate, :validation, :review_k) || 1
|
|
88
|
+
rescue StandardError => e
|
|
89
|
+
log.warn(e.message)
|
|
90
|
+
1
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def default_review_models
|
|
94
|
+
return [] unless defined?(Legion::Settings)
|
|
95
|
+
|
|
96
|
+
Legion::Settings.dig(:codegen, :self_generate, :validation, :review_models) || []
|
|
97
|
+
rescue StandardError => e
|
|
98
|
+
log.warn(e.message)
|
|
99
|
+
[]
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def provider_available?(provider_sym)
|
|
103
|
+
return false unless defined?(Legion::Settings)
|
|
104
|
+
|
|
105
|
+
Legion::Settings.dig(:llm, :providers, provider_sym, :enabled) == true
|
|
106
|
+
rescue StandardError => e
|
|
107
|
+
log.warn(e.message)
|
|
108
|
+
false
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def build_model_assignments(count, models) # rubocop:disable Metrics/PerceivedComplexity
|
|
112
|
+
return Array.new(count) { nil } if models.nil? || models.empty?
|
|
113
|
+
|
|
114
|
+
available = models.select do |spec|
|
|
115
|
+
next false unless spec.is_a?(Hash)
|
|
116
|
+
|
|
117
|
+
provider_sym = spec[:provider]&.to_sym
|
|
118
|
+
if provider_sym && !provider_available?(provider_sym)
|
|
119
|
+
log.warn("review provider #{provider_sym} not available, skipping")
|
|
120
|
+
false
|
|
121
|
+
else
|
|
122
|
+
true
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
return Array.new(count) { nil } if available.empty?
|
|
127
|
+
|
|
128
|
+
Array.new(count) { |i| available[i % available.size] }
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def adversarial_llm_review(code, context, count:, models: []) # rubocop:disable Metrics/PerceivedComplexity
|
|
132
|
+
assignments = build_model_assignments(count, models)
|
|
133
|
+
|
|
134
|
+
reviews = assignments.map { |spec| llm_review(code, context, model_spec: spec) }
|
|
135
|
+
|
|
136
|
+
approvals = reviews.count { |r| r[:confidence] >= 0.5 }
|
|
137
|
+
rejections = count - approvals
|
|
138
|
+
all_issues = reviews.flat_map { |r| r[:issues] || [] }.uniq
|
|
139
|
+
|
|
140
|
+
avg_confidence = reviews.sum { |r| r[:confidence] || 0.0 } / reviews.size
|
|
141
|
+
|
|
142
|
+
{
|
|
143
|
+
passed: approvals > rejections,
|
|
144
|
+
issues: all_issues,
|
|
145
|
+
confidence: avg_confidence,
|
|
146
|
+
k: count,
|
|
147
|
+
approvals: approvals,
|
|
148
|
+
rejections: rejections,
|
|
149
|
+
reviews: reviews
|
|
150
|
+
}
|
|
151
|
+
rescue StandardError => e
|
|
152
|
+
log.warn("adversarial review failed: #{e.message}")
|
|
153
|
+
fallback = llm_review(code, context)
|
|
154
|
+
fallback.merge(k: count, approvals: (fallback[:passed] ? 1 : 0),
|
|
155
|
+
rejections: (fallback[:passed] ? 0 : 1), reviews: [fallback])
|
|
156
|
+
end
|
|
157
|
+
|
|
78
158
|
def check_syntax(code, spec_code)
|
|
79
159
|
errors = []
|
|
80
160
|
begin
|
|
@@ -121,19 +201,28 @@ module Legion
|
|
|
121
201
|
{ passed: false, output: '', errors: e.message, exit_code: -1 }
|
|
122
202
|
end
|
|
123
203
|
|
|
124
|
-
def llm_review(code, context)
|
|
204
|
+
def llm_review(code, context, model_spec: nil) # rubocop:disable Metrics/PerceivedComplexity
|
|
125
205
|
return { passed: true, issues: [], confidence: 0.5 } unless defined?(Runners::AgenticReview)
|
|
126
206
|
|
|
207
|
+
extra_kwargs = {}
|
|
208
|
+
if model_spec
|
|
209
|
+
extra_kwargs[:model] = model_spec[:model] if model_spec[:model]
|
|
210
|
+
extra_kwargs[:provider] = model_spec[:provider] if model_spec[:provider]
|
|
211
|
+
end
|
|
212
|
+
|
|
127
213
|
result = Runners::AgenticReview.review_output(
|
|
128
214
|
input: context,
|
|
129
215
|
output: code,
|
|
130
|
-
review_prompt: 'Review this generated Ruby code for correctness, safety, and Legion conventions.'
|
|
216
|
+
review_prompt: 'Review this generated Ruby code for correctness, safety, and Legion conventions.',
|
|
217
|
+
**extra_kwargs
|
|
131
218
|
)
|
|
132
219
|
|
|
133
220
|
{
|
|
134
|
-
passed: result[:
|
|
221
|
+
passed: result[:recommendation] == 'approve',
|
|
135
222
|
issues: result[:issues] || [],
|
|
136
|
-
confidence: result[:confidence] || 0.5
|
|
223
|
+
confidence: result[:confidence] || 0.5,
|
|
224
|
+
provider: model_spec&.dig(:provider),
|
|
225
|
+
model: model_spec&.dig(:model)
|
|
137
226
|
}
|
|
138
227
|
rescue StandardError => e
|
|
139
228
|
log.warn("llm review failed: #{e.message}")
|