lex-eval 0.3.10 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3214b4f11d574772ad1323a0b37816d45c29ac61e5713f2ee82776ac8906ec09
4
- data.tar.gz: b3bd8441ea3a5eda81b9f402e3d7e44e803504002ada86695c41b885f76d624b
3
+ metadata.gz: 1ba8f0431d907def112bdd8f02a0685002fbfc75754bb658d620ea4889b03c70
4
+ data.tar.gz: b38fcf01598441f544cf8721d9109010983b357be7085e4e0196edd9872cd16a
5
5
  SHA512:
6
- metadata.gz: 97601e029b078a0fbf7c1606968e90f5e5f308be477f543e793ff8533eb3ff6ff62716614388eda67416cb671b45e3a07d3d77e58fdf80484944c24bd866cc5f
7
- data.tar.gz: c20ab43b7eed22645560f0022094cb8c2b7c9ae0044c452aa57fcdb524e6833bcc30e07b801ec58b52175e2b12871b44e7dace59dfa20499e0a9e6b0529a6ea4
6
+ metadata.gz: 678d325ad2c47aa74abfe3a8ec4f8c7eb63bb14beec4e8911c7912963ded1cc966dbe57dfe50b721e54cdd85df1ef67e6c04af73004c76f560e9b5b45c0ea132
7
+ data.tar.gz: abd3cab893f187e0c51929cb9f91fa7f3806d250a339c6c137e6073589d0d6e0a18e9560dcdc9645b39f07c9471e9b93ee5ae0eaf3b74f421501383ba3951078
@@ -23,18 +23,25 @@ module Legion
23
23
  required: %i[confidence recommendation explanation]
24
24
  }.freeze
25
25
 
26
- def review_output(input:, output:, review_prompt: nil, **)
26
+ def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
27
27
  prompt = build_review_message(review_prompt || default_review_prompt, input, output)
28
- Legion::LLM.structured(message: prompt, schema: REVIEW_SCHEMA,
29
- intent: { capability: :reasoning },
30
- caller: { extension: 'lex-eval', operation: 'agentic_review' })
28
+ llm_kwargs = {
29
+ message: prompt, schema: REVIEW_SCHEMA,
30
+ intent: { capability: :reasoning },
31
+ caller: { extension: 'lex-eval', operation: 'agentic_review' }
32
+ }
33
+ llm_kwargs[:model] = model if model
34
+ llm_kwargs[:provider] = provider if provider
35
+ Legion::LLM.structured(**llm_kwargs)
31
36
  rescue StandardError => e
37
+ log.warn(e.message) if respond_to?(:log, true)
32
38
  { confidence: 0.0, recommendation: 'reject',
33
39
  issues: [], explanation: "review error: #{e.message}" }
34
40
  end
35
41
 
36
- def review_with_escalation(input:, output:, review_prompt: nil, **)
37
- review = review_output(input: input, output: output, review_prompt: review_prompt)
42
+ def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
43
+ review = review_output(input: input, output: output, review_prompt: review_prompt,
44
+ model: model, provider: provider)
38
45
  action, priority = determine_escalation(review[:confidence])
39
46
 
40
47
  return review.merge(action: :auto_approve, escalated: false) if action == :auto_approve
@@ -42,9 +49,11 @@ module Legion
42
49
  review.merge(action: action, escalated: true, priority: priority)
43
50
  end
44
51
 
45
- def review_experiment(input:, output_a:, output_b:, review_prompt: nil, **)
46
- review_a = review_output(input: input, output: output_a, review_prompt: review_prompt)
47
- review_b = review_output(input: input, output: output_b, review_prompt: review_prompt)
52
+ def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
53
+ review_a = review_output(input: input, output: output_a, review_prompt: review_prompt,
54
+ model: model, provider: provider)
55
+ review_b = review_output(input: input, output: output_b, review_prompt: review_prompt,
56
+ model: model, provider: provider)
48
57
 
49
58
  conf_a = review_a[:confidence] || 0.0
50
59
  conf_b = review_b[:confidence] || 0.0
@@ -64,6 +73,7 @@ module Legion
64
73
  review_a: review_a,
65
74
  review_b: review_b }
66
75
  rescue StandardError => e
76
+ log.warn(e.message) if respond_to?(:log, true)
67
77
  { reviewed: false, reason: "experiment error: #{e.message}" }
68
78
  end
69
79
 
@@ -13,7 +13,7 @@ module Legion
13
13
 
14
14
  SPEC_TIMEOUT = 30
15
15
 
16
- def review_generated(code:, spec_code:, context:) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
16
+ def review_generated(code:, spec_code:, context:, review_k: nil, review_models: nil) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
17
17
  settings = validation_settings
18
18
  stages = {}
19
19
  issues = []
@@ -45,7 +45,13 @@ module Legion
45
45
 
46
46
  # Stage 4: LLM review (optional)
47
47
  if settings[:llm_review] && llm_available?
48
- stages[:llm_review] = llm_review(code, context)
48
+ k = review_k || default_review_k
49
+ models = review_models || default_review_models
50
+ stages[:llm_review] = if k > 1
51
+ adversarial_llm_review(code, context, count: k, models: models)
52
+ else
53
+ llm_review(code, context, model_spec: build_model_assignments(1, models)&.first)
54
+ end
49
55
  issues.concat(stages[:llm_review][:issues] || [])
50
56
  end
51
57
 
@@ -75,6 +81,80 @@ module Legion
75
81
  {}
76
82
  end
77
83
 
84
+ def default_review_k
85
+ return 1 unless defined?(Legion::Settings)
86
+
87
+ Legion::Settings.dig(:codegen, :self_generate, :validation, :review_k) || 1
88
+ rescue StandardError => e
89
+ log.warn(e.message)
90
+ 1
91
+ end
92
+
93
+ def default_review_models
94
+ return [] unless defined?(Legion::Settings)
95
+
96
+ Legion::Settings.dig(:codegen, :self_generate, :validation, :review_models) || []
97
+ rescue StandardError => e
98
+ log.warn(e.message)
99
+ []
100
+ end
101
+
102
+ def provider_available?(provider_sym)
103
+ return false unless defined?(Legion::Settings)
104
+
105
+ Legion::Settings.dig(:llm, :providers, provider_sym, :enabled) == true
106
+ rescue StandardError => e
107
+ log.warn(e.message)
108
+ false
109
+ end
110
+
111
+ def build_model_assignments(count, models) # rubocop:disable Metrics/PerceivedComplexity
112
+ return Array.new(count) { nil } if models.nil? || models.empty?
113
+
114
+ available = models.select do |spec|
115
+ next false unless spec.is_a?(Hash)
116
+
117
+ provider_sym = spec[:provider]&.to_sym
118
+ if provider_sym && !provider_available?(provider_sym)
119
+ log.warn("review provider #{provider_sym} not available, skipping")
120
+ false
121
+ else
122
+ true
123
+ end
124
+ end
125
+
126
+ return Array.new(count) { nil } if available.empty?
127
+
128
+ Array.new(count) { |i| available[i % available.size] }
129
+ end
130
+
131
+ def adversarial_llm_review(code, context, count:, models: []) # rubocop:disable Metrics/PerceivedComplexity
132
+ assignments = build_model_assignments(count, models)
133
+
134
+ reviews = assignments.map { |spec| llm_review(code, context, model_spec: spec) }
135
+
136
+ approvals = reviews.count { |r| r[:confidence] >= 0.5 }
137
+ rejections = count - approvals
138
+ all_issues = reviews.flat_map { |r| r[:issues] || [] }.uniq
139
+
140
+ avg_confidence = reviews.sum { |r| r[:confidence] || 0.0 } / reviews.size
141
+
142
+ {
143
+ passed: approvals > rejections,
144
+ issues: all_issues,
145
+ confidence: avg_confidence,
146
+ k: count,
147
+ approvals: approvals,
148
+ rejections: rejections,
149
+ reviews: reviews
150
+ }
151
+ rescue StandardError => e
152
+ log.warn("adversarial review failed: #{e.message}")
153
+ fallback = llm_review(code, context)
154
+ fallback.merge(k: count, approvals: (fallback[:passed] ? 1 : 0),
155
+ rejections: (fallback[:passed] ? 0 : 1), reviews: [fallback])
156
+ end
157
+
78
158
  def check_syntax(code, spec_code)
79
159
  errors = []
80
160
  begin
@@ -121,19 +201,28 @@ module Legion
121
201
  { passed: false, output: '', errors: e.message, exit_code: -1 }
122
202
  end
123
203
 
124
- def llm_review(code, context)
204
+ def llm_review(code, context, model_spec: nil) # rubocop:disable Metrics/PerceivedComplexity
125
205
  return { passed: true, issues: [], confidence: 0.5 } unless defined?(Runners::AgenticReview)
126
206
 
207
+ extra_kwargs = {}
208
+ if model_spec
209
+ extra_kwargs[:model] = model_spec[:model] if model_spec[:model]
210
+ extra_kwargs[:provider] = model_spec[:provider] if model_spec[:provider]
211
+ end
212
+
127
213
  result = Runners::AgenticReview.review_output(
128
214
  input: context,
129
215
  output: code,
130
- review_prompt: 'Review this generated Ruby code for correctness, safety, and Legion conventions.'
216
+ review_prompt: 'Review this generated Ruby code for correctness, safety, and Legion conventions.',
217
+ **extra_kwargs
131
218
  )
132
219
 
133
220
  {
134
- passed: result[:reviewed] != false,
221
+ passed: result[:recommendation] == 'approve',
135
222
  issues: result[:issues] || [],
136
- confidence: result[:confidence] || 0.5
223
+ confidence: result[:confidence] || 0.5,
224
+ provider: model_spec&.dig(:provider),
225
+ model: model_spec&.dig(:model)
137
226
  }
138
227
  rescue StandardError => e
139
228
  log.warn("llm review failed: #{e.message}")
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Eval
6
- VERSION = '0.3.10'
6
+ VERSION = '0.3.11'
7
7
  end
8
8
  end
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-eval
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.10
4
+ version: 0.3.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson