lex-eval 0.3.10 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3214b4f11d574772ad1323a0b37816d45c29ac61e5713f2ee82776ac8906ec09
4
- data.tar.gz: b3bd8441ea3a5eda81b9f402e3d7e44e803504002ada86695c41b885f76d624b
3
+ metadata.gz: bcf09bf5fdfb1a7c80e050e440c874f6e099ddd41ac0f85155733d1c4aa80bed
4
+ data.tar.gz: f1291ca88a8a4f7cf30abce7616bf8ab4145d1db0fe6ed6c0112e09482bda77b
5
5
  SHA512:
6
- metadata.gz: 97601e029b078a0fbf7c1606968e90f5e5f308be477f543e793ff8533eb3ff6ff62716614388eda67416cb671b45e3a07d3d77e58fdf80484944c24bd866cc5f
7
- data.tar.gz: c20ab43b7eed22645560f0022094cb8c2b7c9ae0044c452aa57fcdb524e6833bcc30e07b801ec58b52175e2b12871b44e7dace59dfa20499e0a9e6b0529a6ea4
6
+ metadata.gz: 1df90ff664f437acbfa6eb6e1b7dd5efc7620a8b4ee73b8f11a4fb5b35f8b2ab7324d887186abf56c929a84799e68eba3a30a375b05760048fab6eb6f1078b2f
7
+ data.tar.gz: 1887981dc5dc141deb187f1d8ed7e2c5075c911d87b7a367773b93fd96ea6b52c6564b44a90ee8c32a1e9e6670c5ba381b371a8440fdad39b8ea9fda2d42cb9b
@@ -11,6 +11,7 @@ module Legion
11
11
 
12
12
  def runner_class = self.class
13
13
  def runner_function = 'action'
14
+ def check_subtask? = true
14
15
 
15
16
  def action(payload)
16
17
  code = payload[:runner_code] || payload[:code]
@@ -30,17 +31,13 @@ module Legion
30
31
  result
31
32
  rescue StandardError => e
32
33
  log.warn("CodeReviewSubscriber failed: #{e.message}")
33
- { passed: false, verdict: :reject, error: e.message }
34
+ { passed: false, verdict: 'reject', error: e.message }
34
35
  end
35
36
 
36
37
  private
37
38
 
38
39
  def log
39
- return Legion::Logging if defined?(Legion::Logging)
40
-
41
- @log ||= Object.new.tap do |nl|
42
- %i[debug info warn error fatal].each { |m| nl.define_singleton_method(m) { |*| nil } }
43
- end
40
+ Legion::Logging
44
41
  end
45
42
  end
46
43
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'legion/extensions/actors/subscription' unless defined?(Legion::Extensions::Actors::Subscription)
3
+ require 'legion/extensions/actors/subscription'
4
4
 
5
5
  module Legion
6
6
  module Extensions
@@ -30,12 +30,12 @@ module Legion
30
30
  false
31
31
  end
32
32
 
33
- def enabled?
34
- return false unless defined?(Legion::Transport)
33
+ def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects
34
+ return false unless Legion.const_defined?(:Transport, false)
35
35
  return false unless defined?(Legion::Extensions::Eval::Runners::Online)
36
36
 
37
37
  online_enabled?
38
- rescue StandardError
38
+ rescue StandardError => _e
39
39
  false
40
40
  end
41
41
 
@@ -31,7 +31,7 @@ module Legion
31
31
  def valid_json?(str)
32
32
  ::JSON.parse(str)
33
33
  true
34
- rescue ::JSON::ParserError
34
+ rescue ::JSON::ParserError => _e
35
35
  false
36
36
  end
37
37
  end
@@ -37,24 +37,24 @@ module Legion
37
37
  def evaluate_impl(input:, output:, expected:)
38
38
  prompt = render_template(input: input, output: output, expected: expected)
39
39
  evaluate_structured(prompt)
40
- rescue StandardError
40
+ rescue StandardError => _e
41
41
  evaluate_regex_fallback(prompt)
42
42
  end
43
43
 
44
44
  def evaluate_structured(prompt)
45
45
  return evaluate_regex_fallback(prompt) unless structured_available?
46
46
 
47
- result = Legion::LLM.structured(message: prompt, schema: JUDGE_SCHEMA,
47
+ result = Legion::LLM.structured(message: prompt, schema: JUDGE_SCHEMA, # rubocop:disable Legion/HelperMigration/DirectLlm
48
48
  intent: { capability: :reasoning },
49
49
  caller: { extension: 'lex-eval', operation: 'judge' })
50
50
  { score: result[:score], passed: result[:passed],
51
51
  explanation: result[:explanation], evidence: result[:evidence] || [] }
52
- rescue StandardError
52
+ rescue StandardError => _e
53
53
  evaluate_regex_fallback(prompt)
54
54
  end
55
55
 
56
56
  def evaluate_regex_fallback(prompt)
57
- response = Legion::LLM.chat(message: prompt, intent: { capability: :reasoning },
57
+ response = Legion::LLM.chat(message: prompt, intent: { capability: :reasoning }, # rubocop:disable Legion/HelperMigration/DirectLlm
58
58
  caller: { extension: 'lex-eval', operation: 'judge' })
59
59
  score = extract_score(response.content)
60
60
  { score: score, explanation: response.content, passed: score >= threshold, evidence: [] }
@@ -79,11 +79,7 @@ module Legion
79
79
  end
80
80
 
81
81
  def log
82
- return Legion::Logging if defined?(Legion::Logging)
83
-
84
- @log ||= Object.new.tap do |nl|
85
- %i[debug info warn error fatal].each { |m| nl.define_singleton_method(m) { |*| nil } }
86
- end
82
+ Legion::Logging
87
83
  end
88
84
  end
89
85
  end
@@ -4,7 +4,7 @@ module Legion
4
4
  module Extensions
5
5
  module Eval
6
6
  module Runners
7
- module AgenticReview
7
+ module AgenticReview # rubocop:disable Legion/Extension/RunnerIncludeHelpers
8
8
  REVIEW_SCHEMA = {
9
9
  type: :object,
10
10
  properties: {
@@ -23,18 +23,25 @@ module Legion
23
23
  required: %i[confidence recommendation explanation]
24
24
  }.freeze
25
25
 
26
- def review_output(input:, output:, review_prompt: nil, **)
26
+ def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
27
27
  prompt = build_review_message(review_prompt || default_review_prompt, input, output)
28
- Legion::LLM.structured(message: prompt, schema: REVIEW_SCHEMA,
29
- intent: { capability: :reasoning },
30
- caller: { extension: 'lex-eval', operation: 'agentic_review' })
28
+ llm_kwargs = {
29
+ message: prompt, schema: REVIEW_SCHEMA,
30
+ intent: { capability: :reasoning },
31
+ caller: { extension: 'lex-eval', operation: 'agentic_review' }
32
+ }
33
+ llm_kwargs[:model] = model if model
34
+ llm_kwargs[:provider] = provider if provider
35
+ Legion::LLM.structured(**llm_kwargs) # rubocop:disable Legion/HelperMigration/DirectLlm
31
36
  rescue StandardError => e
37
+ log.warn(e.message) if respond_to?(:log, true) # rubocop:disable Legion/HelperMigration/LoggingGuard
32
38
  { confidence: 0.0, recommendation: 'reject',
33
39
  issues: [], explanation: "review error: #{e.message}" }
34
40
  end
35
41
 
36
- def review_with_escalation(input:, output:, review_prompt: nil, **)
37
- review = review_output(input: input, output: output, review_prompt: review_prompt)
42
+ def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
43
+ review = review_output(input: input, output: output, review_prompt: review_prompt,
44
+ model: model, provider: provider)
38
45
  action, priority = determine_escalation(review[:confidence])
39
46
 
40
47
  return review.merge(action: :auto_approve, escalated: false) if action == :auto_approve
@@ -42,9 +49,11 @@ module Legion
42
49
  review.merge(action: action, escalated: true, priority: priority)
43
50
  end
44
51
 
45
- def review_experiment(input:, output_a:, output_b:, review_prompt: nil, **)
46
- review_a = review_output(input: input, output: output_a, review_prompt: review_prompt)
47
- review_b = review_output(input: input, output: output_b, review_prompt: review_prompt)
52
+ def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **)
53
+ review_a = review_output(input: input, output: output_a, review_prompt: review_prompt,
54
+ model: model, provider: provider)
55
+ review_b = review_output(input: input, output: output_b, review_prompt: review_prompt,
56
+ model: model, provider: provider)
48
57
 
49
58
  conf_a = review_a[:confidence] || 0.0
50
59
  conf_b = review_b[:confidence] || 0.0
@@ -64,6 +73,7 @@ module Legion
64
73
  review_a: review_a,
65
74
  review_b: review_b }
66
75
  rescue StandardError => e
76
+ log.warn(e.message) if respond_to?(:log, true) # rubocop:disable Legion/HelperMigration/LoggingGuard
67
77
  { reviewed: false, reason: "experiment error: #{e.message}" }
68
78
  end
69
79
 
@@ -5,6 +5,8 @@ module Legion
5
5
  module Eval
6
6
  module Runners
7
7
  module Annotation
8
+ extend self
9
+
8
10
  def create_queue(name:, **opts)
9
11
  db[:annotation_queues].insert(
10
12
  name: name,
@@ -15,7 +17,7 @@ module Legion
15
17
  created_at: Time.now.utc
16
18
  )
17
19
  { created: true, name: name }
18
- rescue Sequel::UniqueConstraintViolation
20
+ rescue Sequel::UniqueConstraintViolation => _e
19
21
  { error: 'already_exists', name: name }
20
22
  end
21
23
 
@@ -13,7 +13,7 @@ module Legion
13
13
 
14
14
  SPEC_TIMEOUT = 30
15
15
 
16
- def review_generated(code:, spec_code:, context:) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
16
+ def review_generated(code:, spec_code:, context: {}, review_k: nil, review_models: nil, **extra) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
17
17
  settings = validation_settings
18
18
  stages = {}
19
19
  issues = []
@@ -22,8 +22,9 @@ module Legion
22
22
  if settings[:syntax_check] != false
23
23
  stages[:syntax] = check_syntax(code, spec_code)
24
24
  unless stages[:syntax][:passed]
25
- return build_result(passed: false, verdict: :reject, stages: stages,
26
- issues: stages[:syntax][:errors], confidence: 0.0)
25
+ return build_result(passed: false, verdict: 'reject', stages: stages,
26
+ issues: stages[:syntax][:errors], confidence: 0.0,
27
+ code: code, spec_code: spec_code, extra: extra)
27
28
  end
28
29
  end
29
30
 
@@ -31,7 +32,8 @@ module Legion
31
32
  stages[:security] = check_security(code)
32
33
  unless stages[:security][:passed]
33
34
  issues.concat(stages[:security][:flagged].map { |f| "security: #{f[:pattern]} on line #{f[:line]}" })
34
- return build_result(passed: false, verdict: :reject, stages: stages, issues: issues, confidence: 0.0)
35
+ return build_result(passed: false, verdict: 'reject', stages: stages, issues: issues, confidence: 0.0,
36
+ code: code, spec_code: spec_code, extra: extra)
35
37
  end
36
38
 
37
39
  # Stage 3: Spec execution (optional)
@@ -39,13 +41,20 @@ module Legion
39
41
  stages[:specs] = run_specs(code, spec_code)
40
42
  unless stages[:specs][:passed]
41
43
  issues << "specs failed: #{stages[:specs][:output]}"
42
- return build_result(passed: false, verdict: :revise, stages: stages, issues: issues, confidence: 0.2)
44
+ return build_result(passed: false, verdict: 'revise', stages: stages, issues: issues, confidence: 0.2,
45
+ code: code, spec_code: spec_code, extra: extra)
43
46
  end
44
47
  end
45
48
 
46
49
  # Stage 4: LLM review (optional)
47
50
  if settings[:llm_review] && llm_available?
48
- stages[:llm_review] = llm_review(code, context)
51
+ k = review_k || default_review_k
52
+ models = review_models || default_review_models
53
+ stages[:llm_review] = if k > 1
54
+ adversarial_llm_review(code, context, count: k, models: models)
55
+ else
56
+ llm_review(code, context, model_spec: build_model_assignments(1, models)&.first)
57
+ end
49
58
  issues.concat(stages[:llm_review][:issues] || [])
50
59
  end
51
60
 
@@ -59,9 +68,10 @@ module Legion
59
68
  end
60
69
 
61
70
  confidence = calculate_confidence(stages)
62
- verdict = confidence >= 0.5 ? :approve : :revise
71
+ verdict = confidence >= 0.5 ? 'approve' : 'revise'
63
72
 
64
- build_result(passed: true, verdict: verdict, stages: stages, issues: issues, confidence: confidence)
73
+ build_result(passed: true, verdict: verdict, stages: stages, issues: issues, confidence: confidence,
74
+ code: code, spec_code: spec_code, extra: extra)
65
75
  end
66
76
 
67
77
  private
@@ -75,6 +85,80 @@ module Legion
75
85
  {}
76
86
  end
77
87
 
88
+ def default_review_k
89
+ return 1 unless defined?(Legion::Settings)
90
+
91
+ Legion::Settings.dig(:codegen, :self_generate, :validation, :review_k) || 1
92
+ rescue StandardError => e
93
+ log.warn(e.message)
94
+ 1
95
+ end
96
+
97
+ def default_review_models
98
+ return [] unless defined?(Legion::Settings)
99
+
100
+ Legion::Settings.dig(:codegen, :self_generate, :validation, :review_models) || []
101
+ rescue StandardError => e
102
+ log.warn(e.message)
103
+ []
104
+ end
105
+
106
+ def provider_available?(provider_sym)
107
+ return false unless defined?(Legion::Settings)
108
+
109
+ Legion::Settings.dig(:llm, :providers, provider_sym, :enabled) == true
110
+ rescue StandardError => e
111
+ log.warn(e.message)
112
+ false
113
+ end
114
+
115
+ def build_model_assignments(count, models)
116
+ return Array.new(count) { nil } if models.nil? || models.empty?
117
+
118
+ available = models.select do |spec|
119
+ next false unless spec.is_a?(Hash)
120
+
121
+ provider_sym = spec[:provider]&.to_sym
122
+ if provider_sym && !provider_available?(provider_sym)
123
+ log.warn("review provider #{provider_sym} not available, skipping")
124
+ false
125
+ else
126
+ true
127
+ end
128
+ end
129
+
130
+ return Array.new(count) { nil } if available.empty?
131
+
132
+ Array.new(count) { |i| available[i % available.size] }
133
+ end
134
+
135
+ def adversarial_llm_review(code, context, count:, models: [])
136
+ assignments = build_model_assignments(count, models)
137
+
138
+ reviews = assignments.map { |spec| llm_review(code, context, model_spec: spec) }
139
+
140
+ approvals = reviews.count { |r| r[:confidence] >= 0.5 }
141
+ rejections = count - approvals
142
+ all_issues = reviews.flat_map { |r| r[:issues] || [] }.uniq
143
+
144
+ avg_confidence = reviews.sum { |r| r[:confidence] || 0.0 } / reviews.size
145
+
146
+ {
147
+ passed: approvals > rejections,
148
+ issues: all_issues,
149
+ confidence: avg_confidence,
150
+ k: count,
151
+ approvals: approvals,
152
+ rejections: rejections,
153
+ reviews: reviews
154
+ }
155
+ rescue StandardError => e
156
+ log.warn("adversarial review failed: #{e.message}")
157
+ fallback = llm_review(code, context)
158
+ fallback.merge(k: count, approvals: (fallback[:passed] ? 1 : 0),
159
+ rejections: (fallback[:passed] ? 0 : 1), reviews: [fallback])
160
+ end
161
+
78
162
  def check_syntax(code, spec_code)
79
163
  errors = []
80
164
  begin
@@ -121,19 +205,28 @@ module Legion
121
205
  { passed: false, output: '', errors: e.message, exit_code: -1 }
122
206
  end
123
207
 
124
- def llm_review(code, context)
208
+ def llm_review(code, context, model_spec: nil)
125
209
  return { passed: true, issues: [], confidence: 0.5 } unless defined?(Runners::AgenticReview)
126
210
 
211
+ extra_kwargs = {}
212
+ if model_spec
213
+ extra_kwargs[:model] = model_spec[:model] if model_spec[:model]
214
+ extra_kwargs[:provider] = model_spec[:provider] if model_spec[:provider]
215
+ end
216
+
127
217
  result = Runners::AgenticReview.review_output(
128
218
  input: context,
129
219
  output: code,
130
- review_prompt: 'Review this generated Ruby code for correctness, safety, and Legion conventions.'
220
+ review_prompt: 'Review this generated Ruby code for correctness, safety, and Legion conventions.',
221
+ **extra_kwargs
131
222
  )
132
223
 
133
224
  {
134
- passed: result[:reviewed] != false,
225
+ passed: result[:recommendation] == 'approve',
135
226
  issues: result[:issues] || [],
136
- confidence: result[:confidence] || 0.5
227
+ confidence: result[:confidence] || 0.5,
228
+ provider: model_spec&.dig(:provider),
229
+ model: model_spec&.dig(:model)
137
230
  }
138
231
  rescue StandardError => e
139
232
  log.warn("llm review failed: #{e.message}")
@@ -183,7 +276,7 @@ module Legion
183
276
  scores.sum / scores.size
184
277
  end
185
278
 
186
- def stage_scores(stages) # rubocop:disable Metrics/PerceivedComplexity
279
+ def stage_scores(stages)
187
280
  scores = []
188
281
  scores << (stage_passed?(stages[:syntax]) ? 1.0 : 0.0) if stages[:syntax]
189
282
  scores << (stage_passed?(stages[:security]) ? 1.0 : 0.0) if stages[:security]
@@ -193,8 +286,12 @@ module Legion
193
286
  scores
194
287
  end
195
288
 
196
- def build_result(passed:, verdict:, stages:, issues:, confidence:)
197
- { passed: passed, verdict: verdict, confidence: confidence, stages: stages, issues: issues }
289
+ def build_result(passed:, verdict:, stages:, issues:, confidence:, code: nil, spec_code: nil, extra: {})
290
+ result = { passed: passed, verdict: verdict, confidence: confidence, stages: stages, issues: issues }
291
+ result[:code] = code
292
+ result[:spec_code] = spec_code
293
+ extra.each { |k, v| result[k] = v unless result.key?(k) }
294
+ result
198
295
  end
199
296
  end
200
297
  end
@@ -5,6 +5,8 @@ module Legion
5
5
  module Eval
6
6
  module Runners
7
7
  module Evaluation
8
+ extend self # rubocop:disable Style/ModuleFunction
9
+
8
10
  def run_evaluation(evaluator_name:, evaluator_config: {}, inputs: [], **)
9
11
  evaluator = build_evaluator(evaluator_name, evaluator_config)
10
12
  results = inputs.map.with_index do |row, idx|
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Eval
6
- VERSION = '0.3.10'
6
+ VERSION = '0.3.13'
7
7
  end
8
8
  end
9
9
  end
@@ -22,7 +22,7 @@ if defined?(Legion::Transport::Exchange)
22
22
  require_relative 'eval/transport/messages/code_review_completed'
23
23
  end
24
24
 
25
- require_relative 'eval/actors/code_review_subscriber' if defined?(Legion::Extensions::Actors::Subscription)
25
+ require_relative 'eval/actors/code_review_subscriber'
26
26
 
27
27
  module Legion
28
28
  module Extensions
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-eval
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.10
4
+ version: 0.3.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson