lex-eval 0.3.11 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1ba8f0431d907def112bdd8f02a0685002fbfc75754bb658d620ea4889b03c70
4
- data.tar.gz: b38fcf01598441f544cf8721d9109010983b357be7085e4e0196edd9872cd16a
3
+ metadata.gz: bcf09bf5fdfb1a7c80e050e440c874f6e099ddd41ac0f85155733d1c4aa80bed
4
+ data.tar.gz: f1291ca88a8a4f7cf30abce7616bf8ab4145d1db0fe6ed6c0112e09482bda77b
5
5
  SHA512:
6
- metadata.gz: 678d325ad2c47aa74abfe3a8ec4f8c7eb63bb14beec4e8911c7912963ded1cc966dbe57dfe50b721e54cdd85df1ef67e6c04af73004c76f560e9b5b45c0ea132
7
- data.tar.gz: abd3cab893f187e0c51929cb9f91fa7f3806d250a339c6c137e6073589d0d6e0a18e9560dcdc9645b39f07c9471e9b93ee5ae0eaf3b74f421501383ba3951078
6
+ metadata.gz: 1df90ff664f437acbfa6eb6e1b7dd5efc7620a8b4ee73b8f11a4fb5b35f8b2ab7324d887186abf56c929a84799e68eba3a30a375b05760048fab6eb6f1078b2f
7
+ data.tar.gz: 1887981dc5dc141deb187f1d8ed7e2c5075c911d87b7a367773b93fd96ea6b52c6564b44a90ee8c32a1e9e6670c5ba381b371a8440fdad39b8ea9fda2d42cb9b
@@ -11,6 +11,7 @@ module Legion
11
11
 
12
12
  def runner_class = self.class
13
13
  def runner_function = 'action'
14
+ def check_subtask? = true
14
15
 
15
16
  def action(payload)
16
17
  code = payload[:runner_code] || payload[:code]
@@ -30,17 +31,13 @@ module Legion
30
31
  result
31
32
  rescue StandardError => e
32
33
  log.warn("CodeReviewSubscriber failed: #{e.message}")
33
- { passed: false, verdict: :reject, error: e.message }
34
+ { passed: false, verdict: 'reject', error: e.message }
34
35
  end
35
36
 
36
37
  private
37
38
 
38
39
  def log
39
- return Legion::Logging if defined?(Legion::Logging)
40
-
41
- @log ||= Object.new.tap do |nl|
42
- %i[debug info warn error fatal].each { |m| nl.define_singleton_method(m) { |*| nil } }
43
- end
40
+ Legion::Logging
44
41
  end
45
42
  end
46
43
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'legion/extensions/actors/subscription' unless defined?(Legion::Extensions::Actors::Subscription)
3
+ require 'legion/extensions/actors/subscription'
4
4
 
5
5
  module Legion
6
6
  module Extensions
@@ -30,12 +30,12 @@ module Legion
30
30
  false
31
31
  end
32
32
 
33
- def enabled?
34
- return false unless defined?(Legion::Transport)
33
+ def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects
34
+ return false unless Legion.const_defined?(:Transport, false)
35
35
  return false unless defined?(Legion::Extensions::Eval::Runners::Online)
36
36
 
37
37
  online_enabled?
38
- rescue StandardError
38
+ rescue StandardError => _e
39
39
  false
40
40
  end
41
41
 
@@ -31,7 +31,7 @@ module Legion
31
31
  def valid_json?(str)
32
32
  ::JSON.parse(str)
33
33
  true
34
- rescue ::JSON::ParserError
34
+ rescue ::JSON::ParserError => _e
35
35
  false
36
36
  end
37
37
  end
@@ -37,24 +37,24 @@ module Legion
37
37
  def evaluate_impl(input:, output:, expected:)
38
38
  prompt = render_template(input: input, output: output, expected: expected)
39
39
  evaluate_structured(prompt)
40
- rescue StandardError
40
+ rescue StandardError => _e
41
41
  evaluate_regex_fallback(prompt)
42
42
  end
43
43
 
44
44
  def evaluate_structured(prompt)
45
45
  return evaluate_regex_fallback(prompt) unless structured_available?
46
46
 
47
- result = Legion::LLM.structured(message: prompt, schema: JUDGE_SCHEMA,
47
+ result = Legion::LLM.structured(message: prompt, schema: JUDGE_SCHEMA, # rubocop:disable Legion/HelperMigration/DirectLlm
48
48
  intent: { capability: :reasoning },
49
49
  caller: { extension: 'lex-eval', operation: 'judge' })
50
50
  { score: result[:score], passed: result[:passed],
51
51
  explanation: result[:explanation], evidence: result[:evidence] || [] }
52
- rescue StandardError
52
+ rescue StandardError => _e
53
53
  evaluate_regex_fallback(prompt)
54
54
  end
55
55
 
56
56
  def evaluate_regex_fallback(prompt)
57
- response = Legion::LLM.chat(message: prompt, intent: { capability: :reasoning },
57
+ response = Legion::LLM.chat(message: prompt, intent: { capability: :reasoning }, # rubocop:disable Legion/HelperMigration/DirectLlm
58
58
  caller: { extension: 'lex-eval', operation: 'judge' })
59
59
  score = extract_score(response.content)
60
60
  { score: score, explanation: response.content, passed: score >= threshold, evidence: [] }
@@ -79,11 +79,7 @@ module Legion
79
79
  end
80
80
 
81
81
  def log
82
- return Legion::Logging if defined?(Legion::Logging)
83
-
84
- @log ||= Object.new.tap do |nl|
85
- %i[debug info warn error fatal].each { |m| nl.define_singleton_method(m) { |*| nil } }
86
- end
82
+ Legion::Logging
87
83
  end
88
84
  end
89
85
  end
@@ -4,7 +4,7 @@ module Legion
4
4
  module Extensions
5
5
  module Eval
6
6
  module Runners
7
- module AgenticReview
7
+ module AgenticReview # rubocop:disable Legion/Extension/RunnerIncludeHelpers
8
8
  REVIEW_SCHEMA = {
9
9
  type: :object,
10
10
  properties: {
@@ -23,7 +23,7 @@ module Legion
23
23
  required: %i[confidence recommendation explanation]
24
24
  }.freeze
25
25
 
26
- def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
26
+ def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
27
27
  prompt = build_review_message(review_prompt || default_review_prompt, input, output)
28
28
  llm_kwargs = {
29
29
  message: prompt, schema: REVIEW_SCHEMA,
@@ -32,14 +32,14 @@ module Legion
32
32
  }
33
33
  llm_kwargs[:model] = model if model
34
34
  llm_kwargs[:provider] = provider if provider
35
- Legion::LLM.structured(**llm_kwargs)
35
+ Legion::LLM.structured(**llm_kwargs) # rubocop:disable Legion/HelperMigration/DirectLlm
36
36
  rescue StandardError => e
37
- log.warn(e.message) if respond_to?(:log, true)
37
+ log.warn(e.message) if respond_to?(:log, true) # rubocop:disable Legion/HelperMigration/LoggingGuard
38
38
  { confidence: 0.0, recommendation: 'reject',
39
39
  issues: [], explanation: "review error: #{e.message}" }
40
40
  end
41
41
 
42
- def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
42
+ def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
43
43
  review = review_output(input: input, output: output, review_prompt: review_prompt,
44
44
  model: model, provider: provider)
45
45
  action, priority = determine_escalation(review[:confidence])
@@ -49,7 +49,7 @@ module Legion
49
49
  review.merge(action: action, escalated: true, priority: priority)
50
50
  end
51
51
 
52
- def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **) # rubocop:disable Metrics/ParameterLists
52
+ def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **)
53
53
  review_a = review_output(input: input, output: output_a, review_prompt: review_prompt,
54
54
  model: model, provider: provider)
55
55
  review_b = review_output(input: input, output: output_b, review_prompt: review_prompt,
@@ -73,7 +73,7 @@ module Legion
73
73
  review_a: review_a,
74
74
  review_b: review_b }
75
75
  rescue StandardError => e
76
- log.warn(e.message) if respond_to?(:log, true)
76
+ log.warn(e.message) if respond_to?(:log, true) # rubocop:disable Legion/HelperMigration/LoggingGuard
77
77
  { reviewed: false, reason: "experiment error: #{e.message}" }
78
78
  end
79
79
 
@@ -5,6 +5,8 @@ module Legion
5
5
  module Eval
6
6
  module Runners
7
7
  module Annotation
8
+ extend self
9
+
8
10
  def create_queue(name:, **opts)
9
11
  db[:annotation_queues].insert(
10
12
  name: name,
@@ -15,7 +17,7 @@ module Legion
15
17
  created_at: Time.now.utc
16
18
  )
17
19
  { created: true, name: name }
18
- rescue Sequel::UniqueConstraintViolation
20
+ rescue Sequel::UniqueConstraintViolation => _e
19
21
  { error: 'already_exists', name: name }
20
22
  end
21
23
 
@@ -13,7 +13,7 @@ module Legion
13
13
 
14
14
  SPEC_TIMEOUT = 30
15
15
 
16
- def review_generated(code:, spec_code:, context:, review_k: nil, review_models: nil) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity, Metrics/MethodLength
16
+ def review_generated(code:, spec_code:, context: {}, review_k: nil, review_models: nil, **extra) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
17
17
  settings = validation_settings
18
18
  stages = {}
19
19
  issues = []
@@ -22,8 +22,9 @@ module Legion
22
22
  if settings[:syntax_check] != false
23
23
  stages[:syntax] = check_syntax(code, spec_code)
24
24
  unless stages[:syntax][:passed]
25
- return build_result(passed: false, verdict: :reject, stages: stages,
26
- issues: stages[:syntax][:errors], confidence: 0.0)
25
+ return build_result(passed: false, verdict: 'reject', stages: stages,
26
+ issues: stages[:syntax][:errors], confidence: 0.0,
27
+ code: code, spec_code: spec_code, extra: extra)
27
28
  end
28
29
  end
29
30
 
@@ -31,7 +32,8 @@ module Legion
31
32
  stages[:security] = check_security(code)
32
33
  unless stages[:security][:passed]
33
34
  issues.concat(stages[:security][:flagged].map { |f| "security: #{f[:pattern]} on line #{f[:line]}" })
34
- return build_result(passed: false, verdict: :reject, stages: stages, issues: issues, confidence: 0.0)
35
+ return build_result(passed: false, verdict: 'reject', stages: stages, issues: issues, confidence: 0.0,
36
+ code: code, spec_code: spec_code, extra: extra)
35
37
  end
36
38
 
37
39
  # Stage 3: Spec execution (optional)
@@ -39,7 +41,8 @@ module Legion
39
41
  stages[:specs] = run_specs(code, spec_code)
40
42
  unless stages[:specs][:passed]
41
43
  issues << "specs failed: #{stages[:specs][:output]}"
42
- return build_result(passed: false, verdict: :revise, stages: stages, issues: issues, confidence: 0.2)
44
+ return build_result(passed: false, verdict: 'revise', stages: stages, issues: issues, confidence: 0.2,
45
+ code: code, spec_code: spec_code, extra: extra)
43
46
  end
44
47
  end
45
48
 
@@ -65,9 +68,10 @@ module Legion
65
68
  end
66
69
 
67
70
  confidence = calculate_confidence(stages)
68
- verdict = confidence >= 0.5 ? :approve : :revise
71
+ verdict = confidence >= 0.5 ? 'approve' : 'revise'
69
72
 
70
- build_result(passed: true, verdict: verdict, stages: stages, issues: issues, confidence: confidence)
73
+ build_result(passed: true, verdict: verdict, stages: stages, issues: issues, confidence: confidence,
74
+ code: code, spec_code: spec_code, extra: extra)
71
75
  end
72
76
 
73
77
  private
@@ -108,7 +112,7 @@ module Legion
108
112
  false
109
113
  end
110
114
 
111
- def build_model_assignments(count, models) # rubocop:disable Metrics/PerceivedComplexity
115
+ def build_model_assignments(count, models)
112
116
  return Array.new(count) { nil } if models.nil? || models.empty?
113
117
 
114
118
  available = models.select do |spec|
@@ -128,7 +132,7 @@ module Legion
128
132
  Array.new(count) { |i| available[i % available.size] }
129
133
  end
130
134
 
131
- def adversarial_llm_review(code, context, count:, models: []) # rubocop:disable Metrics/PerceivedComplexity
135
+ def adversarial_llm_review(code, context, count:, models: [])
132
136
  assignments = build_model_assignments(count, models)
133
137
 
134
138
  reviews = assignments.map { |spec| llm_review(code, context, model_spec: spec) }
@@ -201,7 +205,7 @@ module Legion
201
205
  { passed: false, output: '', errors: e.message, exit_code: -1 }
202
206
  end
203
207
 
204
- def llm_review(code, context, model_spec: nil) # rubocop:disable Metrics/PerceivedComplexity
208
+ def llm_review(code, context, model_spec: nil)
205
209
  return { passed: true, issues: [], confidence: 0.5 } unless defined?(Runners::AgenticReview)
206
210
 
207
211
  extra_kwargs = {}
@@ -272,7 +276,7 @@ module Legion
272
276
  scores.sum / scores.size
273
277
  end
274
278
 
275
- def stage_scores(stages) # rubocop:disable Metrics/PerceivedComplexity
279
+ def stage_scores(stages)
276
280
  scores = []
277
281
  scores << (stage_passed?(stages[:syntax]) ? 1.0 : 0.0) if stages[:syntax]
278
282
  scores << (stage_passed?(stages[:security]) ? 1.0 : 0.0) if stages[:security]
@@ -282,8 +286,12 @@ module Legion
282
286
  scores
283
287
  end
284
288
 
285
- def build_result(passed:, verdict:, stages:, issues:, confidence:)
286
- { passed: passed, verdict: verdict, confidence: confidence, stages: stages, issues: issues }
289
+ def build_result(passed:, verdict:, stages:, issues:, confidence:, code: nil, spec_code: nil, extra: {})
290
+ result = { passed: passed, verdict: verdict, confidence: confidence, stages: stages, issues: issues }
291
+ result[:code] = code
292
+ result[:spec_code] = spec_code
293
+ extra.each { |k, v| result[k] = v unless result.key?(k) }
294
+ result
287
295
  end
288
296
  end
289
297
  end
@@ -5,6 +5,8 @@ module Legion
5
5
  module Eval
6
6
  module Runners
7
7
  module Evaluation
8
+ extend self # rubocop:disable Style/ModuleFunction
9
+
8
10
  def run_evaluation(evaluator_name:, evaluator_config: {}, inputs: [], **)
9
11
  evaluator = build_evaluator(evaluator_name, evaluator_config)
10
12
  results = inputs.map.with_index do |row, idx|
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Eval
6
- VERSION = '0.3.11'
6
+ VERSION = '0.3.13'
7
7
  end
8
8
  end
9
9
  end
@@ -22,7 +22,7 @@ if defined?(Legion::Transport::Exchange)
22
22
  require_relative 'eval/transport/messages/code_review_completed'
23
23
  end
24
24
 
25
- require_relative 'eval/actors/code_review_subscriber' if defined?(Legion::Extensions::Actors::Subscription)
25
+ require_relative 'eval/actors/code_review_subscriber'
26
26
 
27
27
  module Legion
28
28
  module Extensions
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-eval
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.11
4
+ version: 0.3.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson