lex-eval 0.3.11 → 0.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/eval/actors/code_review_subscriber.rb +3 -6
- data/lib/legion/extensions/eval/actors/online.rb +4 -4
- data/lib/legion/extensions/eval/evaluators/code_evaluator.rb +1 -1
- data/lib/legion/extensions/eval/evaluators/llm_judge.rb +4 -4
- data/lib/legion/extensions/eval/helpers/guardrails.rb +1 -5
- data/lib/legion/extensions/eval/runners/agentic_review.rb +7 -7
- data/lib/legion/extensions/eval/runners/annotation.rb +3 -1
- data/lib/legion/extensions/eval/runners/code_review.rb +21 -13
- data/lib/legion/extensions/eval/runners/evaluation.rb +2 -0
- data/lib/legion/extensions/eval/version.rb +1 -1
- data/lib/legion/extensions/eval.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bcf09bf5fdfb1a7c80e050e440c874f6e099ddd41ac0f85155733d1c4aa80bed
|
|
4
|
+
data.tar.gz: f1291ca88a8a4f7cf30abce7616bf8ab4145d1db0fe6ed6c0112e09482bda77b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1df90ff664f437acbfa6eb6e1b7dd5efc7620a8b4ee73b8f11a4fb5b35f8b2ab7324d887186abf56c929a84799e68eba3a30a375b05760048fab6eb6f1078b2f
|
|
7
|
+
data.tar.gz: 1887981dc5dc141deb187f1d8ed7e2c5075c911d87b7a367773b93fd96ea6b52c6564b44a90ee8c32a1e9e6670c5ba381b371a8440fdad39b8ea9fda2d42cb9b
|
|
@@ -11,6 +11,7 @@ module Legion
|
|
|
11
11
|
|
|
12
12
|
def runner_class = self.class
|
|
13
13
|
def runner_function = 'action'
|
|
14
|
+
def check_subtask? = true
|
|
14
15
|
|
|
15
16
|
def action(payload)
|
|
16
17
|
code = payload[:runner_code] || payload[:code]
|
|
@@ -30,17 +31,13 @@ module Legion
|
|
|
30
31
|
result
|
|
31
32
|
rescue StandardError => e
|
|
32
33
|
log.warn("CodeReviewSubscriber failed: #{e.message}")
|
|
33
|
-
{ passed: false, verdict:
|
|
34
|
+
{ passed: false, verdict: 'reject', error: e.message }
|
|
34
35
|
end
|
|
35
36
|
|
|
36
37
|
private
|
|
37
38
|
|
|
38
39
|
def log
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@log ||= Object.new.tap do |nl|
|
|
42
|
-
%i[debug info warn error fatal].each { |m| nl.define_singleton_method(m) { |*| nil } }
|
|
43
|
-
end
|
|
40
|
+
Legion::Logging
|
|
44
41
|
end
|
|
45
42
|
end
|
|
46
43
|
end
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'legion/extensions/actors/subscription'
|
|
3
|
+
require 'legion/extensions/actors/subscription'
|
|
4
4
|
|
|
5
5
|
module Legion
|
|
6
6
|
module Extensions
|
|
@@ -30,12 +30,12 @@ module Legion
|
|
|
30
30
|
false
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
def enabled?
|
|
34
|
-
return false unless
|
|
33
|
+
def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects
|
|
34
|
+
return false unless Legion.const_defined?(:Transport, false)
|
|
35
35
|
return false unless defined?(Legion::Extensions::Eval::Runners::Online)
|
|
36
36
|
|
|
37
37
|
online_enabled?
|
|
38
|
-
rescue StandardError
|
|
38
|
+
rescue StandardError => _e
|
|
39
39
|
false
|
|
40
40
|
end
|
|
41
41
|
|
|
@@ -37,24 +37,24 @@ module Legion
|
|
|
37
37
|
def evaluate_impl(input:, output:, expected:)
|
|
38
38
|
prompt = render_template(input: input, output: output, expected: expected)
|
|
39
39
|
evaluate_structured(prompt)
|
|
40
|
-
rescue StandardError
|
|
40
|
+
rescue StandardError => _e
|
|
41
41
|
evaluate_regex_fallback(prompt)
|
|
42
42
|
end
|
|
43
43
|
|
|
44
44
|
def evaluate_structured(prompt)
|
|
45
45
|
return evaluate_regex_fallback(prompt) unless structured_available?
|
|
46
46
|
|
|
47
|
-
result = Legion::LLM.structured(message: prompt, schema: JUDGE_SCHEMA,
|
|
47
|
+
result = Legion::LLM.structured(message: prompt, schema: JUDGE_SCHEMA, # rubocop:disable Legion/HelperMigration/DirectLlm
|
|
48
48
|
intent: { capability: :reasoning },
|
|
49
49
|
caller: { extension: 'lex-eval', operation: 'judge' })
|
|
50
50
|
{ score: result[:score], passed: result[:passed],
|
|
51
51
|
explanation: result[:explanation], evidence: result[:evidence] || [] }
|
|
52
|
-
rescue StandardError
|
|
52
|
+
rescue StandardError => _e
|
|
53
53
|
evaluate_regex_fallback(prompt)
|
|
54
54
|
end
|
|
55
55
|
|
|
56
56
|
def evaluate_regex_fallback(prompt)
|
|
57
|
-
response = Legion::LLM.chat(message: prompt, intent: { capability: :reasoning },
|
|
57
|
+
response = Legion::LLM.chat(message: prompt, intent: { capability: :reasoning }, # rubocop:disable Legion/HelperMigration/DirectLlm
|
|
58
58
|
caller: { extension: 'lex-eval', operation: 'judge' })
|
|
59
59
|
score = extract_score(response.content)
|
|
60
60
|
{ score: score, explanation: response.content, passed: score >= threshold, evidence: [] }
|
|
@@ -79,11 +79,7 @@ module Legion
|
|
|
79
79
|
end
|
|
80
80
|
|
|
81
81
|
def log
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
@log ||= Object.new.tap do |nl|
|
|
85
|
-
%i[debug info warn error fatal].each { |m| nl.define_singleton_method(m) { |*| nil } }
|
|
86
|
-
end
|
|
82
|
+
Legion::Logging
|
|
87
83
|
end
|
|
88
84
|
end
|
|
89
85
|
end
|
|
@@ -4,7 +4,7 @@ module Legion
|
|
|
4
4
|
module Extensions
|
|
5
5
|
module Eval
|
|
6
6
|
module Runners
|
|
7
|
-
module AgenticReview
|
|
7
|
+
module AgenticReview # rubocop:disable Legion/Extension/RunnerIncludeHelpers
|
|
8
8
|
REVIEW_SCHEMA = {
|
|
9
9
|
type: :object,
|
|
10
10
|
properties: {
|
|
@@ -23,7 +23,7 @@ module Legion
|
|
|
23
23
|
required: %i[confidence recommendation explanation]
|
|
24
24
|
}.freeze
|
|
25
25
|
|
|
26
|
-
def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
|
|
26
|
+
def review_output(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
|
|
27
27
|
prompt = build_review_message(review_prompt || default_review_prompt, input, output)
|
|
28
28
|
llm_kwargs = {
|
|
29
29
|
message: prompt, schema: REVIEW_SCHEMA,
|
|
@@ -32,14 +32,14 @@ module Legion
|
|
|
32
32
|
}
|
|
33
33
|
llm_kwargs[:model] = model if model
|
|
34
34
|
llm_kwargs[:provider] = provider if provider
|
|
35
|
-
Legion::LLM.structured(**llm_kwargs)
|
|
35
|
+
Legion::LLM.structured(**llm_kwargs) # rubocop:disable Legion/HelperMigration/DirectLlm
|
|
36
36
|
rescue StandardError => e
|
|
37
|
-
log.warn(e.message) if respond_to?(:log, true)
|
|
37
|
+
log.warn(e.message) if respond_to?(:log, true) # rubocop:disable Legion/HelperMigration/LoggingGuard
|
|
38
38
|
{ confidence: 0.0, recommendation: 'reject',
|
|
39
39
|
issues: [], explanation: "review error: #{e.message}" }
|
|
40
40
|
end
|
|
41
41
|
|
|
42
|
-
def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
|
|
42
|
+
def review_with_escalation(input:, output:, review_prompt: nil, model: nil, provider: nil, **)
|
|
43
43
|
review = review_output(input: input, output: output, review_prompt: review_prompt,
|
|
44
44
|
model: model, provider: provider)
|
|
45
45
|
action, priority = determine_escalation(review[:confidence])
|
|
@@ -49,7 +49,7 @@ module Legion
|
|
|
49
49
|
review.merge(action: action, escalated: true, priority: priority)
|
|
50
50
|
end
|
|
51
51
|
|
|
52
|
-
def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **)
|
|
52
|
+
def review_experiment(input:, output_a:, output_b:, review_prompt: nil, model: nil, provider: nil, **)
|
|
53
53
|
review_a = review_output(input: input, output: output_a, review_prompt: review_prompt,
|
|
54
54
|
model: model, provider: provider)
|
|
55
55
|
review_b = review_output(input: input, output: output_b, review_prompt: review_prompt,
|
|
@@ -73,7 +73,7 @@ module Legion
|
|
|
73
73
|
review_a: review_a,
|
|
74
74
|
review_b: review_b }
|
|
75
75
|
rescue StandardError => e
|
|
76
|
-
log.warn(e.message) if respond_to?(:log, true)
|
|
76
|
+
log.warn(e.message) if respond_to?(:log, true) # rubocop:disable Legion/HelperMigration/LoggingGuard
|
|
77
77
|
{ reviewed: false, reason: "experiment error: #{e.message}" }
|
|
78
78
|
end
|
|
79
79
|
|
|
@@ -5,6 +5,8 @@ module Legion
|
|
|
5
5
|
module Eval
|
|
6
6
|
module Runners
|
|
7
7
|
module Annotation
|
|
8
|
+
extend self
|
|
9
|
+
|
|
8
10
|
def create_queue(name:, **opts)
|
|
9
11
|
db[:annotation_queues].insert(
|
|
10
12
|
name: name,
|
|
@@ -15,7 +17,7 @@ module Legion
|
|
|
15
17
|
created_at: Time.now.utc
|
|
16
18
|
)
|
|
17
19
|
{ created: true, name: name }
|
|
18
|
-
rescue Sequel::UniqueConstraintViolation
|
|
20
|
+
rescue Sequel::UniqueConstraintViolation => _e
|
|
19
21
|
{ error: 'already_exists', name: name }
|
|
20
22
|
end
|
|
21
23
|
|
|
@@ -13,7 +13,7 @@ module Legion
|
|
|
13
13
|
|
|
14
14
|
SPEC_TIMEOUT = 30
|
|
15
15
|
|
|
16
|
-
def review_generated(code:, spec_code:, context
|
|
16
|
+
def review_generated(code:, spec_code:, context: {}, review_k: nil, review_models: nil, **extra) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
17
17
|
settings = validation_settings
|
|
18
18
|
stages = {}
|
|
19
19
|
issues = []
|
|
@@ -22,8 +22,9 @@ module Legion
|
|
|
22
22
|
if settings[:syntax_check] != false
|
|
23
23
|
stages[:syntax] = check_syntax(code, spec_code)
|
|
24
24
|
unless stages[:syntax][:passed]
|
|
25
|
-
return build_result(passed: false, verdict:
|
|
26
|
-
issues: stages[:syntax][:errors], confidence: 0.0
|
|
25
|
+
return build_result(passed: false, verdict: 'reject', stages: stages,
|
|
26
|
+
issues: stages[:syntax][:errors], confidence: 0.0,
|
|
27
|
+
code: code, spec_code: spec_code, extra: extra)
|
|
27
28
|
end
|
|
28
29
|
end
|
|
29
30
|
|
|
@@ -31,7 +32,8 @@ module Legion
|
|
|
31
32
|
stages[:security] = check_security(code)
|
|
32
33
|
unless stages[:security][:passed]
|
|
33
34
|
issues.concat(stages[:security][:flagged].map { |f| "security: #{f[:pattern]} on line #{f[:line]}" })
|
|
34
|
-
return build_result(passed: false, verdict:
|
|
35
|
+
return build_result(passed: false, verdict: 'reject', stages: stages, issues: issues, confidence: 0.0,
|
|
36
|
+
code: code, spec_code: spec_code, extra: extra)
|
|
35
37
|
end
|
|
36
38
|
|
|
37
39
|
# Stage 3: Spec execution (optional)
|
|
@@ -39,7 +41,8 @@ module Legion
|
|
|
39
41
|
stages[:specs] = run_specs(code, spec_code)
|
|
40
42
|
unless stages[:specs][:passed]
|
|
41
43
|
issues << "specs failed: #{stages[:specs][:output]}"
|
|
42
|
-
return build_result(passed: false, verdict:
|
|
44
|
+
return build_result(passed: false, verdict: 'revise', stages: stages, issues: issues, confidence: 0.2,
|
|
45
|
+
code: code, spec_code: spec_code, extra: extra)
|
|
43
46
|
end
|
|
44
47
|
end
|
|
45
48
|
|
|
@@ -65,9 +68,10 @@ module Legion
|
|
|
65
68
|
end
|
|
66
69
|
|
|
67
70
|
confidence = calculate_confidence(stages)
|
|
68
|
-
verdict = confidence >= 0.5 ?
|
|
71
|
+
verdict = confidence >= 0.5 ? 'approve' : 'revise'
|
|
69
72
|
|
|
70
|
-
build_result(passed: true, verdict: verdict, stages: stages, issues: issues, confidence: confidence
|
|
73
|
+
build_result(passed: true, verdict: verdict, stages: stages, issues: issues, confidence: confidence,
|
|
74
|
+
code: code, spec_code: spec_code, extra: extra)
|
|
71
75
|
end
|
|
72
76
|
|
|
73
77
|
private
|
|
@@ -108,7 +112,7 @@ module Legion
|
|
|
108
112
|
false
|
|
109
113
|
end
|
|
110
114
|
|
|
111
|
-
def build_model_assignments(count, models)
|
|
115
|
+
def build_model_assignments(count, models)
|
|
112
116
|
return Array.new(count) { nil } if models.nil? || models.empty?
|
|
113
117
|
|
|
114
118
|
available = models.select do |spec|
|
|
@@ -128,7 +132,7 @@ module Legion
|
|
|
128
132
|
Array.new(count) { |i| available[i % available.size] }
|
|
129
133
|
end
|
|
130
134
|
|
|
131
|
-
def adversarial_llm_review(code, context, count:, models: [])
|
|
135
|
+
def adversarial_llm_review(code, context, count:, models: [])
|
|
132
136
|
assignments = build_model_assignments(count, models)
|
|
133
137
|
|
|
134
138
|
reviews = assignments.map { |spec| llm_review(code, context, model_spec: spec) }
|
|
@@ -201,7 +205,7 @@ module Legion
|
|
|
201
205
|
{ passed: false, output: '', errors: e.message, exit_code: -1 }
|
|
202
206
|
end
|
|
203
207
|
|
|
204
|
-
def llm_review(code, context, model_spec: nil)
|
|
208
|
+
def llm_review(code, context, model_spec: nil)
|
|
205
209
|
return { passed: true, issues: [], confidence: 0.5 } unless defined?(Runners::AgenticReview)
|
|
206
210
|
|
|
207
211
|
extra_kwargs = {}
|
|
@@ -272,7 +276,7 @@ module Legion
|
|
|
272
276
|
scores.sum / scores.size
|
|
273
277
|
end
|
|
274
278
|
|
|
275
|
-
def stage_scores(stages)
|
|
279
|
+
def stage_scores(stages)
|
|
276
280
|
scores = []
|
|
277
281
|
scores << (stage_passed?(stages[:syntax]) ? 1.0 : 0.0) if stages[:syntax]
|
|
278
282
|
scores << (stage_passed?(stages[:security]) ? 1.0 : 0.0) if stages[:security]
|
|
@@ -282,8 +286,12 @@ module Legion
|
|
|
282
286
|
scores
|
|
283
287
|
end
|
|
284
288
|
|
|
285
|
-
def build_result(passed:, verdict:, stages:, issues:, confidence:)
|
|
286
|
-
{ passed: passed, verdict: verdict, confidence: confidence, stages: stages, issues: issues }
|
|
289
|
+
def build_result(passed:, verdict:, stages:, issues:, confidence:, code: nil, spec_code: nil, extra: {})
|
|
290
|
+
result = { passed: passed, verdict: verdict, confidence: confidence, stages: stages, issues: issues }
|
|
291
|
+
result[:code] = code
|
|
292
|
+
result[:spec_code] = spec_code
|
|
293
|
+
extra.each { |k, v| result[k] = v unless result.key?(k) }
|
|
294
|
+
result
|
|
287
295
|
end
|
|
288
296
|
end
|
|
289
297
|
end
|
|
@@ -5,6 +5,8 @@ module Legion
|
|
|
5
5
|
module Eval
|
|
6
6
|
module Runners
|
|
7
7
|
module Evaluation
|
|
8
|
+
extend self # rubocop:disable Style/ModuleFunction
|
|
9
|
+
|
|
8
10
|
def run_evaluation(evaluator_name:, evaluator_config: {}, inputs: [], **)
|
|
9
11
|
evaluator = build_evaluator(evaluator_name, evaluator_config)
|
|
10
12
|
results = inputs.map.with_index do |row, idx|
|
|
@@ -22,7 +22,7 @@ if defined?(Legion::Transport::Exchange)
|
|
|
22
22
|
require_relative 'eval/transport/messages/code_review_completed'
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
-
require_relative 'eval/actors/code_review_subscriber'
|
|
25
|
+
require_relative 'eval/actors/code_review_subscriber'
|
|
26
26
|
|
|
27
27
|
module Legion
|
|
28
28
|
module Extensions
|