legion-llm 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/lib/legion/llm/arbitrage.rb +105 -0
- data/lib/legion/llm/batch.rb +115 -0
- data/lib/legion/llm/hooks/rag_guard.rb +72 -0
- data/lib/legion/llm/hooks/response_guard.rb +47 -0
- data/lib/legion/llm/hooks.rb +3 -0
- data/lib/legion/llm/scheduling.rb +99 -0
- data/lib/legion/llm/settings.rb +32 -1
- data/lib/legion/llm/version.rb +1 -1
- data/lib/legion/llm.rb +26 -3
- metadata +6 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fcbccb6df1f7211cd57464dd7a0393d9c90caa73fd504fd8e066e3642ff6f564
|
|
4
|
+
data.tar.gz: 3a103e1db523cdf97cbc0ce2d57981a5f81ffa643ea694b2a7a1e290d1f6971d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8fc88260d2aaa5d3f6cb78e21e6b102bb44950f20bdca257a82284c8a3a9afb2246a72bf2c223b20663271a29d4c2d63617550de535e5d4cc3538583ab51842c
|
|
7
|
+
data.tar.gz: 5e29c883a22532c9e873126924502d89b6e97ea89cf0b4921960e155c9aa665a8690eba712632b0c7221419382c5a8eaa7442e843518274c186069d50438ff42
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
# Legion LLM Changelog
|
|
2
2
|
|
|
3
|
+
## [0.3.14] - 2026-03-21
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `Legion::LLM::Arbitrage` module for cost-aware model selection: configurable cost table (per-1M-token input/output prices), `cheapest_for(capability:, max_cost:)` filters eligible models and returns the cheapest, `estimated_cost` for per-request USD estimates, settings-defined cost_table overrides, quality_floor and capability-tier filtering
|
|
7
|
+
- `Legion::LLM::Batch` module for non-urgent request batching: `enqueue` stores requests in an in-process queue with UUID tracking, `flush` groups by provider/model and invokes callbacks, configurable window_seconds and max_batch_size, `reset!` for test isolation
|
|
8
|
+
- `Legion::LLM::Scheduling` module for off-peak deferral: `should_defer?(intent:, urgency:)` checks configurable peak hours and intent eligibility, `peak_hours?` evaluates UTC hour against configurable range, `next_off_peak` returns next off-peak window capped at max_defer_hours
|
|
9
|
+
- Default settings for all three features under `llm.arbitrage`, `llm.batch`, `llm.scheduling` — all disabled by default (opt-in)
|
|
10
|
+
- 3 new spec files: `arbitrage_spec.rb` (18 examples), `batch_spec.rb` (16 examples), `scheduling_spec.rb` (24 examples)
|
|
11
|
+
|
|
12
|
+
## [0.3.13] - 2026-03-21
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
- `Legion::LLM::Hooks::RagGuard` module with `check_rag_faithfulness` for post-generation RAG faithfulness evaluation via lex-eval
|
|
16
|
+
- `Legion::LLM::Hooks::ResponseGuard` module with `guard_response` as the central dispatch point for post-generation safety checks
|
|
17
|
+
- Response guard wired into `_dispatch_chat`: fires when `Legion::Settings[:llm][:response_guards][:enabled]` is true, attaches `_guard_result` metadata to the response hash without blocking
|
|
18
|
+
- RAG guard skips gracefully when lex-eval is unavailable (returns `reason: :eval_unavailable`) or context is not provided (returns `reason: :no_context`)
|
|
19
|
+
- Settings keys: `llm.rag_guard.enabled`, `llm.rag_guard.threshold` (default 0.7), `llm.rag_guard.evaluators` (default `[:faithfulness, :rag_relevancy]`)
|
|
20
|
+
- 19 new specs in `spec/legion/llm/hooks/rag_guard_spec.rb` and `spec/legion/llm/hooks/response_guard_spec.rb`
|
|
21
|
+
|
|
3
22
|
## [0.3.12] - 2026-03-19
|
|
4
23
|
|
|
5
24
|
### Added
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    # Cost-aware model selection. Given a capability tier and an optional
    # per-request cost ceiling, picks the cheapest model from a per-1M-token
    # price table. The table can be extended/overridden through settings
    # (llm.arbitrage.cost_table); the whole feature is opt-in via
    # llm.arbitrage.enabled.
    module Arbitrage
      # Default cost table: per-1M-token input/output prices in USD.
      # Overridable via settings: llm.arbitrage.cost_table
      DEFAULT_COST_TABLE = {
        'claude-sonnet-4-6' => { input: 3.0, output: 15.0 },
        'us.anthropic.claude-sonnet-4-6-v1' => { input: 3.0, output: 15.0 },
        'gpt-4o' => { input: 2.5, output: 10.0 },
        'gpt-4o-mini' => { input: 0.15, output: 0.60 },
        'gemini-2.0-flash' => { input: 0.10, output: 0.40 },
        'llama3' => { input: 0.0, output: 0.0 }
      }.freeze

      class << self
        # True only when settings explicitly set llm.arbitrage.enabled to true;
        # any other value (or missing settings) counts as disabled.
        def enabled?
          settings[:enabled] == true
        end

        # Estimates the USD cost of one request against the price table.
        #
        # @param model [String] model identifier
        # @param input_tokens [Integer] estimated number of input tokens
        # @param output_tokens [Integer] estimated number of output tokens
        # @return [Float, nil] estimated cost in USD, or nil if model not in table
        def estimated_cost(model:, input_tokens: 1000, output_tokens: 500)
          prices = cost_table[model.to_s]
          return nil unless prices

          input_cost = prices[:input] * input_tokens
          output_cost = prices[:output] * output_tokens
          (input_cost + output_cost) / 1_000_000.0
        end

        # Selects the cheapest model that satisfies the capability tier and
        # (optionally) a per-request cost ceiling. Returns nil when arbitrage
        # is disabled or nothing qualifies.
        #
        # @param capability [String, Symbol] required capability tier (e.g., :basic, :moderate, :reasoning)
        # @param max_cost [Float, nil] maximum acceptable cost per typical request (USD); nil means no limit
        # @param input_tokens [Integer] estimated input tokens for cost calculation
        # @param output_tokens [Integer] estimated output tokens for cost calculation
        # @return [String, nil] cheapest eligible model ID, or nil if none qualify
        def cheapest_for(capability: :moderate, max_cost: nil, input_tokens: 1000, output_tokens: 500)
          return nil unless enabled?

          floor = settings.fetch(:quality_floor, 0.7)
          candidates = eligible_models(capability: capability, _quality_floor: floor)

          # Price every candidate, dropping unknown models and anything over budget.
          priced = candidates.each_with_object({}) do |candidate, acc|
            price = estimated_cost(model: candidate,
                                   input_tokens: input_tokens,
                                   output_tokens: output_tokens)
            next if price.nil?
            next if max_cost && price > max_cost

            acc[candidate] = price
          end

          priced.min_by { |_name, price| price }&.first
        end

        # Returns the effective cost table: DEFAULT_COST_TABLE plus any
        # settings-defined overrides (keys normalized to strings, price keys
        # to symbols).
        def cost_table
          extra = settings.fetch(:cost_table, {})
          return DEFAULT_COST_TABLE if extra.nil? || extra.empty?

          extra.each_with_object(DEFAULT_COST_TABLE.dup) do |(name, prices), table|
            table[name.to_s] = prices.transform_keys(&:to_sym)
          end
        end

        private

        # Fetches the llm.arbitrage settings subtree with symbolized keys;
        # returns {} when settings are absent, malformed, or raise.
        def settings
          root = Legion::Settings[:llm]
          return {} unless root.is_a?(Hash)

          section = root[:arbitrage] || root['arbitrage'] || {}
          section.is_a?(Hash) ? section.transform_keys(&:to_sym) : {}
        rescue StandardError
          {}
        end

        # Returns models eligible for the given capability tier.
        # :reasoning excludes the cheap/local tier; every other tier allows
        # the full table. _quality_floor is reserved for future scoring
        # integration and is currently unused.
        def eligible_models(capability:, _quality_floor: 0.7)
          tier = capability.to_sym
          non_frontier = %w[gpt-4o-mini gemini-2.0-flash llama3]

          cost_table.keys.select do |model|
            tier != :reasoning || !non_frontier.include?(model)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'securerandom'

module Legion
  module LLM
    # In-process queue for deferring non-urgent LLM requests so they can be
    # flushed together later. NOTE(review): the queue is a plain Array with
    # no locking — callers must serialize access if using it from multiple
    # threads.
    module Batch
      class << self
        # Returns true when request batching is enabled in settings.
        def enabled?
          settings.fetch(:enabled, false) == true
        end

        # Enqueues a request for deferred batch processing.
        #
        # @param messages [Array<Hash>] chat messages array
        # @param model [String] model to use
        # @param provider [Symbol, nil] provider override
        # @param callback [Proc, nil] called with result hash when batch is flushed
        # @param priority [Symbol] :normal or :low (informational only)
        # @param opts [Hash] additional options forwarded to provider
        # @return [String] batch_request_id
        def enqueue(messages:, model:, callback: nil, provider: nil, priority: :normal, **opts)
          request_id = SecureRandom.uuid

          queue << {
            id: request_id,
            messages: messages,
            model: model,
            provider: provider,
            callback: callback,
            priority: priority,
            opts: opts,
            queued_at: Time.now.utc
          }

          log_debug "Legion::LLM::Batch enqueued #{request_id} (queue size: #{queue.size})"
          request_id
        end

        # Flushes accumulated requests up to max_size.
        # Groups entries by provider+model and invokes callbacks with a stub result.
        # In production this would submit to provider batch APIs; here it logs and
        # returns per-request result hashes for callback delivery.
        #
        # @param max_size [Integer] maximum number of requests to flush in one pass
        # @param max_wait [Integer] only flush entries older than this many seconds (0 = all)
        # @return [Array<Hash>] array of { id:, status:, result: } hashes
        def flush(max_size: nil, max_wait: nil)
          effective_max = max_size || settings.fetch(:max_batch_size, 100)
          effective_wait = max_wait || settings.fetch(:window_seconds, 300)

          cutoff = Time.now.utc - effective_wait
          to_flush = queue.select { |e| e[:queued_at] <= cutoff }.first(effective_max)

          return [] if to_flush.empty?

          # Remove flushed entries in one pass by id (entries carry unique UUIDs),
          # instead of repeated Array#delete scans.
          flushed_ids = to_flush.map { |e| e[:id] }
          queue.reject! { |e| flushed_ids.include?(e[:id]) }
          log_debug "Legion::LLM::Batch flushing #{to_flush.size} request(s)"

          groups = to_flush.group_by { |e| [e[:provider], e[:model]] }
          results = []

          groups.each do |(provider, model), entries|
            entries.each do |entry|
              result = submit_single(entry, provider: provider, model: model)
              entry[:callback]&.call(result)
              results << { id: entry[:id], status: result[:status], result: result }
            end
          end

          results
        end

        # Returns the current number of requests in the queue.
        def queue_size
          queue.size
        end

        # Clears the queue (useful for testing).
        def reset!
          @queue = []
        end

        private

        def queue
          @queue ||= []
        end

        # Logs via Legion::Logging when it is loaded; no-ops otherwise so the
        # module stays usable in minimal environments (same guard style the
        # RagGuard/ResponseGuard hooks use before touching Legion::Logging).
        def log_debug(message)
          Legion::Logging.debug(message) if defined?(Legion::Logging)
        end

        # Fetches the llm.batch settings subtree with symbolized keys;
        # returns {} when settings are absent, malformed, or raise.
        def settings
          llm = Legion::Settings[:llm]
          return {} unless llm.is_a?(Hash)

          b = llm[:batch] || llm['batch'] || {}
          b.is_a?(Hash) ? b.transform_keys(&:to_sym) : {}
        rescue StandardError
          {}
        end

        # Builds the stub per-request result delivered to callbacks.
        # Real provider batch submission would replace this.
        def submit_single(entry, provider:, model:)
          {
            status: :batched,
            model: model,
            provider: provider,
            id: entry[:id],
            response: nil,
            meta: { batched: true, queued_at: entry[:queued_at] }
          }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    module Hooks
      # Post-generation RAG faithfulness evaluation backed by lex-eval.
      # Fails open by design: when the evaluator is unavailable or errors,
      # the response is treated as faithful and a :reason is returned.
      module RagGuard
        class << self
          # Scores a response against its retrieval context with each
          # configured evaluator and flags those scoring below the threshold.
          #
          # @return [Hash] { faithful:, scores:, flagged_evaluators:, details: }
          #   or { faithful: true, reason: ... } when evaluation is skipped.
          def check_rag_faithfulness(response:, context:, threshold: nil, evaluators: nil, **)
            return { faithful: true, reason: :eval_unavailable } unless eval_available?

            limit = threshold || settings_threshold
            names = evaluators || settings_evaluators

            scores = names.to_h do |name|
              [name, run_evaluator(name, response: response, context: context)]
            end
            flagged = scores.select { |_name, score| score < limit }.keys
            passed = flagged.empty?

            {
              faithful: passed,
              scores: scores,
              flagged_evaluators: flagged,
              details: build_details(scores, limit, passed)
            }
          rescue StandardError => e
            Legion::Logging.warn "RagGuard evaluation error: #{e.message}" if logging_available?
            { faithful: true, reason: :eval_error }
          end

          private

          # Truthy when the lex-eval client class has been loaded.
          def eval_available?
            defined?(Legion::Extensions::Eval::Client)
          end

          def logging_available?
            Legion.const_defined?('Logging')
          end

          # Threshold from llm.rag_guard.threshold, defaulting to 0.7.
          def settings_threshold
            configured = Legion::Settings.dig(:llm, :rag_guard, :threshold) if Legion.const_defined?('Settings')
            configured || 0.7
          end

          # Evaluator list from llm.rag_guard.evaluators, defaulting to
          # faithfulness + rag_relevancy.
          def settings_evaluators
            configured = Legion::Settings.dig(:llm, :rag_guard, :evaluators) if Legion.const_defined?('Settings')
            configured || %i[faithfulness rag_relevancy]
          end

          # Runs one evaluator and returns its average score; any failure
          # (including a missing summary) scores 0.0, which will flag the
          # evaluator against any positive threshold.
          def run_evaluator(evaluator_name, response:, context:)
            payload = [{ input: context.to_s, output: response.to_s, expected: nil }]
            outcome = Legion::Extensions::Eval::Client.new.run_evaluation(
              evaluator_name: evaluator_name,
              inputs: payload
            )
            outcome.dig(:summary, :avg_score) || 0.0
          rescue StandardError
            0.0
          end

          # Human-readable one-line summary of the evaluation outcome.
          def build_details(scores, threshold, faithful)
            rendered = scores.map { |name, score| "#{name}=#{score.round(3)}" }.join(', ')
            "RAG faithfulness check #{faithful ? 'passed' : 'failed'} (threshold=#{threshold}): #{rendered}"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    module Hooks
      # Central dispatch point for post-generation safety checks.
      # Fails open: any error yields { passed: true, guards: {} } so guards
      # never block a response.
      module ResponseGuard
        # Known guard modules, keyed by guard name.
        GUARD_REGISTRY = {
          rag: RagGuard
        }.freeze

        class << self
          # Runs the requested guards over a response and aggregates the
          # per-guard results. Unknown guard names are skipped silently.
          #
          # @return [Hash] { passed: Boolean, guards: {name => result} }
          def guard_response(response:, context: nil, guards: [:rag], **)
            outcomes = guards.each_with_object({}) do |name, acc|
              handler = GUARD_REGISTRY[name.to_sym]
              next unless handler

              acc[name] = dispatch_guard(handler, name, response: response, context: context)
            end

            # A guard only fails the response when it explicitly reports
            # faithful: false; skipped/indeterminate results count as passing.
            all_ok = outcomes.values.none? { |outcome| outcome[:faithful] == false }

            { passed: all_ok, guards: outcomes }
          rescue StandardError => e
            Legion::Logging.warn "ResponseGuard error: #{e.message}" if Legion.const_defined?('Logging')
            { passed: true, guards: {} }
          end

          private

          # Invokes one guard. The RAG guard is skipped (treated as passing)
          # when no retrieval context is supplied; other guards are expected
          # to expose a generic #check entry point.
          def dispatch_guard(guard_mod, guard_name, response:, context:)
            if guard_name.to_sym == :rag
              return { faithful: true, reason: :no_context } if context.nil?

              guard_mod.check_rag_faithfulness(response: response, context: context)
            else
              guard_mod.check(response: response, context: context)
            end
          end
        end
      end
    end
  end
end
|
data/lib/legion/llm/scheduling.rb
CHANGED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    # Defers eligible, non-urgent LLM work out of a configured peak-hours
    # window (evaluated in UTC). Opt-in via llm.scheduling.enabled.
    module Scheduling
      # Default peak hours in UTC, inclusive of both endpoints
      # (14:00 through 22:59 UTC — roughly US business hours).
      DEFAULT_PEAK_RANGE = (14..22)

      # Intents that are eligible for deferral during peak hours.
      DEFAULT_DEFER_INTENTS = %i[batch background maintenance].freeze

      class << self
        # Returns true when off-peak scheduling is enabled in settings.
        def enabled?
          settings.fetch(:enabled, false) == true
        end

        # Determines whether a request should be deferred to off-peak hours.
        #
        # @param intent [Symbol, String] the request intent
        # @param urgency [Symbol] :immediate bypasses deferral regardless of settings
        # @return [Boolean]
        def should_defer?(intent: :normal, urgency: :normal)
          return false unless enabled?
          return false if urgency.to_sym == :immediate

          eligible_for_deferral?(intent.to_sym) && peak_hours?
        end

        # Returns true if the current UTC hour falls within the configured peak window.
        def peak_hours?
          hour = Time.now.utc.hour
          peak_range.cover?(hour)
        end

        # Returns the next off-peak time as a Time object (UTC).
        # Off-peak begins at the hour after the peak window ends, wrapping
        # past midnight when the window ends at hour 23. The result is
        # capped at max_defer_hours from now.
        #
        # @return [Time] next off-peak start time
        def next_off_peak
          now = Time.now.utc
          max_defer = settings.fetch(:max_defer_hours, 8)

          next_time = if now.hour < peak_range.first
                        # Before peak — off-peak is now
                        now
                      else
                        # During or after peak — next off-peak is the hour after
                        # peak ends. Wrap 23 -> 0 instead of passing hour 24 to
                        # Time.utc, which raises ArgumentError.
                        off_peak_hour = (peak_range.last + 1) % 24
                        candidate = Time.utc(now.year, now.month, now.day, off_peak_hour, 0, 0)
                        candidate += 86_400 if candidate <= now
                        candidate
                      end

          # Cap at max_defer_hours from now
          cap = now + (max_defer * 3600)
          [next_time, cap].min
        end

        private

        # Fetches the llm.scheduling settings subtree with symbolized keys;
        # returns {} when settings are absent, malformed, or raise.
        def settings
          llm = Legion::Settings[:llm]
          return {} unless llm.is_a?(Hash)

          s = llm[:scheduling] || llm['scheduling'] || {}
          s.is_a?(Hash) ? s.transform_keys(&:to_sym) : {}
        rescue StandardError
          {}
        end

        # Parses llm.scheduling.peak_hours_utc ("START-END") into an inclusive
        # hour range; falls back to DEFAULT_PEAK_RANGE on any malformed value.
        def peak_range
          raw = settings[:peak_hours_utc]
          return DEFAULT_PEAK_RANGE unless raw.is_a?(String) && raw.include?('-')

          parts = raw.split('-')
          return DEFAULT_PEAK_RANGE unless parts.size == 2

          start_h = Integer(parts[0], 10)
          end_h = Integer(parts[1], 10)
          (start_h..end_h)
        rescue ArgumentError
          DEFAULT_PEAK_RANGE
        end

        # Deferral-eligible intents from settings, defaulting to
        # DEFAULT_DEFER_INTENTS; entries are normalized to symbols.
        def defer_intents
          raw = settings[:defer_intents]
          return DEFAULT_DEFER_INTENTS unless raw.is_a?(Array)

          raw.map { |i| i.to_s.to_sym }
        end

        def eligible_for_deferral?(intent)
          defer_intents.include?(intent)
        end
      end
    end
  end
end
|
data/lib/legion/llm/settings.rb
CHANGED
|
@@ -15,7 +15,10 @@ module Legion
|
|
|
15
15
|
discovery: discovery_defaults,
|
|
16
16
|
gateway: gateway_defaults,
|
|
17
17
|
daemon: daemon_defaults,
|
|
18
|
-
prompt_caching: prompt_caching_defaults
|
|
18
|
+
prompt_caching: prompt_caching_defaults,
|
|
19
|
+
arbitrage: arbitrage_defaults,
|
|
20
|
+
batch: batch_defaults,
|
|
21
|
+
scheduling: scheduling_defaults
|
|
19
22
|
}
|
|
20
23
|
end
|
|
21
24
|
|
|
@@ -81,6 +84,34 @@ module Legion
|
|
|
81
84
|
}
|
|
82
85
|
end
|
|
83
86
|
|
|
87
|
+
# Default settings for Legion::LLM::Arbitrage (cost-aware model selection).
# Disabled by default (opt-in). cost_table entries here override
# Arbitrage::DEFAULT_COST_TABLE; quality_floor feeds eligibility filtering.
# NOTE(review): prefer_cheapest and cost_table_refresh are not read by the
# Arbitrage module visible in this release — presumably reserved; confirm.
def self.arbitrage_defaults
  {
    enabled: false,
    prefer_cheapest: true,
    quality_floor: 0.7,
    cost_table_refresh: 86_400,
    cost_table: {}
  }
end
|
|
96
|
+
|
|
97
|
+
# Default settings for Legion::LLM::Batch (request batching).
# Disabled by default (opt-in). window_seconds and max_batch_size are the
# fallbacks used by Batch.flush when no explicit arguments are given.
# NOTE(review): eligible_intents is not read by the Batch module visible in
# this release — presumably consumed by dispatch callers; confirm.
def self.batch_defaults
  {
    enabled: false,
    window_seconds: 300,
    max_batch_size: 100,
    eligible_intents: %w[batch background low_priority]
  }
end
|
|
105
|
+
|
|
106
|
+
# Default settings for Legion::LLM::Scheduling (off-peak deferral).
# Disabled by default (opt-in). peak_hours_utc is an inclusive "START-END"
# hour range in UTC; defer_intents lists intents eligible for deferral;
# max_defer_hours caps how far next_off_peak may push work into the future.
def self.scheduling_defaults
  {
    enabled: false,
    peak_hours_utc: '14-22',
    defer_intents: %w[batch background],
    max_defer_hours: 8
  }
end
|
|
114
|
+
|
|
84
115
|
def self.providers
|
|
85
116
|
{
|
|
86
117
|
bedrock: {
|
data/lib/legion/llm/version.rb
CHANGED
data/lib/legion/llm.rb
CHANGED
|
@@ -12,6 +12,9 @@ require 'legion/llm/hooks'
|
|
|
12
12
|
require 'legion/llm/cache'
|
|
13
13
|
require_relative 'llm/response_cache'
|
|
14
14
|
require_relative 'llm/daemon_client'
|
|
15
|
+
require_relative 'llm/arbitrage'
|
|
16
|
+
require_relative 'llm/batch'
|
|
17
|
+
require_relative 'llm/scheduling'
|
|
15
18
|
|
|
16
19
|
begin
|
|
17
20
|
require 'legion/extensions/llm/gateway'
|
|
@@ -184,7 +187,7 @@ module Legion
|
|
|
184
187
|
|
|
185
188
|
private
|
|
186
189
|
|
|
187
|
-
def _dispatch_chat(model:, provider:, intent:, tier:, escalate:, max_escalations:, quality_check:, message:, **)
|
|
190
|
+
def _dispatch_chat(model:, provider:, intent:, tier:, escalate:, max_escalations:, quality_check:, message:, **kwargs)
|
|
188
191
|
messages = message.is_a?(Array) ? message : [{ role: 'user', content: message.to_s }]
|
|
189
192
|
resolved_model = model || settings[:default_model]
|
|
190
193
|
|
|
@@ -196,11 +199,11 @@ module Legion
|
|
|
196
199
|
result = if gateway_loaded? && message
|
|
197
200
|
gateway_chat(model: model, provider: provider, intent: intent,
|
|
198
201
|
tier: tier, message: message, escalate: escalate,
|
|
199
|
-
max_escalations: max_escalations, quality_check: quality_check, **)
|
|
202
|
+
max_escalations: max_escalations, quality_check: quality_check, **kwargs)
|
|
200
203
|
else
|
|
201
204
|
chat_direct(model: model, provider: provider, intent: intent, tier: tier,
|
|
202
205
|
escalate: escalate, max_escalations: max_escalations,
|
|
203
|
-
quality_check: quality_check, message: message, **)
|
|
206
|
+
quality_check: quality_check, message: message, **kwargs)
|
|
204
207
|
end
|
|
205
208
|
|
|
206
209
|
if defined?(Legion::LLM::Hooks)
|
|
@@ -208,6 +211,8 @@ module Legion
|
|
|
208
211
|
return blocked[:response] if blocked
|
|
209
212
|
end
|
|
210
213
|
|
|
214
|
+
result = apply_response_guards(result, kwargs) if response_guards_enabled? && result.is_a?(Hash)
|
|
215
|
+
|
|
211
216
|
result
|
|
212
217
|
end
|
|
213
218
|
|
|
@@ -370,6 +375,24 @@ module Legion
|
|
|
370
375
|
nil
|
|
371
376
|
end
|
|
372
377
|
|
|
378
|
+
# True only when llm.response_guards.enabled is explicitly set to true;
# a missing key or any other value counts as disabled.
def response_guards_enabled?
  settings.dig(:response_guards, :enabled) == true
end
|
|
381
|
+
|
|
382
|
+
# Runs the configured post-generation guards over a chat result and attaches
# their verdict as :_guard_result metadata. Never blocks: a failed guard is
# only logged (when Legion::Logging is loaded), and any error during guard
# evaluation returns the original result untouched.
def apply_response_guards(result, kwargs)
  verdict = Hooks::ResponseGuard.guard_response(
    response: result[:response] || result[:content],
    context: kwargs[:context]
  )

  if !verdict[:passed] && Legion.const_defined?('Logging')
    Legion::Logging.warn "Response guard failed: #{verdict.inspect}"
  end

  result.merge(_guard_result: verdict)
rescue StandardError
  result
end
|
|
395
|
+
|
|
373
396
|
def cacheable?(cache_opt, temperature, message)
|
|
374
397
|
cache_opt != false && temperature.to_f.zero? && message && Cache.enabled?
|
|
375
398
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: legion-llm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.12
|
|
4
|
+
version: 0.3.14
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Esity
|
|
@@ -130,6 +130,8 @@ files:
|
|
|
130
130
|
- docs/plans/2026-03-15-ollama-discovery-implementation.md
|
|
131
131
|
- legion-llm.gemspec
|
|
132
132
|
- lib/legion/llm.rb
|
|
133
|
+
- lib/legion/llm/arbitrage.rb
|
|
134
|
+
- lib/legion/llm/batch.rb
|
|
133
135
|
- lib/legion/llm/bedrock_bearer_auth.rb
|
|
134
136
|
- lib/legion/llm/cache.rb
|
|
135
137
|
- lib/legion/llm/claude_config_loader.rb
|
|
@@ -141,6 +143,8 @@ files:
|
|
|
141
143
|
- lib/legion/llm/escalation_history.rb
|
|
142
144
|
- lib/legion/llm/helpers/llm.rb
|
|
143
145
|
- lib/legion/llm/hooks.rb
|
|
146
|
+
- lib/legion/llm/hooks/rag_guard.rb
|
|
147
|
+
- lib/legion/llm/hooks/response_guard.rb
|
|
144
148
|
- lib/legion/llm/providers.rb
|
|
145
149
|
- lib/legion/llm/quality_checker.rb
|
|
146
150
|
- lib/legion/llm/response_cache.rb
|
|
@@ -150,6 +154,7 @@ files:
|
|
|
150
154
|
- lib/legion/llm/router/health_tracker.rb
|
|
151
155
|
- lib/legion/llm/router/resolution.rb
|
|
152
156
|
- lib/legion/llm/router/rule.rb
|
|
157
|
+
- lib/legion/llm/scheduling.rb
|
|
153
158
|
- lib/legion/llm/settings.rb
|
|
154
159
|
- lib/legion/llm/shadow_eval.rb
|
|
155
160
|
- lib/legion/llm/structured_output.rb
|