lex-reward 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/legion/extensions/reward/client.rb +22 -0
- data/lib/legion/extensions/reward/helpers/constants.rb +63 -0
- data/lib/legion/extensions/reward/helpers/reward_signal.rb +174 -0
- data/lib/legion/extensions/reward/helpers/reward_store.rb +138 -0
- data/lib/legion/extensions/reward/runners/reward.rb +88 -0
- data/lib/legion/extensions/reward/version.rb +9 -0
- data/lib/legion/extensions/reward.rb +16 -0
- metadata +63 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 3289d5de024c3ca01934e873312a89a06e0ce8585c731ce66d484f843010fc59
+  data.tar.gz: 7bf0e6577cafb46e3740d9ec0d3e142111720c3fd5d84569c3f2ffb6969284e6
+SHA512:
+  metadata.gz: 71327dd3001af4e5bc4a417f69ebf1e62bfafb454234570b34e5bce982db454816eb1e3fbecb77065914f8b9306ea27575940feeaab80aac98d85835c36dafed
+  data.tar.gz: 9f4d6bf4b3a7b278b19383e50ee902a9ba23c26dfa718575d99d0775b45cbfba4bc66fbf099decb671113f8b00ae029c54566f4f63cd7cd9ce3aee1ab00677e5
data/lib/legion/extensions/reward/client.rb
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+require 'legion/extensions/reward/helpers/constants'
+require 'legion/extensions/reward/helpers/reward_signal'
+require 'legion/extensions/reward/helpers/reward_store'
+require 'legion/extensions/reward/runners/reward'
+
+module Legion
+  module Extensions
+    module Reward
+      class Client
+        include Runners::Reward
+
+        attr_reader :reward_store
+
+        def initialize(reward_store: nil, **)
+          @reward_store = reward_store || Helpers::RewardStore.new
+        end
+      end
+    end
+  end
+end
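The client is deliberately thin: it mixes in the runner methods and owns the store they operate on. A minimal construction sketch, assuming the gem is on the load path; injecting a pre-built store is optional and shown only to illustrate the hook:

require 'legion/extensions/reward/client'

# Default construction builds a fresh Helpers::RewardStore.
client = Legion::Extensions::Reward::Client.new
client.reward_store  # => #<Legion::Extensions::Reward::Helpers::RewardStore ...>

# A store shared with other components can be injected instead.
shared = Legion::Extensions::Reward::Helpers::RewardStore.new
client = Legion::Extensions::Reward::Client.new(reward_store: shared)
client.reward_store.equal?(shared)  # => true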
data/lib/legion/extensions/reward/helpers/constants.rb
ADDED
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module Legion
+  module Extensions
+    module Reward
+      module Helpers
+        module Constants
+          # Reward sources with weights (sum to 1.0)
+          # Each source contributes independently to the composite reward signal
+          REWARD_SOURCES = {
+            prediction_accuracy: { weight: 0.20, description: 'Correct predictions reinforced' },
+            curiosity_resolved: { weight: 0.15, description: 'Wonder resolution satisfaction' },
+            goal_achieved: { weight: 0.20, description: 'Intention completion reward' },
+            social_approval: { weight: 0.10, description: 'Trust increase from peers' },
+            flow_state: { weight: 0.10, description: 'Intrinsic flow motivation' },
+            error_avoidance: { weight: 0.10, description: 'Low error rate maintenance' },
+            novelty_encounter: { weight: 0.10, description: 'Novel experience exploration' },
+            homeostatic_balance: { weight: 0.05, description: 'System stability maintenance' }
+          }.freeze
+
+          # EMA alpha for running reward average
+          REWARD_ALPHA = 0.15
+
+          # EMA alpha for reward prediction (expected reward baseline)
+          PREDICTION_ALPHA = 0.1
+
+          # Minimum RPE magnitude to trigger learning signal
+          RPE_THRESHOLD = 0.05
+
+          # Reward signal range
+          REWARD_RANGE = { min: -1.0, max: 1.0 }.freeze
+
+          # RPE classification thresholds
+          RPE_LEVELS = {
+            large_positive: 0.3, # "Way better than expected!" — strong reinforcement
+            positive: 0.1, # "Better than expected" — moderate reinforcement
+            neutral: 0.05, # "About as expected" — maintenance
+            negative: -0.1, # "Worse than expected" — mild suppression
+            large_negative: -0.3 # "Way worse than expected!" — strong suppression
+          }.freeze
+
+          # Temporal discount factor (per tick, for weighted history)
+          TEMPORAL_DISCOUNT = 0.95
+
+          # History cap
+          MAX_REWARD_HISTORY = 200
+
+          # Domain-specific reward history cap
+          MAX_DOMAIN_HISTORY = 50
+
+          # Anhedonia threshold — running average below this triggers concern
+          ANHEDONIA_THRESHOLD = -0.3
+
+          # Euphoria threshold — running average above this triggers concern
+          EUPHORIA_THRESHOLD = 0.7
+
+          # Reward momentum (how much prior reward influences next prediction)
+          MOMENTUM_WINDOW = 10
+        end
+      end
+    end
+  end
+end
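The weights form a convex combination (they sum to 1.0), so a composite built from source signals in [-1.0, 1.0] already lands in that range before clamping. A quick sketch of the arithmetic, the same computation RewardSignal#weighted_sum performs below; the sample signal values are invented:

require 'legion/extensions/reward/helpers/constants'

SOURCES = Legion::Extensions::Reward::Helpers::Constants::REWARD_SOURCES
SOURCES.values.sum { |cfg| cfg[:weight] }.round(2)  # => 1.0

# Sources absent from a tick contribute 0.0, so a goal completion (1.0)
# plus a prediction-accuracy signal of 0.6 yields:
signals = { goal_achieved: 1.0, prediction_accuracy: 0.6 }
composite = SOURCES.sum { |source, cfg| (signals[source] || 0.0) * cfg[:weight] }
composite.round(4)  # => 0.32 (1.0 * 0.20 + 0.6 * 0.20)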
data/lib/legion/extensions/reward/helpers/reward_signal.rb
ADDED
@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+
+module Legion
+  module Extensions
+    module Reward
+      module Helpers
+        class RewardSignal
+          attr_reader :running_average, :predicted_reward, :last_rpe,
+                      :history, :domain_history, :tick_count
+
+          def initialize
+            @running_average = 0.0
+            @predicted_reward = 0.0
+            @last_rpe = 0.0
+            @history = []
+            @domain_history = {}
+            @tick_count = 0
+          end
+
+          def compute(source_signals)
+            @tick_count += 1
+            raw_reward = weighted_sum(source_signals)
+            reward = raw_reward.clamp(Constants::REWARD_RANGE[:min], Constants::REWARD_RANGE[:max])
+
+            @last_rpe = reward - @predicted_reward
+            @running_average = ema(@running_average, reward, Constants::REWARD_ALPHA)
+            @predicted_reward = ema(@predicted_reward, reward, Constants::PREDICTION_ALPHA)
+
+            record(reward, source_signals)
+
+            {
+              reward: reward.round(4),
+              rpe: @last_rpe.round(4),
+              rpe_class: classify_rpe(@last_rpe),
+              running_average: @running_average.round(4),
+              predicted_reward: @predicted_reward.round(4),
+              sources: source_signals,
+              learning_signal: learning_signal?
+            }
+          end
+
+          def record_domain_reward(domain, reward)
+            @domain_history[domain] ||= []
+            @domain_history[domain] << { reward: reward, at: Time.now.utc }
+            @domain_history[domain].shift while @domain_history[domain].size > Constants::MAX_DOMAIN_HISTORY
+          end
+
+          def domain_average(domain)
+            entries = @domain_history[domain]
+            return 0.0 if entries.nil? || entries.empty?
+
+            entries.sum { |e| e[:reward] } / entries.size.to_f
+          end
+
+          def domain_trend(domain)
+            entries = @domain_history[domain]
+            return :no_data if entries.nil? || entries.size < 5
+
+            recent = entries.last(10)
+            values = recent.map { |e| e[:reward] }
+            first_half = values[0...(values.size / 2)]
+            second_half = values[(values.size / 2)..]
+            diff = mean(second_half) - mean(first_half)
+
+            if diff > 0.05
+              :improving
+            elsif diff < -0.05
+              :declining
+            else
+              :stable
+            end
+          end
+
+          def anhedonic?
+            @running_average < Constants::ANHEDONIA_THRESHOLD
+          end
+
+          def euphoric?
+            @running_average > Constants::EUPHORIA_THRESHOLD
+          end
+
+          def learning_signal?
+            @last_rpe.abs >= Constants::RPE_THRESHOLD
+          end
+
+          def recent_rewards(limit = 20)
+            @history.last(limit)
+          end
+
+          def discounted_return(window = nil)
+            entries = window ? @history.last(window) : @history
+            return 0.0 if entries.empty?
+
+            total = 0.0
+            entries.reverse_each.with_index do |entry, idx|
+              total += entry[:reward] * (Constants::TEMPORAL_DISCOUNT**idx)
+            end
+            total
+          end
+
+          def reward_volatility
+            return 0.0 if @history.size < 3
+
+            recent = @history.last(Constants::MOMENTUM_WINDOW).map { |h| h[:reward] }
+            avg = mean(recent)
+            variance = recent.sum { |r| (r - avg)**2 } / recent.size.to_f
+            Math.sqrt(variance)
+          end
+
+          def to_h
+            {
+              running_average: @running_average.round(4),
+              predicted_reward: @predicted_reward.round(4),
+              last_rpe: @last_rpe.round(4),
+              rpe_class: classify_rpe(@last_rpe),
+              tick_count: @tick_count,
+              learning_signal: learning_signal?,
+              anhedonic: anhedonic?,
+              euphoric: euphoric?,
+              volatility: reward_volatility.round(4),
+              domains_tracked: @domain_history.keys.size,
+              history_size: @history.size
+            }
+          end
+
+          private
+
+          def weighted_sum(source_signals)
+            total = 0.0
+            Constants::REWARD_SOURCES.each do |source, config|
+              value = source_signals[source] || 0.0
+              total += value * config[:weight]
+            end
+            total
+          end
+
+          def classify_rpe(rpe)
+            if rpe >= Constants::RPE_LEVELS[:large_positive]
+              :large_positive
+            elsif rpe >= Constants::RPE_LEVELS[:positive]
+              :positive
+            elsif rpe >= -Constants::RPE_LEVELS[:neutral]
+              :neutral
+            elsif rpe >= Constants::RPE_LEVELS[:large_negative]
+              :negative
+            else
+              :large_negative
+            end
+          end
+
+          def ema(current, observed, alpha)
+            (current * (1.0 - alpha)) + (observed * alpha)
+          end
+
+          def mean(values)
+            return 0.0 if values.empty?
+
+            values.sum / values.size.to_f
+          end
+
+          def record(reward, sources)
+            @history << {
+              reward: reward,
+              rpe: @last_rpe,
+              sources: sources,
+              at: Time.now.utc
+            }
+            @history.shift while @history.size > Constants::MAX_REWARD_HISTORY
+          end
+        end
+      end
+    end
+  end
+end
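RewardSignal is the dopaminergic core: the reward prediction error (RPE) is the gap between what a tick produced and what the EMA baseline expected, and repeating the same outcome drives that gap toward zero. A habituation sketch, assuming constants.rb and reward_signal.rb are both loaded:

signal = Legion::Extensions::Reward::Helpers::RewardSignal.new

first = signal.compute({ goal_achieved: 1.0, flow_state: 0.5 })
first[:reward]     # => 0.25 (1.0 * 0.20 + 0.5 * 0.10)
first[:rpe]        # => 0.25 (the prediction baseline starts at 0.0)
first[:rpe_class]  # => :positive

# Same outcome, repeated: the prediction EMA (alpha 0.1) closes the gap,
# so each identical tick surprises less than the last.
5.times { signal.compute({ goal_achieved: 1.0, flow_state: 0.5 }) }
signal.last_rpe.round(4)  # => 0.1476, i.e. 0.25 * 0.9**5
signal.learning_signal?   # => true, still above the 0.05 RPE threshold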
data/lib/legion/extensions/reward/helpers/reward_store.rb
ADDED
@@ -0,0 +1,138 @@
+# frozen_string_literal: true
+
+module Legion
+  module Extensions
+    module Reward
+      module Helpers
+        class RewardStore
+          attr_reader :signal
+
+          def initialize(signal: nil)
+            @signal = signal || RewardSignal.new
+          end
+
+          def process_tick(tick_results)
+            source_signals = extract_signals(tick_results)
+            result = @signal.compute(source_signals)
+
+            domain = extract_domain(tick_results)
+            @signal.record_domain_reward(domain, result[:reward]) if domain
+
+            result
+          end
+
+          def domain_report(domain)
+            {
+              domain: domain,
+              average: @signal.domain_average(domain),
+              trend: @signal.domain_trend(domain),
+              history: @signal.domain_history[domain]&.last(10) || []
+            }
+          end
+
+          def all_domain_averages
+            @signal.domain_history.keys.to_h do |domain|
+              [domain, @signal.domain_average(domain).round(4)]
+            end
+          end
+
+          def health_assessment
+            avg = @signal.running_average
+            vol = @signal.reward_volatility
+
+            if @signal.anhedonic?
+              { status: :anhedonic, description: 'Persistently low reward — possible disengagement', severity: :high }
+            elsif @signal.euphoric?
+              { status: :euphoric, description: 'Persistently high reward — possible overconfidence', severity: :moderate }
+            elsif vol > 0.4
+              { status: :volatile, description: 'Highly variable reward — unstable learning signals', severity: :moderate }
+            elsif avg.between?(-0.1, 0.1)
+              { status: :neutral, description: 'Low reward signal — minimal learning happening', severity: :low }
+            else
+              { status: :healthy, description: 'Balanced reward signal — healthy learning', severity: :none }
+            end
+          end
+
+          private
+
+          def extract_signals(tick_results)
+            {
+              prediction_accuracy: extract_prediction_reward(tick_results),
+              curiosity_resolved: extract_curiosity_reward(tick_results),
+              goal_achieved: extract_goal_reward(tick_results),
+              social_approval: extract_social_reward(tick_results),
+              flow_state: extract_flow_reward(tick_results),
+              error_avoidance: extract_error_reward(tick_results),
+              novelty_encounter: extract_novelty_reward(tick_results),
+              homeostatic_balance: extract_homeostatic_reward(tick_results)
+            }
+          end
+
+          def extract_prediction_reward(tick_results)
+            accuracy = tick_results.dig(:prediction_engine, :rolling_accuracy)
+            return 0.0 unless accuracy
+
+            (accuracy - 0.5) * 2.0
+          end
+
+          def extract_curiosity_reward(tick_results)
+            resolved = tick_results.dig(:curiosity, :resolved_count) || 0
+            intensity = tick_results.dig(:curiosity, :intensity) || 0.0
+
+            resolved_signal = [resolved * 0.3, 1.0].min
+            (resolved_signal + (intensity * 0.2)).clamp(-1.0, 1.0)
+          end
+
+          def extract_goal_reward(tick_results)
+            completed = tick_results.dig(:volition, :completed_count) || 0
+            failed = tick_results.dig(:volition, :failed_count) || 0
+
+            ((completed * 0.4) - (failed * 0.3)).clamp(-1.0, 1.0)
+          end
+
+          def extract_social_reward(tick_results)
+            trust_delta = tick_results.dig(:trust, :composite_delta) || 0.0
+            (trust_delta * 2.0).clamp(-1.0, 1.0)
+          end
+
+          def extract_flow_reward(tick_results)
+            in_flow = tick_results.dig(:flow, :in_flow)
+            score = tick_results.dig(:flow, :score) || 0.0
+
+            return score * 0.8 if in_flow
+
+            -0.1
+          end
+
+          def extract_error_reward(tick_results)
+            error_rate = tick_results.dig(:prediction_engine, :error_rate)
+            return 0.0 unless error_rate
+
+            (1.0 - (error_rate * 2.0)).clamp(-1.0, 1.0)
+          end
+
+          def extract_novelty_reward(tick_results)
+            novelty = tick_results.dig(:attention, :novelty_score) || 0.0
+            spotlight_count = tick_results.dig(:attention, :spotlight_count) || 0
+
+            ((novelty * 0.5) + [spotlight_count * 0.1, 0.5].min).clamp(-1.0, 1.0)
+          end
+
+          def extract_homeostatic_reward(tick_results)
+            deviation = tick_results.dig(:homeostasis, :worst_deviation) || 0.0
+            allostatic = tick_results.dig(:homeostasis, :allostatic_load) || 0.0
+
+            stability = 1.0 - [deviation, allostatic].max
+            (stability - 0.5).clamp(-1.0, 1.0)
+          end
+
+          def extract_domain(tick_results)
+            tick_results.dig(:volition, :current_domain) ||
+              tick_results.dig(:curiosity, :active_domain) ||
+              tick_results.dig(:attention, :focus_domain)
+          end
+        end
+      end
+    end
+  end
+end
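RewardStore adapts a raw tick-results hash into the eight named source signals, computes the composite, and files it under whichever domain the tick concerned. A single-tick sketch; the nested keys mirror what the extract_* helpers dig for, and the numbers are invented upstream values:

store = Legion::Extensions::Reward::Helpers::RewardStore.new

result = store.process_tick({
  prediction_engine: { rolling_accuracy: 0.75, error_rate: 0.2 },
  volition: { completed_count: 1, failed_count: 0, current_domain: :planning },
  flow: { in_flow: true, score: 0.6 }
})
result[:reward]     # composite scalar, clamped to [-1.0, 1.0]
result[:rpe_class]  # e.g. :large_positive on a cold start like this one

store.domain_report(:planning)  # average, trend and history for that domain
store.health_assessment         # => { status: :neutral, ... } this early on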
data/lib/legion/extensions/reward/runners/reward.rb
ADDED
@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+
+module Legion
+  module Extensions
+    module Reward
+      module Runners
+        module Reward
+          include Legion::Extensions::Helpers::Lex if Legion::Extensions.const_defined?(:Helpers) &&
+                                                      Legion::Extensions::Helpers.const_defined?(:Lex)
+
+          def compute_reward(tick_results: {}, **)
+            result = reward_store.process_tick(tick_results)
+
+            Legion::Logging.debug "[reward] reward=#{result[:reward]} rpe=#{result[:rpe]} " \
+                                  "class=#{result[:rpe_class]} learning=#{result[:learning_signal]}"
+
+            result
+          end
+
+          def reward_status(**)
+            sig = reward_store.signal
+            health = reward_store.health_assessment
+
+            Legion::Logging.debug "[reward] status: avg=#{sig.running_average.round(3)} " \
+                                  "predicted=#{sig.predicted_reward.round(3)} health=#{health[:status]}"
+
+            sig.to_h.merge(health: health)
+          end
+
+          def reward_for(domain:, **)
+            report = reward_store.domain_report(domain)
+            Legion::Logging.debug "[reward] domain=#{domain} avg=#{report[:average].round(3)} trend=#{report[:trend]}"
+            report
+          end
+
+          def reward_history(limit: 20, **)
+            recent = reward_store.signal.recent_rewards(limit)
+            Legion::Logging.debug "[reward] history: #{recent.size} entries"
+
+            {
+              history: recent,
+              total: reward_store.signal.history.size,
+              discounted_return: reward_store.signal.discounted_return(limit).round(4)
+            }
+          end
+
+          def domain_rewards(**)
+            averages = reward_store.all_domain_averages
+            Legion::Logging.debug "[reward] domains: #{averages.size} tracked"
+
+            {
+              domains: averages,
+              domain_count: averages.size,
+              best_domain: averages.max_by { |_, v| v }&.first,
+              worst_domain: averages.min_by { |_, v| v }&.first
+            }
+          end
+
+          def reward_stats(**)
+            sig = reward_store.signal
+            health = reward_store.health_assessment
+
+            Legion::Logging.debug '[reward] stats'
+
+            {
+              running_average: sig.running_average.round(4),
+              predicted_reward: sig.predicted_reward.round(4),
+              volatility: sig.reward_volatility.round(4),
+              tick_count: sig.tick_count,
+              health: health,
+              domains_tracked: sig.domain_history.keys.size,
+              history_size: sig.history.size,
+              discounted_return: sig.discounted_return.round(4),
+              anhedonic: sig.anhedonic?,
+              euphoric: sig.euphoric?
+            }
+          end
+
+          private
+
+          def reward_store
+            @reward_store ||= Helpers::RewardStore.new
+          end
+        end
+      end
+    end
+  end
+end
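These runner methods are the surface a Legion node invokes; the one external dependency is Legion::Logging, normally supplied by the Legion framework. A standalone sketch with a throwaway stand-in for it, assuming the client has been required as in the earlier example:

# Stand-in for this sketch only; a real Legion node provides Legion::Logging.
unless defined?(Legion::Logging)
  module Legion
    module Logging
      def self.debug(message) = nil
    end
  end
end

client = Legion::Extensions::Reward::Client.new
client.compute_reward(tick_results: { flow: { in_flow: true, score: 0.5 } })
client.reward_stats                  # running average, volatility, health, ...
client.reward_history(limit: 10)     # recent entries plus discounted return
client.reward_for(domain: :general)  # per-domain average and trend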
data/lib/legion/extensions/reward.rb
ADDED
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+require 'legion/extensions/reward/version'
+require 'legion/extensions/reward/helpers/constants'
+require 'legion/extensions/reward/helpers/reward_signal'
+require 'legion/extensions/reward/helpers/reward_store'
+require 'legion/extensions/reward/runners/reward'
+require 'legion/extensions/reward/client'
+
+module Legion
+  module Extensions
+    module Reward
+      extend Legion::Extensions::Core if Legion::Extensions.const_defined?(:Core)
+    end
+  end
+end
metadata
ADDED
@@ -0,0 +1,63 @@
+--- !ruby/object:Gem::Specification
+name: lex-reward
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Matthew Iverson
+bindir: bin
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: legion-gaia
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Computes internal reward signals from cognitive outcomes, tracks reward
+  prediction error, and drives reinforcement learning
+email:
+- matt@legionIO.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/legion/extensions/reward.rb
+- lib/legion/extensions/reward/client.rb
+- lib/legion/extensions/reward/helpers/constants.rb
+- lib/legion/extensions/reward/helpers/reward_signal.rb
+- lib/legion/extensions/reward/helpers/reward_store.rb
+- lib/legion/extensions/reward/runners/reward.rb
+- lib/legion/extensions/reward/version.rb
+homepage: https://github.com/LegionIO/lex-reward
+licenses:
+- MIT
+metadata:
+  rubygems_mfa_required: 'true'
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '3.4'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.9
+specification_version: 4
+summary: Dopaminergic reward signal engine for LegionIO cognitive agents
+test_files: []