activematrix 0.0.5 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +96 -28
- data/app/jobs/active_matrix/application_job.rb +11 -0
- data/app/models/active_matrix/agent/jobs/memory_reaper.rb +87 -0
- data/app/models/active_matrix/agent.rb +166 -0
- data/app/models/active_matrix/agent_store.rb +80 -0
- data/app/models/active_matrix/application_record.rb +15 -0
- data/app/models/active_matrix/chat_session.rb +105 -0
- data/app/models/active_matrix/knowledge_base.rb +100 -0
- data/exe/activematrix +7 -0
- data/lib/active_matrix/agent_manager.rb +160 -121
- data/lib/active_matrix/agent_registry.rb +25 -21
- data/lib/active_matrix/api.rb +8 -2
- data/lib/active_matrix/async_query.rb +58 -0
- data/lib/active_matrix/bot/base.rb +3 -3
- data/lib/active_matrix/bot/builtin_commands.rb +188 -0
- data/lib/active_matrix/bot/command_parser.rb +175 -0
- data/lib/active_matrix/cli.rb +273 -0
- data/lib/active_matrix/client.rb +21 -6
- data/lib/active_matrix/client_pool.rb +38 -27
- data/lib/active_matrix/daemon/probe_server.rb +118 -0
- data/lib/active_matrix/daemon/signal_handler.rb +156 -0
- data/lib/active_matrix/daemon/worker.rb +109 -0
- data/lib/active_matrix/daemon.rb +236 -0
- data/lib/active_matrix/engine.rb +18 -0
- data/lib/active_matrix/errors.rb +1 -1
- data/lib/active_matrix/event_router.rb +61 -49
- data/lib/active_matrix/events.rb +1 -0
- data/lib/active_matrix/instrumentation.rb +148 -0
- data/lib/active_matrix/memory/agent_memory.rb +7 -21
- data/lib/active_matrix/memory/conversation_memory.rb +4 -20
- data/lib/active_matrix/memory/global_memory.rb +15 -30
- data/lib/active_matrix/message_dispatcher.rb +197 -0
- data/lib/active_matrix/metrics.rb +424 -0
- data/lib/active_matrix/presence_manager.rb +181 -0
- data/lib/active_matrix/railtie.rb +8 -0
- data/lib/active_matrix/telemetry.rb +134 -0
- data/lib/active_matrix/version.rb +1 -1
- data/lib/active_matrix.rb +18 -11
- data/lib/generators/active_matrix/install/install_generator.rb +3 -22
- data/lib/generators/active_matrix/install/templates/README +5 -2
- metadata +191 -31
- data/lib/generators/active_matrix/install/templates/agent_memory.rb +0 -47
- data/lib/generators/active_matrix/install/templates/conversation_context.rb +0 -72
- data/lib/generators/active_matrix/install/templates/create_agent_memories.rb +0 -17
- data/lib/generators/active_matrix/install/templates/create_conversation_contexts.rb +0 -21
- data/lib/generators/active_matrix/install/templates/create_global_memories.rb +0 -20
- data/lib/generators/active_matrix/install/templates/create_matrix_agents.rb +0 -26
- data/lib/generators/active_matrix/install/templates/global_memory.rb +0 -70
- data/lib/generators/active_matrix/install/templates/matrix_agent.rb +0 -127
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ActiveMatrix
|
|
4
|
+
# Dispatches Matrix messages with retry logic and typing indicators
|
|
5
|
+
#
|
|
6
|
+
# @example Basic usage
|
|
7
|
+
# dispatcher = ActiveMatrix::MessageDispatcher.new(api: api, room_id: '!abc:matrix.org', user_id: '@bot:matrix.org')
|
|
8
|
+
# dispatcher.send_text('Hello!')
|
|
9
|
+
#
|
|
10
|
+
# @example With typing indicator
|
|
11
|
+
# dispatcher.send_text('Thinking...', typing_delay: 2.0)
|
|
12
|
+
#
|
|
13
|
+
# @example Thread reply
|
|
14
|
+
# dispatcher.send_text('Reply', thread_id: '$event_id')
|
|
15
|
+
#
|
|
16
|
+
class MessageDispatcher
|
|
17
|
+
include Instrumentation
|
|
18
|
+
|
|
19
|
+
# Default configuration
|
|
20
|
+
DEFAULT_RETRY_COUNT = 3
|
|
21
|
+
DEFAULT_BASE_DELAY = 1.0
|
|
22
|
+
DEFAULT_TYPING_DELAY = 0.5
|
|
23
|
+
DEFAULT_TYPING_TIMEOUT = 30
|
|
24
|
+
|
|
25
|
+
attr_reader :api, :room_id, :user_id
|
|
26
|
+
|
|
27
|
+
# Build a dispatcher bound to one room.
#
# @param api [ActiveMatrix::Api] Matrix API instance used for every request
# @param room_id [String] ID of the room messages are sent to
# @param user_id [String] User ID passed along with typing-indicator calls
# @param retry_count [Integer] Maximum number of retries after a failed send
# @param base_delay [Float] Base delay in seconds for the exponential backoff
# @param typing_delay [Float] Typing delay in seconds used when a send call passes none
def initialize(api:, room_id:, user_id:, retry_count: DEFAULT_RETRY_COUNT,
               base_delay: DEFAULT_BASE_DELAY, typing_delay: DEFAULT_TYPING_DELAY)
  @api, @room_id, @user_id = api, room_id, user_id
  @retry_count, @base_delay = retry_count, base_delay
  @default_typing_delay = typing_delay
end
|
|
42
|
+
|
|
43
|
+
# Send a plain text message to the room.
#
# @param text [String] Message text, used as the event body
# @param msgtype [String] Matrix message type (default: 'm.text')
# @param typing_delay [Float, nil] Seconds to show the typing indicator before
#   sending; when nil the dispatcher-wide default is used by send_with_typing,
#   and a non-positive value skips the indicator
# @param thread_id [String, nil] Event ID of the thread to reply in
# @return [Object] Whatever send_with_typing returns (the API response)
def send_text(text, msgtype: 'm.text', typing_delay: nil, thread_id: nil)
  send_with_typing({ msgtype: msgtype, body: text }, typing_delay: typing_delay, thread_id: thread_id)
end
|
|
58
|
+
|
|
59
|
+
# Send an HTML-formatted message to the room.
#
# @param html [String] HTML content, sent as formatted_body
# @param body [String, nil] Plain text fallback; derived from html via strip_html when nil
# @param msgtype [String] Matrix message type (default: 'm.text')
# @param typing_delay [Float, nil] Seconds to show the typing indicator (nil uses the default)
# @param thread_id [String, nil] Event ID of the thread to reply in
# @return [Object] Whatever send_with_typing returns (the API response)
def send_html(html, body: nil, msgtype: 'm.text', typing_delay: nil, thread_id: nil)
  payload = { msgtype: msgtype, body: body || strip_html(html) }
  payload[:format] = 'org.matrix.custom.html'
  payload[:formatted_body] = html

  send_with_typing(payload, typing_delay: typing_delay, thread_id: thread_id)
end
|
|
79
|
+
|
|
80
|
+
# Send a notice: a text message with msgtype 'm.notice' (typically used for bot responses).
#
# @param text [String] Notice text
# @param typing_delay [Float, nil] Seconds to show the typing indicator (nil uses the default)
# @param thread_id [String, nil] Event ID of the thread to reply in
# @return [Object] Whatever send_text returns
def send_notice(text, typing_delay: nil, thread_id: nil)
  delivery = { typing_delay: typing_delay, thread_id: thread_id }
  send_text(text, msgtype: 'm.notice', **delivery)
end
|
|
89
|
+
|
|
90
|
+
# Send an HTML notice: an HTML message with msgtype 'm.notice'.
#
# @param html [String] HTML content
# @param body [String, nil] Plain text fallback (forwarded to send_html unchanged)
# @param typing_delay [Float, nil] Seconds to show the typing indicator (nil uses the default)
# @param thread_id [String, nil] Event ID of the thread to reply in
# @return [Object] Whatever send_html returns
def send_html_notice(html, body: nil, typing_delay: nil, thread_id: nil)
  delivery = { typing_delay: typing_delay, thread_id: thread_id }
  send_html(html, body: body, msgtype: 'm.notice', **delivery)
end
|
|
100
|
+
|
|
101
|
+
# Send an emote (/me action): a text message with msgtype 'm.emote'.
#
# @param text [String] Emote text
# @param typing_delay [Float, nil] Seconds to show the typing indicator (nil uses the default)
# @param thread_id [String, nil] Event ID of the thread to reply in
# @return [Object] Whatever send_text returns
def send_emote(text, typing_delay: nil, thread_id: nil)
  delivery = { typing_delay: typing_delay, thread_id: thread_id }
  send_text(text, msgtype: 'm.emote', **delivery)
end
|
|
110
|
+
|
|
111
|
+
# Show or hide the typing indicator for the configured user in the room.
#
# Best effort: any StandardError raised by the API call is logged at debug
# level and swallowed, so a typing failure never aborts the caller.
#
# @param typing [Boolean] true to show the indicator, false to hide it
# @param timeout [Integer] Timeout forwarded to the API call
# @return [Object] The API result, or the logger's return value after a failure
def set_typing(typing: true, timeout: DEFAULT_TYPING_TIMEOUT)
  options = { typing: typing, timeout: timeout }
  @api.set_typing(@room_id, @user_id, **options)
rescue StandardError => error
  ActiveMatrix.logger.debug("Failed to set typing indicator: #{error.message}")
end
|
|
120
|
+
|
|
121
|
+
private
|
|
122
|
+
|
|
123
|
+
# Identifier this dispatcher reports as its agent: the configured user ID.
# NOTE(review): presumably read by the Instrumentation mixin when tagging
# events — that module is not visible here, confirm.
#
# @return [String] The user ID given at construction
def agent_id
  @user_id
end
|
|
126
|
+
|
|
127
|
+
# Optionally show a typing indicator for a while, attach the thread relation
# and hand the event content to send_with_retry.
#
# @param content [Hash] Event content (msgtype, body, ...); not modified
# @param typing_delay [Float, nil] Seconds to show the typing indicator; nil
#   falls back to the dispatcher default, a non-positive value skips it
# @param thread_id [String, nil] Event ID of the thread to reply in
# @return [Object] Whatever send_with_retry returns
def send_with_typing(content, typing_delay:, thread_id:)
  effective_delay = typing_delay || @default_typing_delay

  if effective_delay.positive?
    set_typing(typing: true)
    begin
      sleep(effective_delay)
    ensure
      # Clear the indicator even when the wait is interrupted, otherwise it
      # lingers until the server-side typing timeout expires.
      set_typing(typing: false)
    end
  end

  # Build a new hash for the thread relation instead of mutating the caller's.
  outgoing =
    if thread_id
      content.merge('m.relates_to': { rel_type: 'm.thread', event_id: thread_id })
    else
      content
    end

  send_with_retry(outgoing)
end
|
|
147
|
+
|
|
148
|
+
# Send the event as an 'm.room.message', retrying retryable request errors.
#
# The whole retry loop runs inside one instrument_operation block, so the
# backoff sleeps happen within that instrumented block.
#
# @param content [Hash] Event content to send
# @return [Object] The API response of the first successful attempt
# @raise [ActiveMatrix::MatrixRequestError] when the error is not retryable
#   or the retry budget (@retry_count) is used up
def send_with_retry(content)
  failures = 0

  instrument_operation(:send_message, room_id: @room_id) do
    begin
      @api.send_message_event(@room_id, 'm.room.message', content)
    rescue ActiveMatrix::MatrixRequestError => e
      failures += 1
      # Give up (re-raise the current error) once out of retries or on a permanent error.
      raise unless failures <= @retry_count && retryable_error?(e)

      delay = calculate_backoff(failures)
      ActiveMatrix.logger.warn("Message send failed (attempt #{failures}/#{@retry_count}), retrying in #{delay}s: #{e.message}")
      sleep(delay)
      retry
    end
  end
end
|
|
166
|
+
|
|
167
|
+
# Decide whether a failed send is worth retrying.
#
# Rate-limit errors are always retryable; other Matrix request errors only
# when their HTTP status is 500 or above. Anything else is not retried.
#
# @param error [Exception] The error raised by the send attempt
# @return [Boolean]
def retryable_error?(error)
  return true if error.is_a?(ActiveMatrix::MatrixTooManyRequestsError)
  return false unless error.is_a?(ActiveMatrix::MatrixRequestError)

  error.httpstatus.to_i >= 500
end
|
|
178
|
+
|
|
179
|
+
# Exponential backoff with full jitter: a random delay between zero and
# base_delay * 2^(attempt - 1).
#
# @param attempt [Integer] 1-based number of the failed attempt
# @return [Float] Seconds to wait before the next attempt
def calculate_backoff(attempt)
  doublings = attempt - 1
  ceiling = @base_delay * (2**doublings)
  Kernel.rand * ceiling
end
|
|
184
|
+
|
|
185
|
+
# Produce a plain-text fallback from an HTML string.
#
# Line breaks (<br>) become newlines, all other tags are removed, and the
# entities &nbsp; &lt; &gt; &amp; &quot; are decoded. Entities are decoded
# in a single pass so already-escaped text such as "&amp;lt;" is decoded
# exactly once (to "&lt;").
#
# @param html [String] HTML content
# @return [String] Plain text with surrounding whitespace stripped
def strip_html(html)
  entities = {
    '&nbsp;' => ' ',
    '&lt;' => '<',
    '&gt;' => '>',
    '&amp;' => '&',
    '&quot;' => '"'
  }

  text = html.gsub(%r{<br\s*/?>}i, "\n")
  text = text.gsub(%r{</?[^>]+>}, '')
  text.gsub(/&(?:nbsp|lt|gt|amp|quot);/, entities).strip
end
|
|
196
|
+
end
|
|
197
|
+
end
|
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'concurrent'
|
|
4
|
+
require 'singleton'
|
|
5
|
+
|
|
6
|
+
module ActiveMatrix
|
|
7
|
+
# Metrics collection for Matrix agent operations
|
|
8
|
+
# Provides structured metrics that can be exported to monitoring systems
|
|
9
|
+
#
|
|
10
|
+
# @example Getting agent metrics
|
|
11
|
+
# metrics = ActiveMatrix::Metrics.instance.get_agent_metrics('agent_123')
|
|
12
|
+
# puts metrics[:overall_success_rate]
|
|
13
|
+
#
|
|
14
|
+
# @example Getting health summary
|
|
15
|
+
# summary = ActiveMatrix::Metrics.instance.get_health_summary
|
|
16
|
+
# puts "Healthy agents: #{summary[:healthy_agents]}"
|
|
17
|
+
#
|
|
18
|
+
class Metrics
|
|
19
|
+
include Singleton
|
|
20
|
+
|
|
21
|
+
# Create the two empty metric stores (per-operation and per-component) and
# register the notification subscriber that feeds them.
def initialize
  @metrics, @component_metrics = Array.new(2) { Concurrent::Hash.new }
  setup_notification_subscribers
end
|
|
26
|
+
|
|
27
|
+
# Record the outcome of a single operation.
#
# Updates the per-component aggregate and the per-operation entry (counters,
# duration statistics, error breakdown) and appends the call to the
# operation's window of recent operations, which is capped at 100 entries.
#
# @param operation [Symbol, String] Operation name
# @param component [String] Component name (e.g., 'MessageDispatcher')
# @param agent_id [String] Agent identifier
# @param status [String] 'success' or 'error'; other values only count towards totals
# @param duration_ms [Float] Operation duration in milliseconds
# @param error_class [String, nil] Error class name if status is 'error'
# @param metadata [Hash] Additional metadata; only :error_type, :error_class,
#   :user_id and :room_id are kept in the recent-operations window
# rubocop:disable Metrics/ParameterLists
def record_operation(operation, component:, agent_id:, status:, duration_ms:, error_class: nil, **metadata)
  component_key = "#{agent_id}:#{component}"
  operation_key = "#{component_key}:#{operation}"

  # Lazily create both entries on first use.
  @component_metrics[component_key] ||= initialize_component_metrics(component, agent_id)
  @metrics[operation_key] ||= initialize_operation_metrics(operation, component, agent_id)

  update_component_metrics(@component_metrics[component_key], status, duration_ms)

  entry = @metrics[operation_key]
  entry[:total_count].increment
  entry[:last_operation_at] = Time.current

  # Durations are tracked for failures as well as successes.
  update_duration_stats(entry[:duration_stats], duration_ms)

  if status == 'success'
    entry[:success_count].increment
  elsif status == 'error'
    entry[:error_count].increment
    entry[:last_error_at] = Time.current

    label = error_class || metadata[:error_type] || 'unknown'
    breakdown = entry[:error_breakdown]
    breakdown[label] ||= Concurrent::AtomicFixnum.new(0)
    breakdown[label].increment
  end

  history = entry[:recent_operations]
  history << {
    timestamp: Time.current,
    status: status,
    duration_ms: duration_ms,
    metadata: metadata.merge(error_class: error_class).slice(:error_type, :error_class, :user_id, :room_id)
  }

  # Sliding window: drop the oldest entry once more than 100 are held.
  history.shift if history.size > 100
end
# rubocop:enable Metrics/ParameterLists
|
|
80
|
+
|
|
81
|
+
# Get metrics for a specific agent instance.
#
# Entries are selected by the "<agent_id>:" key prefix. Component and
# operation names are read from the fields stored with each entry rather than
# parsed back out of the key, so agent ids or component names that contain
# ':' (Matrix user ids, namespaced class names) are reported correctly.
#
# @param agent_id [String] Agent identifier
# @return [Hash] Agent totals, overall success rate, per-component breakdown
#   and health status; an empty Hash when nothing is recorded for the agent
def get_agent_metrics(agent_id)
  agent_metrics = @metrics.select { |key, _| key.start_with?("#{agent_id}:") }

  return {} if agent_metrics.empty?

  components = {}
  total_operations = 0
  total_successes = 0
  total_errors = 0

  agent_metrics.each_value do |metrics|
    component = metrics[:component].to_s
    operation = metrics[:operation].to_s

    bucket = (components[component] ||= {
      operations: {},
      total_count: 0,
      success_count: 0,
      error_count: 0
    })

    total_count = metrics[:total_count].value
    success_count = metrics[:success_count].value
    error_count = metrics[:error_count].value

    bucket[:total_count] += total_count
    bucket[:success_count] += success_count
    bucket[:error_count] += error_count

    total_operations += total_count
    total_successes += success_count
    total_errors += error_count

    bucket[:operations][operation] = {
      total_count: total_count,
      success_count: success_count,
      error_count: error_count,
      success_rate: calculate_success_rate(metrics),
      avg_duration_ms: metrics[:duration_stats][:avg].value,
      p95_duration_ms: metrics[:duration_stats][:p95].value,
      last_operation_at: metrics[:last_operation_at],
      last_error_at: metrics[:last_error_at],
      error_breakdown: serialize_error_breakdown(metrics[:error_breakdown])
    }
  end

  {
    agent_id: agent_id,
    total_operations: total_operations,
    total_successes: total_successes,
    total_errors: total_errors,
    overall_success_rate: total_operations.positive? ? (total_successes.to_f / total_operations * 100).round(2) : 0,
    components: components,
    health_status: calculate_agent_health(total_operations, total_successes)
  }
end
|
|
142
|
+
|
|
143
|
+
# Get metrics for a specific component of an agent.
#
# Operations are matched on the agent id and component stored with each
# entry and keyed by the stored operation name, so ids or names containing
# ':' neither collide with other components nor get mislabelled.
#
# @param agent_id [String] Agent identifier
# @param component [String] Component name
# @return [Hash] Component totals, success rate, duration stats and a
#   per-operation summary; the default (all-zero) metrics when the component
#   has no recorded data
def get_component_metrics(agent_id, component)
  component_metrics = @component_metrics["#{agent_id}:#{component}"]

  return default_component_metrics if component_metrics.nil?

  wanted_agent = agent_id.to_s
  wanted_component = component.to_s

  operations = @metrics.values.each_with_object({}) do |metrics, summary|
    next unless metrics[:agent_id].to_s == wanted_agent && metrics[:component].to_s == wanted_component

    summary[metrics[:operation].to_s] = operation_summary(metrics)
  end

  {
    component: component,
    agent_id: agent_id,
    total_operations: component_metrics[:total_count].value,
    success_count: component_metrics[:success_count].value,
    error_count: component_metrics[:error_count].value,
    success_rate: calculate_success_rate(component_metrics),
    avg_duration_ms: component_metrics[:duration_stats][:avg].value,
    p95_duration_ms: component_metrics[:duration_stats][:p95].value,
    operations: operations
  }
end
|
|
169
|
+
|
|
170
|
+
# Get the busiest operations of an agent.
#
# Entries are selected by the "<agent_id>:" key prefix; component and
# operation names come from the fields stored with each entry, so names
# containing ':' are reported intact.
#
# @param agent_id [String] Agent identifier
# @param limit [Integer] Maximum number of operations to return
# @return [Array<Hash>] Operations sorted by total count, highest first
def top_operations_by_volume(agent_id, limit: 10)
  prefix = "#{agent_id}:"

  operations = @metrics.each_with_object([]) do |(key, metrics), rows|
    next unless key.start_with?(prefix)

    rows << {
      component: metrics[:component].to_s,
      operation: metrics[:operation].to_s,
      count: metrics[:total_count].value,
      success_rate: calculate_success_rate(metrics),
      avg_duration_ms: metrics[:duration_stats][:avg].value
    }
  end

  operations.sort_by { |op| -op[:count] }.first(limit)
end
|
|
191
|
+
|
|
192
|
+
# Get the most recent failed operations of an agent.
#
# Entries are selected by the "<agent_id>:" key prefix and errors are taken
# from each operation's recent-operations window. Component and operation
# names come from the fields stored with each entry, so names containing ':'
# are reported intact.
#
# @param agent_id [String] Agent identifier
# @param limit [Integer] Maximum number of errors to return
# @return [Array<Hash>] Recent errors sorted by timestamp (newest first)
def recent_errors(agent_id, limit: 20)
  prefix = "#{agent_id}:"
  agent_metrics = @metrics.select { |key, _| key.start_with?(prefix) }

  errors = agent_metrics.values.flat_map do |metrics|
    component = metrics[:component].to_s
    operation = metrics[:operation].to_s

    # to_a takes a snapshot of the window before filtering it.
    failed = metrics[:recent_operations].to_a.select { |op| op[:status] == 'error' }
    failed.map do |error_op|
      {
        timestamp: error_op[:timestamp],
        component: component,
        operation: operation,
        duration_ms: error_op[:duration_ms],
        metadata: error_op[:metadata]
      }
    end
  end

  errors.sort_by { |e| -e[:timestamp].to_f }.first(limit)
end
|
|
219
|
+
|
|
220
|
+
# Get a health summary across all agents that have recorded metrics.
#
# Agent ids are taken from the agent_id stored with each entry rather than
# from the text before the first ':' of the key, so ids that themselves
# contain ':' (such as Matrix user ids) are not truncated.
#
# @return [Hash] Agent counts per health status, total operations, overall
#   success rate and a short per-agent summary
def get_health_summary
  agent_ids = @metrics.values.map { |metrics| metrics[:agent_id].to_s }.uniq

  agents = agent_ids.map { |agent_id| get_agent_metrics(agent_id) }
  status_counts = Hash.new(0)
  agents.each { |agent| status_counts[agent[:health_status]] += 1 }

  {
    total_agents: agents.length,
    healthy_agents: status_counts[:healthy],
    degraded_agents: status_counts[:degraded],
    unhealthy_agents: status_counts[:unhealthy],
    total_operations: agents.sum { |a| a[:total_operations] },
    overall_success_rate: calculate_overall_success_rate(agents),
    agents: agents.map do |agent|
      {
        agent_id: agent[:agent_id],
        health_status: agent[:health_status],
        success_rate: agent[:overall_success_rate],
        total_operations: agent[:total_operations]
      }
    end
  }
end
|
|
245
|
+
|
|
246
|
+
# Drop every recorded metric, both per-operation and per-component
# (useful for testing).
#
# @return [Object] The emptied component store
def reset!
  [@metrics, @component_metrics].each(&:clear).last
end
|
|
251
|
+
|
|
252
|
+
# Drop all metrics whose key starts with "<agent_id>:" from both stores and
# log the reset at info level.
#
# @param agent_id [String] Agent identifier
# @return [Object] The logger's return value
def reset_agent!(agent_id)
  prefix = "#{agent_id}:"

  [@metrics, @component_metrics].each do |store|
    store.delete_if { |key, _| key.start_with?(prefix) }
  end

  ActiveMatrix.logger.info("Reset metrics for Matrix agent: #{agent_id}")
end
|
|
260
|
+
|
|
261
|
+
private
|
|
262
|
+
|
|
263
|
+
# Subscribe to every ActiveSupport notification whose name matches
# /^activematrix\./ and turn each event into a record_operation call.
#
# The operation name is the event name without the 'activematrix.' part and
# the duration is the difference of finish and start in milliseconds, rounded
# to two decimals. Missing component/agent ids default to 'Unknown'/'unknown'.
#
# @return [Object] Whatever ActiveSupport::Notifications.subscribe returns
def setup_notification_subscribers
  ActiveSupport::Notifications.subscribe(/^activematrix\./) do |name, start, finish, _id, payload|
    elapsed_ms = ((finish - start) * 1000).round(2)
    details = {
      component: payload[:component] || 'Unknown',
      agent_id: payload[:agent_id] || 'unknown',
      status: payload[:status],
      duration_ms: elapsed_ms,
      error_type: payload[:error_category],
      error_class: payload[:error_class],
      user_id: payload[:user_id],
      room_id: payload[:room_id]
    }

    record_operation(name.sub('activematrix.', ''), **details)
  end
end
|
|
282
|
+
|
|
283
|
+
# Build a fresh aggregate entry for one component of one agent: zeroed
# counters, empty duration statistics and the creation time.
#
# @param component [String] Component name
# @param agent_id [String] Agent identifier
# @return [Hash] New component metrics entry
def initialize_component_metrics(component, agent_id)
  zero_counter = -> { Concurrent::AtomicFixnum.new(0) }

  {
    component: component,
    agent_id: agent_id,
    total_count: zero_counter.call,
    success_count: zero_counter.call,
    error_count: zero_counter.call,
    duration_stats: initialize_duration_stats,
    created_at: Time.current
  }
end
|
|
294
|
+
|
|
295
|
+
# Build a fresh entry for one operation of a component: zeroed counters,
# empty duration statistics, an empty error breakdown, an empty window of
# recent operations and the creation time.
#
# @param operation [Symbol, String] Operation name
# @param component [String] Component name
# @param agent_id [String] Agent identifier
# @return [Hash] New operation metrics entry
def initialize_operation_metrics(operation, component, agent_id)
  zero_counter = -> { Concurrent::AtomicFixnum.new(0) }

  {
    operation: operation,
    component: component,
    agent_id: agent_id,
    total_count: zero_counter.call,
    success_count: zero_counter.call,
    error_count: zero_counter.call,
    duration_stats: initialize_duration_stats,
    error_breakdown: Concurrent::Hash.new,
    recent_operations: Concurrent::Array.new,
    created_at: Time.current,
    last_operation_at: nil,
    last_error_at: nil
  }
end
|
|
311
|
+
|
|
312
|
+
# Build an empty duration-statistics record.
#
# :total and :count are integer counters (update_duration_stats stores the
# total in hundredths of a millisecond); :avg, :min, :max and :p95 hold the
# derived values and :values is the window of raw samples.
#
# @return [Concurrent::Hash] New statistics record
def initialize_duration_stats
  stats = Concurrent::Hash.new
  stats[:total] = Concurrent::AtomicFixnum.new(0)
  stats[:count] = Concurrent::AtomicFixnum.new(0)
  stats[:avg] = Concurrent::AtomicReference.new(0)
  stats[:min] = Concurrent::AtomicReference.new(Float::INFINITY)
  stats[:max] = Concurrent::AtomicReference.new(0)
  stats[:p95] = Concurrent::AtomicReference.new(0)
  stats[:values] = Concurrent::Array.new
  stats
end
|
|
323
|
+
|
|
324
|
+
# Fold one operation outcome into a component's aggregate entry.
#
# The total is always incremented; the success or error counter only for the
# statuses 'success' and 'error'. Duration statistics are updated last.
#
# @param component_metrics [Hash] Entry built by initialize_component_metrics
# @param status [String] 'success' or 'error'
# @param duration_ms [Float] Operation duration in milliseconds
# @return [Object] Whatever update_duration_stats returns
def update_component_metrics(component_metrics, status, duration_ms)
  component_metrics[:total_count].increment

  outcome_counter = { 'success' => :success_count, 'error' => :error_count }[status]
  component_metrics[outcome_counter].increment if outcome_counter

  update_duration_stats(component_metrics[:duration_stats], duration_ms)
end
|
|
336
|
+
|
|
337
|
+
# Fold one duration sample into a statistics record.
#
# Updates the running total/count/average, the minimum and maximum, the
# window of raw samples (capped at 1000) and the 95th percentile computed
# from that window.
#
# @param stats [Hash] Record built by initialize_duration_stats
# @param duration_ms [Float] Operation duration in milliseconds
# @return [Object, nil] Result of the last P95 update
def update_duration_stats(stats, duration_ms)
  # The total is kept in hundredths of a millisecond so an integer counter
  # can preserve two decimals.
  stats[:total].increment((duration_ms * 100).to_i)
  sample_count = stats[:count].increment
  mean = (stats[:total].value.to_f / sample_count / 100).round(2)
  stats[:avg].set(mean)

  stats[:min].update { |lowest| [lowest, duration_ms].min }
  stats[:max].update { |highest| [highest, duration_ms].max }

  # Sliding window of raw samples for the percentile calculation.
  window = stats[:values]
  window << duration_ms
  window.shift if window.size > 1000

  samples = window.to_a
  if samples.size >= 20
    ordered = samples.sort
    stats[:p95].set(ordered[(ordered.length * 0.95).ceil - 1].round(2))
  elsif samples.size.positive?
    # Too few samples for a meaningful percentile: report the maximum instead.
    stats[:p95].set(samples.max.round(2))
  end
end
|
|
363
|
+
|
|
364
|
+
# Success percentage of a metrics entry, rounded to two decimals.
#
# @param metrics [Hash] Entry whose :total_count and :success_count respond to #value
# @return [Float, Integer] Percentage in 0..100; the Integer 0 when nothing was recorded
def calculate_success_rate(metrics)
  total = metrics[:total_count].value
  total.zero? ? 0 : ((metrics[:success_count].value.to_f / total) * 100).round(2)
end
|
|
370
|
+
|
|
371
|
+
# Classify an agent by its success percentage.
#
# @param total_operations [Integer] Number of recorded operations
# @param success_count [Integer] Number of successful operations
# @return [Symbol] :unknown below 10 operations, otherwise :healthy (>= 95%),
#   :degraded (>= 80%) or :unhealthy
def calculate_agent_health(total_operations, success_count)
  # Too few samples to judge.
  return :unknown if total_operations < 10

  rate = success_count.to_f / total_operations * 100
  return :healthy if rate >= 95

  rate >= 80 ? :degraded : :unhealthy
end
|
|
384
|
+
|
|
385
|
+
# Success percentage across several agents, weighted by operation count and
# rounded to two decimals.
#
# @param agents [Array<Hash>] Agent metrics with :total_operations and :total_successes
# @return [Float, Integer] Percentage in 0..100; the Integer 0 when there are
#   no agents or no operations
def calculate_overall_success_rate(agents)
  return 0 if agents.empty?

  operations = agents.sum { |agent| agent[:total_operations] }
  return 0 if operations.zero?

  successes = agents.sum { |agent| agent[:total_successes] }
  ((successes.to_f / operations) * 100).round(2)
end
|
|
394
|
+
|
|
395
|
+
# Convert an error breakdown of counters into plain numbers.
#
# @param error_breakdown [Hash] Error type => counter responding to #value
# @return [Hash] Error type => count
def serialize_error_breakdown(error_breakdown)
  error_breakdown.transform_values { |counter| counter.value }
end
|
|
398
|
+
|
|
399
|
+
# Condense one operation entry into plain numbers.
#
# @param metrics [Hash] Entry built by initialize_operation_metrics
# @return [Hash] Counts, success rate and average/P95 duration
def operation_summary(metrics)
  counts = %i[total_count success_count error_count].map { |field| [field, metrics[field].value] }.to_h
  durations = metrics[:duration_stats]

  counts.merge(
    success_rate: calculate_success_rate(metrics),
    avg_duration_ms: durations[:avg].value,
    p95_duration_ms: durations[:p95].value
  )
end
|
|
409
|
+
|
|
410
|
+
# Placeholder returned for a component without recorded data: every numeric
# field is zero, there are no operations, and component and agent are
# reported as 'Unknown' / 'unknown'.
#
# @return [Hash] All-zero component metrics
def default_component_metrics
  numeric_fields = %i[total_operations success_count error_count success_rate avg_duration_ms p95_duration_ms]
  zeroed = numeric_fields.map { |field| [field, 0] }.to_h

  { component: 'Unknown', agent_id: 'unknown' }.merge(zeroed).merge(operations: {})
end
|
|
423
|
+
end
|
|
424
|
+
end
|